{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Этот ноутбук работает только на линуксовских машинах. На Windows запустить не получится, потому что библиотека tensorflow-text есть только для Linux. Можно запустить его, например, на Google.Colab.\n", "\n", "### Решаем задачу автоматической проверки свободных ответов учеников на тестовые задания с помощью векторов и линейного классификатора\n", "Пример тестового задания есть на картинке в репозитории, который клонируется во второй клетке с кодом. Датасет состоит из 3 колонок: id, текст ответа и оценка." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# установка библиотеки tensorflow-text, USE использует ее для предобработки текстов\n", "\n", "!pip install -U tensorflow-text" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# клонируем репозиторий с гитхаба\n", "\n", "!git clone -l -s https://github.com/Krinistopen/hse-workshop.git\n", "%cd hse-workshop\n", "!ls" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import tensorflow_hub as hub\n", "import tensorflow as tf\n", "import numpy as np\n", "import tensorflow_text\n", "from google.colab import files\n", "import pandas as pd\n", "\n", "from sklearn.naive_bayes import GaussianNB, BernoulliNB\n", "from sklearn.svm import SVC\n", "from sklearn.linear_model import LogisticRegressionCV\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import precision_recall_fscore_support as prfs" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# загружаем модель Universal Sentence Encoder\n", "\n", "embed = hub.load(\"https://tfhub.dev/google/universal-sentence-encoder-multilingual/3\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# считываем датасет, указываем название колонки с текстами и оценкой\n", "\n", "data = pd.read_csv('data.csv', sep=';')\n", "text_var = 'text'\n", "class_var = 'mark'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data.head() " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# визуализация распреления оценок в датасете\n", "\n", "df = data[class_var].value_counts(normalize=True) * 100\n", "df.plot.bar(x=class_var)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# оценка -1 ставится за отсутствие ответа, такие случаи проще ловить отдельным кодом\n", "# здесь исключаем из датасета ответы с такой оценкой\n", "\n", "data = data[(data[class_var] != -1)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# в данном задании оценка ставится по трехбалльной шкале:\n", "# 0 баллов за неправильный вариант, 1 балл за правильный вариант,\n", "# 2 балла за правильный вариант и правильное объяснение своего ответа\n", "# но попробуем сначала обучить модель отличать неправильные ответы (0 баллов)\n", "# от сколько-нибудь правильных (1 и 2 балла), для этого все оценки 2\n", "# превратим в 1, останется 2 класса и задача бинарной классификации\n", "\n", "marks_new = []\n", "for i, mark in enumerate(data[class_var]):\n", " if mark == 2:\n", " marks_new.append(1)\n", " else:\n", " marks_new.append(mark)\n", "class_var = 'class_new'\n", "data[class_var] = marks_new\n", "del marks_new" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# устанавливаем размер обучающей выборки в процентах\n", "\n", "train_vol = 0.7" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# делим датасет на обучающую и тестовую выборки\n", "# из обучающей выборки удаляем дубликаты ответов\n", "\n", "train, test = train_test_split(data, train_size = train_vol, random_state = 99, stratify = data[class_var])\n", "train = train.drop_duplicates(subset=[text_var])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# из тестовой выборки удаляем все ответы, которые есть в обучающей\n", "\n", "check = []\n", "train_data = train[text_var].tolist()\n", "for text in test[text_var]:\n", " if text not in train_data:\n", " check.append('KEEP')\n", " else:\n", " check.append('IGNORE')\n", "test['check'] = check\n", "test = test.loc[test['check'] == 'KEEP']\n", "test = test.drop(['check'], axis=1)\n", "del check" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# переводим все ответы из обучающей выборки в вектора\n", "\n", "train_embeddings = []\n", "for text in train_data:\n", " embedding = embed(str(text))[0]\n", " train_embeddings.append(embedding)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# переводим все ответы из тестовой выборки в вектора\n", "\n", "test_embeddings = []\n", "for text in test[text_var]:\n", " embedding = embed(str(text))[0]\n", " test_embeddings.append(embedding)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# инициализируем классификатор с некоторыми параметрами\n", "\n", "# nbg = GaussianNB()\n", "# nbb = BernoulliNB()\n", "# lg = LogisticRegressionCV(cv=5, multi_class='ovr', max_iter=1200)\n", "svm = SVC(kernel='poly', gamma='scale', probability=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# получаем список оценок из обучающей и тестовой выборок\n", "\n", "classes = train[class_var].tolist()\n", "classes_to_check = test[class_var].tolist()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# обучаем классификатор\n", "\n", "# nbg.fit(train_embeddings,classes)\n", "# nbb.fit(train_embeddings,classes)\n", "# lg.fit(train_embeddings,classes)\n", "svm.fit(train_embeddings,classes)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# получаем оценки для ответов из тестовой выборки\n", "\n", "# classesNBG = nbg.predict(test_embeddings)\n", "# classesNBB = nbb.predict(test_embeddings)\n", "# classesLG = lg.predict(test_embeddings)\n", "classesSVM = svm.predict(test_embeddings)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# получаем точность, полноту и ф-меру для класса 0 (неправильные ответы)\n", "\n", "metrics = prfs(classes_to_check, classesSVM, pos_label = 0, average = 'binary')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "metrics" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### То же самое, только с SVD векторами\n", "\n", "Пока код работает не так, как я планировал. Причину выясняю." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# установка библиотеки для морфологического анализа слов русского языка\n", "\n", "!pip install -U pymorphy2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import gensim\n", "import pandas as pd\n", "import pymorphy2\n", "import pprint\n", "from collections import defaultdict\n", "from gensim import corpora, models\n", "import re" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "morph = pymorphy2.MorphAnalyzer()\n", "frequency = defaultdict(int)\n", "from rustoplist import stopwords" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def tokenize(document):\n", " '''\n", " Токенизатор. Фильтрует текст от пунктуации, цифр, если они не являются частью слова \n", " (чтобы оставить токены типа \"ТЕЛЕ2\", \"2gis\" и т.п), множественных пробелов.\n", " ''' \n", " punc_1 = re.compile(r'[\\.,;:\\!\\?/\\|\\\\@#\\$%\\^\\&\\*)(_=\\+\\]\\[}{\"`~<>»«\\'@]')\n", " punc_2 = re.compile(r'[،;؛¿!\"\\])}»›”؟¡%٪°±©®।॥…“‘„‚«‹「『\\–—]')\n", " document = document.lower()\n", " document = re.sub(punc_1, ' ', document)\n", " document = re.sub(punc_2, ' ', document)\n", " document = re.sub(r'(?= cutoff]\n", " for text in texts]\n", " return new_texts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def preproc(documents, cutoff=2, stoplist=stopwords):\n", " '''\n", " Подготовка текстовых данных к извлечению тематик.\n", " '''\n", " texts = []\n", "\n", " if stoplist:\n", " for document in documents:\n", " text = []\n", " for token in tokenize(document):\n", " token = morphanalyze(token)\n", " if token not in stoplist:\n", " text.append(token)\n", " texts.append(text)\n", " new_texts = applyCutoff(texts, cutoff)\n", "\n", " else:\n", " for document in documents:\n", " text = []\n", " for token in tokenize(document):\n", " token = morphanalyze(token)\n", " text.append(token)\n", " texts.append(text)\n", " new_texts = applyCutoff(texts, cutoff)\n", "\n", " del texts\n", " return new_texts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data = pd.read_csv('data.csv', sep=';')\n", "text_var = 'text'\n", "class_var = 'mark'\n", "data = data[(data[class_var] != -1)]\n", "marks_new = []\n", "for i, mark in enumerate(data[class_var]):\n", " if mark == 2:\n", " marks_new.append(1)\n", " else:\n", " marks_new.append(mark)\n", "class_var = 'class_new'\n", "data[class_var] = marks_new\n", "del marks_new" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "texts = data[text_var]\n", "ids = data['id']" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "texts_preproc = preproc(texts)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# каждому слову присваиваем id, а тексты представляем не как \n", "# последовательность слов, а как последовательность id\n", "# именно здесь код работает не так, как я ожидал:\n", "# в словаре с id слов почти в 2 раза меньше, чем реально в корпусе\n", "\n", "dictionary = corpora.Dictionary(texts_preproc)\n", "corpus = [dictionary.doc2bow(text) for text in texts_preproc]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pprint.pprint(corpus[0])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# взвешиваем все слова в текстах\n", "\n", "tfidf = models.TfidfModel(corpus)\n", "corpus_weighted = tfidf[corpus]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "corpus_weighted[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# проводим сингулярное разложение терм-документной матрицы\n", "\n", "model = models.LsiModel(corpus_weighted, id2word=dictionary, num_topics=300)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model.show_topics()[:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model[corpus_weighted[0]]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "embeddings = {}\n", "\n", "for i, ID in enumerate(ids):\n", " embedding_raw = model[corpus_weighted[i]]\n", " embedding = [value for ind, value in embedding_raw]\n", " embeddings.update({ID: embedding})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_vol = 0.7\n", "train, test = train_test_split(data, train_size = train_vol, random_state = 99, stratify = data[class_var])\n", "train = train.drop_duplicates(subset=[text_var])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "check = []\n", "train_data = train[text_var].tolist()\n", "for text in test[text_var]:\n", " if text not in train_data:\n", " check.append('KEEP')\n", " else:\n", " check.append('IGNORE')\n", "test['check'] = check\n", "test = test.loc[test['check'] == 'KEEP']\n", "test = test.drop(['check'], axis=1)\n", "del check" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_embeddings = [embeddings[ID] for ID in train['id'].tolist()]\n", "test_embeddings = [embeddings[ID] for ID in test['id'].tolist()]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "svm = SVC(kernel='poly', gamma='scale', probability=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "classes = train[class_var].tolist()\n", "classes_to_check = test[class_var].tolist()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "svm.fit(train_embeddings,classes)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Код для проверки среднего результата на 10 прогонах\n", "\n", "Когда мы один раз разделили датасет на обучающую и тестовую выборку, обучили классификатор и проверили качество, то эти метрики качества нельзя считать окончательными. Возможно, нам повезло, и обучающая выборка получилась хорошей. А если датасет разделить так, что на обучающей выборке классификатор обучится не так хорошо? Качество получится ниже. \n", "\n", "Чтобы получить адекватные метрики качества, нужно хотя бы 10 раз случайным образом, по-разному разделить датасет на обучающую и тестовую выборки и 10 раз обучить один и тот же классификатор. Затем вывести средние показатели качества. Код ниже проделывает эту операцию для 4 разных классификаторов и в конце выводит график для сравнения средних метрик." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_results = pd.DataFrame(columns=['classifier', 'train_size', 'average_precision',\n", " 'average_recall', 'average_fscore'])\n", "precision_all = {'NBG':[], 'NBB':[], 'SVM':[], 'LG':[]}\n", "recall_all = {'NBG':[], 'NBB':[], 'SVM':[], 'LG':[]}\n", "fscore_all = {'NBG':[], 'NBB':[], 'SVM':[], 'LG':[]}\n", "\n", "data = pd.read_csv('data.csv', sep=';')\n", "\n", "text_var = 'text'\n", "class_var = 'mark'\n", "\n", "data = data[data[class_var] != -1]\n", "\n", "marks_new = []\n", "for i, mark in enumerate(data[class_var]):\n", " if mark == 2:\n", " marks_new.append(1)\n", " else:\n", " marks_new.append(mark)\n", "class_var = 'class_new'\n", "data[class_var] = marks_new\n", "\n", "train_vol = 0.7\n", "\n", "for i in range(10):\n", " train, test = train_test_split(data, train_size = train_vol, stratify = data[class_var])\n", " train = train.drop_duplicates(subset=[text_var])\n", " check = []\n", " train_data = train[text_var].tolist()\n", " for text in test[text_var]:\n", " if text not in train_data:\n", " check.append('KEEP')\n", " else:\n", " check.append('IGNORE')\n", " test['check'] = check\n", " test = test.loc[test['check'] == 'KEEP']\n", " test = test.drop(['check'], axis=1)\n", " del check\n", "\n", " train_embeddings = []\n", " for text in train_data:\n", " embedding = embed(str(text))[0]\n", " train_embeddings.append(embedding)\n", " print('Прогон {}, вектора для обучения готовы.'.format(i+1))\n", "\n", " test_embeddings = []\n", " for text in test[text_var]:\n", " embedding = embed(str(text))[0]\n", " test_embeddings.append(embedding)\n", " print('Прогон {}, вектора для тестирования готовы.'.format(i+1))\n", "\n", " nbg = GaussianNB()\n", " nbb = BernoulliNB()\n", " svm = SVC(kernel='poly', gamma='scale', probability=True)\n", " lg = LogisticRegressionCV(cv=5, multi_class='ovr', max_iter=1200)\n", "\n", " classes = train[class_var].tolist()\n", "\n", " nbg.fit(train_embeddings,classes)\n", " nbb.fit(train_embeddings,classes)\n", " svm.fit(train_embeddings,classes)\n", " lg.fit(train_embeddings,classes)\n", " print('Прогон {}, все модели обучены.'.format(i+1))\n", "\n", " target = 0\n", "\n", " classesNBG = nbg.predict(test_embeddings)\n", " classesNBB = nbb.predict(test_embeddings)\n", " classesSVM = svm.predict(test_embeddings)\n", " classesLG = lg.predict(test_embeddings)\n", "\n", " classes_to_check = test[class_var].tolist()\n", "\n", " precisionNBG, recallNBG, fscoreNBG = prfs(classes_to_check, classesNBG, pos_label = 0, average = 'binary')[0:3]\n", " precisionNBB, recallNBB, fscoreNBB = prfs(classes_to_check, classesNBB, pos_label = 0, average = 'binary')[0:3]\n", " precisionSVM, recallSVM, fscoreSVM = prfs(classes_to_check, classesSVM, pos_label = 0, average = 'binary')[0:3]\n", " precisionLG, recallLG, fscoreLG = prfs(classes_to_check, classesLG, pos_label = 0, average = 'binary')[0:3]\n", "\n", " precision_all['NBG'].append(precisionNBG)\n", " precision_all['NBB'].append(precisionNBB)\n", " precision_all['SVM'].append(precisionSVM)\n", " precision_all['LG'].append(precisionLG)\n", " recall_all['NBG'].append(recallNBG)\n", " recall_all['NBB'].append(recallNBB)\n", " recall_all['SVM'].append(recallSVM)\n", " recall_all['LG'].append(recallLG)\n", " fscore_all['NBG'].append(fscoreNBG)\n", " fscore_all['NBB'].append(fscoreNBB)\n", " fscore_all['SVM'].append(fscoreSVM)\n", " fscore_all['LG'].append(fscoreLG)\n", " print('Прогон {} закончен.'.format(i+1))\n", "\n", "\n", "for model in ['NBG', 'NBB', 'SVM', 'LG']:\n", " df_results = df_results.append({'classifier': model,\n", " 'average_precision':sum(precision_all[model])/10,\n", " 'average_recall':sum(recall_all[model])/10,\n", " 'average_fscore':sum(fscore_all[model])/10},\n", " ignore_index=True)\n", "\n", "df_results.set_index('classifier')\n", "df_results_t = df_results.transpose()\n", "df_results_t.columns = df_results_t.iloc[0]\n", "df_results_t.reset_index(inplace=True)\n", "df_results_t = df_results_t.drop([0], axis=0)\n", "df_results_t.plot.bar('index')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 4 }