{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "basic_nlp_tools.ipynb", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "metadata": { "id": "4XqQzx7d7O2E" }, "source": [ "text = 'Жан Антуан Вердье родился 02.05.1767 в Тулузе и умер 30.05.1839 в Маконе, Франция.'" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "hZRTZHZ06Qmi" }, "source": [ "# Tokenization" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "STjw7w2r6TOk", "outputId": "9042120c-7776-43c1-ddd9-60b4ef277829" }, "source": [ "!pip install razdel" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Collecting razdel\n", " Downloading razdel-0.5.0-py3-none-any.whl (21 kB)\n", "Installing collected packages: razdel\n", "Successfully installed razdel-0.5.0\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "RNxdNqXC6Wre" }, "source": [ "from razdel import tokenize, sentenize" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "dUXxWoya6fpA", "outputId": "b57a144c-1c41-40b0-90db-87e5676a0eab" }, "source": [ "for t in tokenize(text):\n", " print(t)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Substring(0, 3, 'Жан')\n", "Substring(4, 10, 'Антуан')\n", "Substring(11, 17, 'Вердье')\n", "Substring(18, 25, 'родился')\n", "Substring(26, 36, '02.05.1767')\n", "Substring(37, 38, 'в')\n", "Substring(39, 45, 'Тулузе')\n", "Substring(46, 47, 'и')\n", "Substring(48, 52, 'умер')\n", "Substring(53, 63, '30.05.1839')\n", "Substring(64, 65, 'в')\n", "Substring(66, 72, 'Маконе')\n", "Substring(72, 73, ',')\n", "Substring(74, 81, 'Франция')\n", "Substring(81, 82, '.')\n" ], "name": "stdout" } ] }, { 
"cell_type": "markdown", "metadata": { "id": "iiauFcmM6Z_B" }, "source": [ "# Morphology" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ptd5ZYKq6cMb", "outputId": "5640f3f0-7506-497a-ea94-c0e0cffedeb7" }, "source": [ "!pip install pymorphy2 pymorphy2-dicts-ru" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Collecting pymorphy2\n", " Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)\n", "\u001b[?25l\r\u001b[K |██████ | 10 kB 23.6 MB/s eta 0:00:01\r\u001b[K |███████████▉ | 20 kB 25.0 MB/s eta 0:00:01\r\u001b[K |█████████████████▊ | 30 kB 12.1 MB/s eta 0:00:01\r\u001b[K |███████████████████████▋ | 40 kB 9.3 MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▌ | 51 kB 5.2 MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 55 kB 2.1 MB/s \n", "\u001b[?25hCollecting pymorphy2-dicts-ru\n", " Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)\n", "\u001b[K |████████████████████████████████| 8.2 MB 10.0 MB/s \n", "\u001b[?25hRequirement already satisfied: docopt>=0.6 in /usr/local/lib/python3.7/dist-packages (from pymorphy2) (0.6.2)\n", "Collecting dawg-python>=0.7.1\n", " Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)\n", "Installing collected packages: pymorphy2-dicts-ru, dawg-python, pymorphy2\n", "Successfully installed dawg-python-0.7.2 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "I09DFMT16mXS" }, "source": [ "from pymorphy2 import MorphAnalyzer\n", "morph = MorphAnalyzer()" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "EColXEF06q4i", "outputId": "f29442e5-87ab-41c3-bb81-a11c6802c552" }, "source": [ "hypotheses = morph.parse('родился')\n", "for h in hypotheses:\n", " print(h)" ], "execution_count": null, "outputs": [ { "output_type": "stream", 
"text": [ "Parse(word='родился', tag=OpencorporaTag('VERB,perf,intr masc,sing,past,indc'), normal_form='родиться', score=0.5, methods_stack=((DictionaryAnalyzer(), 'родился', 2802, 1),))\n", "Parse(word='родился', tag=OpencorporaTag('VERB,impf,intr masc,sing,past,indc'), normal_form='родиться', score=0.5, methods_stack=((DictionaryAnalyzer(), 'родился', 2802, 21),))\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 34 }, "id": "iYMfJRLf60Aj", "outputId": "40cd6023-7f5e-4b9a-89e8-c759b9bf71cb" }, "source": [ "hypotheses[0].normal_form" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" }, "text/plain": [ "'родиться'" ] }, "metadata": { "tags": [] }, "execution_count": 11 } ] }, { "cell_type": "code", "metadata": { "id": "2pTwg0Cf65QR" }, "source": [ "def lemma_tokenize(text):\n", " result = []\n", " for token in tokenize(text):\n", " parsed = morph.parse(token.text)\n", " if not parsed or not parsed[0].normal_form:\n", " result.append(token.text.lower())\n", " else:\n", " result.append(parsed[0].normal_form)\n", " return result" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "zgR3Pq8U7SOz", "outputId": "ceb7c803-ca67-432f-96ae-d32e8341685b" }, "source": [ "print(lemma_tokenize(text))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "['жан', 'антуан', 'вердие', 'родиться', '02.05.1767', 'в', 'тулуза', 'и', 'умереть', '30.05.1839', 'в', 'макон', ',', 'франция', '.']\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "hz3fGL1u2M9y" }, "source": [ "# Regular Expresions\n", "https://docs.python.org/3/howto/regex.html" ] }, { "cell_type": "code", "metadata": { "id": "HHAapIL-3o1A" }, "source": [ "import re" ], "execution_count": null, 
"outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "3NE4pJwg2bRv", "outputId": "39fc8291-1838-46ae-ad66-aadaf27bd89e" }, "source": [ "pattern = '[0-9]{1,2}\\.[0-9]{1,2}\\.[0-9]{4}'\n", "print(re.findall(pattern, text))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "['02.05.1767', '30.05.1839']\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "fxilGkUz33Kj" }, "source": [ "Для более детального анализа совпадений можно использовать именованные группы. " ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "sNFKjX5D2ppq", "outputId": "2374d1e5-a018-4ab6-aae6-be21da6e73c8" }, "source": [ "pattern = '(?P[0-9]{2})\\.(?P[0-9]{2})\\.(?P[0-9]{4})'\n", "print([match.groupdict() for match in re.finditer(pattern, text)])" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "[{'day': '02', 'month': '05', 'year': '1767'}, {'day': '30', 'month': '05', 'year': '1839'}]\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "jflQZl273SYs" }, "source": [ "# Grammars" ] }, { "cell_type": "code", "metadata": { "id": "bvbKUcXl5Pmy" }, "source": [ "from nltk import CFG, wordpunct_tokenize\n", "from nltk.parse import BottomUpLeftCornerChartParser" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "0knGYb0T5H1X" }, "source": [ "grammar_text = \"\"\"\n", "S -> ANY\n", "S -> FROM | FROM ANY | ANY FROM | ANY FROM ANY\n", "S -> EXPLICIT_TO | ANY EXPLICIT_TO | EXPLICIT_TO ANY | ANY EXPLICIT_TO ANY\n", "S -> FROM TO | ANY FROM TO | FROM TO ANY | ANY FROM TO ANY\n", "\n", "ANY -> TRASH | PARAM | TRAIN \n", "ANY -> TRASH ANY | PARAM ANY | TRAIN ANY\n", "\n", "PARAM -> TODAY | TOMORROW | CHANGES | BIRD | EXPRESS | DIRECTION | DATE | PRICE\n", "\n", "TRASH -> 'тут' | '2018' | '2019' | 'tutu' | 'ru' | 'ру' | 'яндекс' | 'направление'\n", "TRAIN -> 
'билет' | 'на' 'электричка' | 'расписание' | 'электричка'\n", "\n", "TODAY -> 'сегодня' | 'на' 'сегодня'\n", "TOMORROW -> 'завтра' | 'на' 'завтра'\n", "CHANGES -> 'изменение' | 'с' 'изменение'\n", "BIRD -> 'ласточка'\n", "EXPRESS -> 'экспресс'\n", "PRICE -> 'цена' | 'стоимость'\n", "\n", "FROM -> FROM_PLACE | EXPLICIT_FROM\n", "TO -> TO_PLACE | EXPLICIT_TO\n", "\n", "EXPLICIT_FROM -> FROM_WORD FROM_PLACE\n", "EXPLICIT_TO -> TO_WORD TO_PLACE\n", "\n", "FROM_PLACE -> PLACE\n", "TO_PLACE -> PLACE\n", "\n", "FROM_WORD -> 'с' | 'от' | 'из'\n", "TO_WORD -> 'на' | 'к' | 'до' | 'в'\n", "\n", "DATE -> 'на' REAL_DATE | REAL_DATE\n", "REAL_DATE -> MONTH | DAY | DAY MONTH | WEEKDAY\n", "DAY -> '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' | '10' | '11' | '12' | '13' | '14' | '15' | '16' | '17' | '18' | '19' | '20' | '21' | '22' | '23' | '24' | '25' | '26' | '27' | '28' | '29' | '30' | '31'\n", "WEEKDAY -> 'понедельник' | 'вторник' | 'среда' | 'четверг' | 'пятница' | 'суббота' | 'воскресенье'\n", "MONTH -> 'январь' | 'февраль' | 'март' | 'апрель' | 'май' | 'июнь' | 'июль' | 'август' | 'сентябрь' | 'октябрь' | 'ноябрь' | 'декабрь'\n", "\n", "PLACE -> 'москва' | 'питер' | 'петушки'\n", "DIRECTION -> 'казанский'\n", "\"\"\"\n", "grammar = CFG.fromstring(grammar_text)\n", "grammar_parser = BottomUpLeftCornerChartParser(grammar)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "H3_LOCFU5IPw" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Hh9z-9-25Sd7" }, "source": [ "def try_parsing(parser, text):\n", "    \"\"\"Return whatever `parser.parse_one` yields for `text`, or None on ValueError.\n", "\n", "    `text` may be a raw string, in which case it is first converted to a\n", "    list of lemmas with `lemma_tokenize`; any other value is passed to the\n", "    parser unchanged.\n", "    A ValueError from the parser is treated as 'no parse' (presumably NLTK\n", "    raises it for words the grammar does not cover - confirm in its docs).\n", "    \"\"\"\n", "    if isinstance(text, str):\n", "        text = lemma_tokenize(text)\n", "    try:\n", "        return parser.parse_one(text)\n", "    except ValueError:\n", "        return None" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "k00Ky2R-5UX5", "outputId": "dcc4d2d7-88c7-4227-fd2f-2743a6f2ed47" }, 
"source": [ "print(try_parsing(grammar_parser, 'расписание электричек от москвы до питера на завтра'))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "(S\n", " (ANY (TRAIN расписание) (ANY (TRAIN электричка)))\n", " (FROM (EXPLICIT_FROM (FROM_WORD от) (FROM_PLACE (PLACE москва))))\n", " (TO (EXPLICIT_TO (TO_WORD до) (TO_PLACE (PLACE питер))))\n", " (ANY (PARAM (TOMORROW на завтра))))\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "YcRUrBhs59Ah" }, "source": [ "# Тезаурусы\n", "\n", "https://github.com/avidale/python-ruwordnet/" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "L8eMv5Ta9GYR", "outputId": "6ce10c42-ef55-49aa-9b4e-0072f3ee56a7" }, "source": [ "!pip install ruwordnet\n", "!ruwordnet download" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Collecting ruwordnet\n", " Downloading ruwordnet-0.0.2.tar.gz (6.6 kB)\n", "Requirement already satisfied: sqlalchemy in /usr/local/lib/python3.7/dist-packages (from ruwordnet) (1.4.20)\n", "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.7/dist-packages (from sqlalchemy->ruwordnet) (1.1.0)\n", "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from sqlalchemy->ruwordnet) (4.6.1)\n", "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->sqlalchemy->ruwordnet) (3.5.0)\n", "Requirement already satisfied: typing-extensions>=3.6.4 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->sqlalchemy->ruwordnet) (3.7.4.3)\n", "Building wheels for collected packages: ruwordnet\n", " Building wheel for ruwordnet (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", " Created wheel for ruwordnet: filename=ruwordnet-0.0.2-py3-none-any.whl size=7398 sha256=8ad6ba6ed3686652538285d288384f574c3396e9c7ec098750e3760e8c1a09a9\n", " Stored in directory: /root/.cache/pip/wheels/c8/51/8f/403dd402ec844cace9b5d54d26b5001fa8c4df68316330c612\n", "Successfully built ruwordnet\n", "Installing collected packages: ruwordnet\n", "Successfully installed ruwordnet-0.0.2\n", "downloading a ruwordnet model from https://github.com/avidale/python-ruwordnet/releases/download/0.0.2/ruwordnet.db\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "d_MNM4fb9ISl" }, "source": [ "from ruwordnet import RuWordNet\n", "wn = RuWordNet()" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "VFX0qEoN9ZDI", "outputId": "c57b07dd-3cbb-4ab2-e014-bcb36a123515" }, "source": [ "for sense in wn.get_senses('кошка'):\n", "    print(sense.synset, [s.lemma for s in sense.synset.senses])" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Synset(id=\"6804-N\", title=\"КОШКА\") ['КОШКА', 'ДОМАШНИЙ КОШКА', 'КОШЕЧКА']\n", "Synset(id=\"110841-N\", title=\"КОШАЧЬИ\") ['КОШКА', 'КОШАЧЬИ', 'СЕМЕЙСТВО КОШАЧИЙ', 'КОШАЧИЙ ХИЩНИК']\n", "Synset(id=\"123870-N\", title=\"КОШКИ ДЛЯ ЛАЗАНИЯ\") ['КОШКА', 'КОШКА ДЛЯ ЛАЗАНИЕ']\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "a7Hauh0n9c3u", "outputId": "0479843c-eceb-473a-c174-fe5d038a9fcf" }, "source": [ "# `sense` is left over from the loop in the previous cell (its last sense).\n", "synset = sense.synset\n", "# Climb the hypernym chain, always following the first hypernym (at most 20 steps).\n", "for _ in range(20):\n", "    print(synset)\n", "    if not synset.hypernyms:\n", "        break\n", "    print(synset.hypernyms)\n", "    print()\n", "    synset = synset.hypernyms[0]\n" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Synset(id=\"123870-N\", title=\"КОШКИ ДЛЯ ЛАЗАНИЯ\")\n", "[Synset(id=\"106553-N\", title=\"ПРИСПОСОБЛЕНИЕ, ИНСТРУМЕНТ\")]\n", "\n", 
"Synset(id=\"106553-N\", title=\"ПРИСПОСОБЛЕНИЕ, ИНСТРУМЕНТ\")\n", "[Synset(id=\"106554-N\", title=\"ПРЕДМЕТ, ВЕЩЬ\")]\n", "\n", "Synset(id=\"106554-N\", title=\"ПРЕДМЕТ, ВЕЩЬ\")\n", "[Synset(id=\"147133-N\", title=\"ФИЗИЧЕСКИЙ ОБЪЕКТ\")]\n", "\n", "Synset(id=\"147133-N\", title=\"ФИЗИЧЕСКИЙ ОБЪЕКТ\")\n", "[Synset(id=\"147134-N\", title=\"ФИЗИЧЕСКАЯ СУЩНОСТЬ\")]\n", "\n", "Synset(id=\"147134-N\", title=\"ФИЗИЧЕСКАЯ СУЩНОСТЬ\")\n", "[Synset(id=\"153782-N\", title=\"ПОСТОЯННАЯ СУЩНОСТЬ\")]\n", "\n", "Synset(id=\"153782-N\", title=\"ПОСТОЯННАЯ СУЩНОСТЬ\")\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yZwHfGjl-4aU", "outputId": "e48c7c85-4963-4993-af80-ac5f56df0ec6" }, "source": [ "wn[\"147134-N\"].hyponyms" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "[Synset(id=\"144149-N\", title=\"ПОКРЫВАЮЩАЯ ЧАСТЬ\"),\n", " Synset(id=\"106451-N\", title=\"МЕСТО В ПРОСТРАНСТВЕ\"),\n", " Synset(id=\"147133-N\", title=\"ФИЗИЧЕСКИЙ ОБЪЕКТ\"),\n", " Synset(id=\"106623-N\", title=\"ОПОРА, ОПОРНАЯ ЧАСТЬ\"),\n", " Synset(id=\"106610-N\", title=\"ПРОСТРАНСТВО\"),\n", " Synset(id=\"820-N\", title=\"ВЕЩЕСТВО\"),\n", " Synset(id=\"150414-N\", title=\"БИОЛОГИЧЕСКАЯ СУЩНОСТЬ\")]" ] }, "metadata": { "tags": [] }, "execution_count": 23 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "nT2aN9vK9fJu", "outputId": "753ba4f9-8799-42e7-acff-d218662a6d2d" }, "source": [ "sense = wn.get_senses('гарантировать')[0]\n", "synset = sense.synset\n", "for _ in range(20):\n", " print(synset)\n", " if not synset.hypernyms:\n", " break\n", " print(synset.hypernyms)\n", " print()\n", " synset = synset.hypernyms[0]" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Synset(id=\"116802-V\", title=\"ГАРАНТИРОВАТЬ (ЗАЩИЩАТЬ)\")\n", "[Synset(id=\"106595-V\", title=\"ОБЕРЕГАТЬ, ЗАЩИЩАТЬ\"), 
Synset(id=\"120410-V\", title=\"ОБЕСПЕЧИТЬ, СОЗДАТЬ УСЛОВИЯ\")]\n", "\n", "Synset(id=\"106595-V\", title=\"ОБЕРЕГАТЬ, ЗАЩИЩАТЬ\")\n", "[Synset(id=\"106597-V\", title=\"ОХРАНЯТЬ\")]\n", "\n", "Synset(id=\"106597-V\", title=\"ОХРАНЯТЬ\")\n", "[Synset(id=\"106473-V\", title=\"ОБУСЛАВЛИВАТЬ, СПОСОБСТВОВАТЬ\")]\n", "\n", "Synset(id=\"106473-V\", title=\"ОБУСЛАВЛИВАТЬ, СПОСОБСТВОВАТЬ\")\n", "[Synset(id=\"111611-V\", title=\"ВЛИЯТЬ, ВОЗДЕЙСТВОВАТЬ\")]\n", "\n", "Synset(id=\"111611-V\", title=\"ВЛИЯТЬ, ВОЗДЕЙСТВОВАТЬ\")\n", "[Synset(id=\"106646-V\", title=\"ОТНОШЕНИЕ МЕЖДУ СУЩНОСТЯМИ\")]\n", "\n", "Synset(id=\"106646-V\", title=\"ОТНОШЕНИЕ МЕЖДУ СУЩНОСТЯМИ\")\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "OpqOZT7B-AJk" }, "source": [ "# Синтаксис" ] }, { "cell_type": "markdown", "metadata": { "id": "HT0bqpI9_LPP" }, "source": [ "https://spacy.io/models/ru" ] }, { "cell_type": "code", "metadata": { "id": "RHcJtUuK_OZF", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "3103f173-049f-4911-9b67-fc10a4134c31" }, "source": [ "!pip install --upgrade spacy\n", "!python -m spacy download ru_core_news_md" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Requirement already satisfied: spacy in /usr/local/lib/python3.7/dist-packages (2.2.4)\n", "Collecting spacy\n", " Downloading spacy-3.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.4 MB)\n", "\u001b[K |████████████████████████████████| 6.4 MB 4.2 MB/s \n", "\u001b[?25hCollecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4\n", " Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)\n", "\u001b[K |████████████████████████████████| 10.1 MB 39.6 MB/s \n", "\u001b[?25hRequirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (3.0.5)\n", "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (4.41.1)\n", "Requirement already 
satisfied: jinja2 in /usr/local/lib/python3.7/dist-packages (from spacy) (2.11.3)\n", "Requirement already satisfied: wasabi<1.1.0,>=0.8.1 in /usr/local/lib/python3.7/dist-packages (from spacy) (0.8.2)\n", "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.19.5)\n", "Requirement already satisfied: typing-extensions<4.0.0.0,>=3.7.4 in /usr/local/lib/python3.7/dist-packages (from spacy) (3.7.4.3)\n", "Collecting pathy>=0.3.5\n", " Downloading pathy-0.6.0-py3-none-any.whl (42 kB)\n", "\u001b[K |████████████████████████████████| 42 kB 1.1 MB/s \n", "\u001b[?25hCollecting typer<0.4.0,>=0.3.0\n", " Downloading typer-0.3.2-py3-none-any.whl (21 kB)\n", "Collecting catalogue<2.1.0,>=2.0.4\n", " Downloading catalogue-2.0.4-py3-none-any.whl (16 kB)\n", "Collecting thinc<8.1.0,>=8.0.8\n", " Downloading thinc-8.0.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (621 kB)\n", "\u001b[K |████████████████████████████████| 621 kB 41.2 MB/s \n", "\u001b[?25hRequirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (1.0.5)\n", "Collecting spacy-legacy<3.1.0,>=3.0.7\n", " Downloading spacy_legacy-3.0.8-py2.py3-none-any.whl (14 kB)\n", "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy) (2.0.5)\n", "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (2.23.0)\n", "Requirement already satisfied: blis<0.8.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (0.4.1)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from spacy) (21.0)\n", "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy) (57.2.0)\n", "Collecting srsly<3.0.0,>=2.4.1\n", " Downloading srsly-2.4.1-cp37-cp37m-manylinux2014_x86_64.whl (456 kB)\n", "\u001b[K |████████████████████████████████| 456 kB 
31.5 MB/s \n", "\u001b[?25hRequirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from catalogue<2.1.0,>=2.0.4->spacy) (3.5.0)\n", "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->spacy) (2.4.7)\n", "Requirement already satisfied: smart-open<6.0.0,>=5.0.0 in /usr/local/lib/python3.7/dist-packages (from pathy>=0.3.5->spacy) (5.1.0)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (3.0.4)\n", "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2.10)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2021.5.30)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (1.24.3)\n", "Requirement already satisfied: click<7.2.0,>=7.1.1 in /usr/local/lib/python3.7/dist-packages (from typer<0.4.0,>=0.3.0->spacy) (7.1.2)\n", "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from jinja2->spacy) (2.0.1)\n", "Installing collected packages: catalogue, typer, srsly, pydantic, thinc, spacy-legacy, pathy, spacy\n", " Attempting uninstall: catalogue\n", " Found existing installation: catalogue 1.0.0\n", " Uninstalling catalogue-1.0.0:\n", " Successfully uninstalled catalogue-1.0.0\n", " Attempting uninstall: srsly\n", " Found existing installation: srsly 1.0.5\n", " Uninstalling srsly-1.0.5:\n", " Successfully uninstalled srsly-1.0.5\n", " Attempting uninstall: thinc\n", " Found existing installation: thinc 7.4.0\n", " Uninstalling thinc-7.4.0:\n", " Successfully uninstalled thinc-7.4.0\n", " Attempting uninstall: spacy\n", " Found existing installation: spacy 2.2.4\n", " Uninstalling spacy-2.2.4:\n", " 
Successfully uninstalled spacy-2.2.4\n", "Successfully installed catalogue-2.0.4 pathy-0.6.0 pydantic-1.8.2 spacy-3.1.1 spacy-legacy-3.0.8 srsly-2.4.1 thinc-8.0.8 typer-0.3.2\n", "2021-07-23 06:47:16.518013: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n", "Collecting ru-core-news-md==3.1.0\n", " Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_md-3.1.0/ru_core_news_md-3.1.0-py3-none-any.whl (42.7 MB)\n", "\u001b[K |████████████████████████████████| 42.7 MB 1.5 MB/s \n", "\u001b[?25hRequirement already satisfied: spacy<3.2.0,>=3.1.0 in /usr/local/lib/python3.7/dist-packages (from ru-core-news-md==3.1.0) (3.1.1)\n", "Requirement already satisfied: pymorphy2>=0.9 in /usr/local/lib/python3.7/dist-packages (from ru-core-news-md==3.1.0) (0.9.1)\n", "Requirement already satisfied: pymorphy2-dicts-ru<3.0,>=2.4 in /usr/local/lib/python3.7/dist-packages (from pymorphy2>=0.9->ru-core-news-md==3.1.0) (2.4.417127.4579844)\n", "Requirement already satisfied: dawg-python>=0.7.1 in /usr/local/lib/python3.7/dist-packages (from pymorphy2>=0.9->ru-core-news-md==3.1.0) (0.7.2)\n", "Requirement already satisfied: docopt>=0.6 in /usr/local/lib/python3.7/dist-packages (from pymorphy2>=0.9->ru-core-news-md==3.1.0) (0.6.2)\n", "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.7 in /usr/local/lib/python3.7/dist-packages (from spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (3.0.8)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.7/dist-packages (from spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (2.11.3)\n", "Requirement already satisfied: typing-extensions<4.0.0.0,>=3.7.4 in /usr/local/lib/python3.7/dist-packages (from spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (3.7.4.3)\n", "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (2.23.0)\n", "Requirement already 
satisfied: packaging>=20.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (21.0)\n", "Requirement already satisfied: srsly<3.0.0,>=2.4.1 in /usr/local/lib/python3.7/dist-packages (from spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (2.4.1)\n", "Requirement already satisfied: pathy>=0.3.5 in /usr/local/lib/python3.7/dist-packages (from spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (0.6.0)\n", "Requirement already satisfied: blis<0.8.0,>=0.4.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (0.4.1)\n", "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (2.0.5)\n", "Requirement already satisfied: thinc<8.1.0,>=8.0.8 in /usr/local/lib/python3.7/dist-packages (from spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (8.0.8)\n", "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (1.19.5)\n", "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (1.0.5)\n", "Requirement already satisfied: catalogue<2.1.0,>=2.0.4 in /usr/local/lib/python3.7/dist-packages (from spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (2.0.4)\n", "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4 in /usr/local/lib/python3.7/dist-packages (from spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (1.8.2)\n", "Requirement already satisfied: typer<0.4.0,>=0.3.0 in /usr/local/lib/python3.7/dist-packages (from spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (0.3.2)\n", "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (57.2.0)\n", "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.7/dist-packages (from 
spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (4.41.1)\n", "Requirement already satisfied: wasabi<1.1.0,>=0.8.1 in /usr/local/lib/python3.7/dist-packages (from spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (0.8.2)\n", "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (3.0.5)\n", "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from catalogue<2.1.0,>=2.0.4->spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (3.5.0)\n", "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=20.0->spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (2.4.7)\n", "Requirement already satisfied: smart-open<6.0.0,>=5.0.0 in /usr/local/lib/python3.7/dist-packages (from pathy>=0.3.5->spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (5.1.0)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (1.24.3)\n", "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (2.10)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (3.0.4)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (2021.5.30)\n", "Requirement already satisfied: click<7.2.0,>=7.1.1 in /usr/local/lib/python3.7/dist-packages (from typer<0.4.0,>=0.3.0->spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (7.1.2)\n", "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from jinja2->spacy<3.2.0,>=3.1.0->ru-core-news-md==3.1.0) (2.0.1)\n", "Installing collected 
packages: ru-core-news-md\n", "Successfully installed ru-core-news-md-3.1.0\n", "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", "You can now load the package via spacy.load('ru_core_news_md')\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "7_2ji2Cz_SWm" }, "source": [ "import spacy\n", "spacy_nlp = spacy.load('ru_core_news_md')" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "bRgHKpc-BE5g" }, "source": [ "Подробнее про синтаксис зависимостей можно почитать тут:\n", "\n", "https://universaldependencies.org/u/dep\n", "и\n", "https://universaldependencies.org/ru/index.html" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6BbhOOdoAWqv", "outputId": "d79b0243-bf68-4141-8ff1-6b0685298b12" }, "source": [ "doc = spacy_nlp('По утрам я кормлю своего любимого кота Мурзика.')\n", "for token in doc:\n", " print(f'{token.text:12} {token.dep_:10} {token.head}')" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "По case утрам\n", "утрам obl кормлю\n", "я nsubj кормлю\n", "кормлю ROOT кормлю\n", "своего det кота\n", "любимого amod кота\n", "кота obj кормлю\n", "Мурзика appos кота\n", ". 
punct кормлю\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "QQ1pRm5gAbrJ" }, "source": [ "from spacy import displacy" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 420 }, "id": "lVhKTTvTAe-z", "outputId": "ef35e761-40b9-4166-ef91-c1ce0aa58ece" }, "source": [ "displacy.render(doc, style=\"dep\", jupyter=True)" ], "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "text/html": [ "\n", "\n", " По\n", " ADP\n", "\n", "\n", "\n", " утрам\n", " NOUN\n", "\n", "\n", "\n", " я\n", " PRON\n", "\n", "\n", "\n", " кормлю\n", " VERB\n", "\n", "\n", "\n", " своего\n", " DET\n", "\n", "\n", "\n", " любимого\n", " ADJ\n", "\n", "\n", "\n", " кота\n", " NOUN\n", "\n", "\n", "\n", " Мурзика.\n", " PROPN\n", "\n", "\n", "\n", " \n", " \n", " case\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " obl\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " nsubj\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " det\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " amod\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " obj\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " appos\n", " \n", " \n", "\n", "" ], "text/plain": [ "" ] }, "metadata": { "tags": [] } } ] }, { "cell_type": "code", "metadata": { "id": "GUAADLpyKNup" }, "source": [ "" ], "execution_count": null, "outputs": [] } ] }