avidale/conceptnet5_russified.ipynb

## conceptnet5_russified.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 507,
   "id": "f1018ba7",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "dataset = load_dataset(\"conceptnet5\", \"conceptnet5\", streaming=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5e7392f4",
   "metadata": {},
   "source": [
    "Напрямую между русскими понятиями интересных отношений - очень немного. \n",
    "\n",
    "Но можно для каждого интересного отношения между нерусскими понятиями найти русские синонимы, и таким образом собрать русский concept-net. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "id": "1f7b0c6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm.auto import tqdm, trange\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "id": "42d9093c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7a0616f04240429ea14a4890eebdf0ea",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "cn_ru = []\n",
    "\n",
    "tq = tqdm(dataset['train'])\n",
    "for i, item in enumerate(tq):\n",
    "    if i % 1000 == 0:\n",
    "        tq.set_description(f'{len(cn_ru)} / {i} / {item[\"lang\"]}')\n",
    "    if item['lang'] and 'ru' in item['lang']:\n",
    "        cn_ru.append(item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "id": "962bb06f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('ru', 480208),\n",
       " ('ru/en.wiktionary.org', 374041),\n",
       " ('ru/fr.wiktionary.org', 175954),\n",
       " ('ru/en', 158924),\n",
       " ('en/ru', 71414),\n",
       " ('ru/ru.dbpedia.org', 32302),\n",
       " ('ru/fr', 22531),\n",
       " ('ja/ru', 13107),\n",
       " ('de/ru', 12705),\n",
       " ('fr/ru', 9553),\n",
       " ('rup', 6654),\n",
       " ('mul/ru', 5933),\n",
       " ('rup/en', 5029),\n",
       " ('rup/en.wiktionary.org', 2758),\n",
       " ('rup/la', 2426),\n",
       " ('rup/ro', 2085),\n",
       " ('en/rup', 1886),\n",
       " ('ru/de', 1683),\n",
       " ('ru/orv', 1069),\n",
       " ('ru/la', 925)]"
      ]
     },
     "execution_count": 152,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Counter(item['lang'] for item in cn_ru).most_common(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "id": "111139f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 502,
   "id": "39aa1b05",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'sentence': '',\n",
       " 'extra_info': '{\"dataset\": \"/d/wiktionary/en\", \"license\": \"cc:by-sa/4.0\", \"sources\": [{\"contributor\": \"/s/resource/wiktionary/en\", \"process\": \"/s/process/wikiparsec/2\"}], \"weight\": 1.0}\\n',\n",
       " 'rel': '/r/Synonym',\n",
       " 'weight': 1.0,\n",
       " 'arg1': \"/c/en/i_don't_care\",\n",
       " 'arg2': '/c/ru/мне_без_разницы',\n",
       " 'full_rel': \"/a/[/r/Synonym/,/c/en/i_don't_care/,/c/ru/мне_без_разницы/]\",\n",
       " 'lang': 'en/ru'}"
      ]
     },
     "execution_count": 502,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "random.choice(cn_ru)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "id": "61ab6fb3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('/r/ExternalURL', 585310),\n",
       " ('/r/FormOf', 344356),\n",
       " ('/r/RelatedTo', 234778),\n",
       " ('/r/Synonym', 155380),\n",
       " ('/r/DerivedFrom', 28476),\n",
       " ('/r/EtymologicallyRelatedTo', 17807),\n",
       " ('/r/HasContext', 12816),\n",
       " ('/r/EtymologicallyDerivedFrom', 10355),\n",
       " ('/r/SymbolOf', 5886),\n",
       " ('/r/IsA', 1682),\n",
       " ('/r/Antonym', 1365),\n",
       " ('/r/DistinctFrom', 846),\n",
       " ('/r/SimilarTo', 171),\n",
       " ('/r/PartOf', 16),\n",
       " ('/r/MannerOf', 1)]"
      ]
     },
     "execution_count": 158,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Counter(item['rel'] for item in cn_ru).most_common(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 180,
   "id": "894edd09",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "793022"
      ]
     },
     "execution_count": 180,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "subset = [item for item in cn_ru if item['rel'] != '/r/ExternalURL' and 'ru' in item['lang'].split('/')]\n",
    "len(subset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 271,
   "id": "485ea93b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "480208\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('/r/FormOf', 343855),\n",
       " ('/r/RelatedTo', 91119),\n",
       " ('/r/DerivedFrom', 27581),\n",
       " ('/r/Synonym', 11167),\n",
       " ('/r/EtymologicallyRelatedTo', 2469),\n",
       " ('/r/IsA', 1676),\n",
       " ('/r/Antonym', 1310),\n",
       " ('/r/DistinctFrom', 842),\n",
       " ('/r/SimilarTo', 171),\n",
       " ('/r/PartOf', 16),\n",
       " ('/r/EtymologicallyDerivedFrom', 1),\n",
       " ('/r/MannerOf', 1)]"
      ]
     },
     "execution_count": 271,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "subset = [item for item in cn_ru if item['rel'] != '/r/ExternalURL' and item['lang'] == 'ru']\n",
    "print(len(subset))\n",
    "Counter(item['rel'] for item in subset).most_common(20)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e15574c8",
   "metadata": {},
   "source": [
    "```\n",
    "[('/r/FormOf', 343855), - чисто грамматика\n",
    " ('/r/RelatedTo', 91119), - обычно однокоренные слова\n",
    " ('/r/DerivedFrom', 27581), - однокоренные слова\n",
    " + ('/r/Synonym', 11167), - в основно, нормальные синонимы\n",
    " ('/r/EtymologicallyRelatedTo', 2469),\n",
    " + ('/r/IsA', 1676), - норм гиперонимы\n",
    " + ('/r/Antonym', 1310), - норм антонимы\n",
    " + ('/r/DistinctFrom', 842), - норм когипонимы и квази-антонимы\n",
    " + ('/r/SimilarTo', 171), - норм когипонимы\n",
    " ('/r/PartOf', 16),\n",
    " ('/r/EtymologicallyDerivedFrom', 1),\n",
    " ('/r/MannerOf', 1)]\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 928,
   "id": "2a3c3b6c",
   "metadata": {},
   "outputs": [],
   "source": [
    "good_russian_relations = ['/r/Synonym', '/r/IsA', '/r/Antonym', '/r/DistinctFrom', '/r/SimilarTo']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 925,
   "id": "1705261c",
   "metadata": {},
   "outputs": [],
   "source": [
    "subset = [item for item in cn_ru if item['rel'] == '/r/SimilarTo' and item['lang'] == 'ru']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 926,
   "id": "c3a52fc9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'sentence': '',\n",
       " 'extra_info': '{\"dataset\": \"/d/wiktionary/en\", \"license\": \"cc:by-sa/4.0\", \"sources\": [{\"contributor\": \"/s/resource/wiktionary/en\", \"process\": \"/s/process/wikiparsec/2\"}], \"weight\": 1.0}\\n',\n",
       " 'rel': '/r/SimilarTo',\n",
       " 'weight': 1.0,\n",
       " 'arg1': '/c/ru/двоечник/n',\n",
       " 'arg2': '/c/ru/четвёрочник',\n",
       " 'full_rel': '/a/[/r/SimilarTo/,/c/ru/двоечник/n/,/c/ru/четвёрочник/]',\n",
       " 'lang': 'ru'}"
      ]
     },
     "execution_count": 926,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "random.choice(subset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 726,
   "id": "85c16c5a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "234778\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('/r/RelatedTo', 234778)]"
      ]
     },
     "execution_count": 726,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "subset = [item for item in cn_ru if item['rel'] == ]\n",
    "print(len(subset))\n",
    "Counter(item['rel'] for item in subset).most_common(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 504,
   "id": "88109998",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 506,
   "id": "68b1b725",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6c6026dd1b284557996281e2172acf35",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1399245 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "104937\n"
     ]
    }
   ],
   "source": [
    "russian_synonyms = defaultdict(set)\n",
    "\n",
    "for item in tqdm(cn_ru):\n",
    "    if item['rel'] != '/r/Synonym':\n",
    "        continue\n",
    "    ru1 = 'ru' in item['arg1'].split('/')\n",
    "    ru2 = 'ru' in item['arg2'].split('/')\n",
    "    if ru1 and not ru2:\n",
    "        russian_synonyms[item['arg2']].add(item['arg1'])\n",
    "    elif ru2 and not ru1:\n",
    "        russian_synonyms[item['arg1']].add(item['arg2'])\n",
    "print(len(russian_synonyms))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 637,
   "id": "eb3fffbe",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6096786923a64cf0af1106509afd3bc2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1399245 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "957314"
      ]
     },
     "execution_count": 637,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "russian_counts = Counter()\n",
    "for item in tqdm(cn_ru):\n",
    "    for a in [item['arg1'], item['arg2']]:\n",
    "        if 'ru' in a.split('/'):\n",
    "            russian_counts[a] += 1\n",
    "len(russian_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 638,
   "id": "edaf8d1c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('/c/ru/вода/n', 256),\n",
       " ('/c/ru/россия/n', 194),\n",
       " ('/c/ru/спасибо', 167),\n",
       " ('/c/ru/один', 162),\n",
       " ('/c/ru/рвать', 155),\n",
       " ('/c/ru/ходить/v', 144),\n",
       " ('/c/ru/бить/v', 141),\n",
       " ('/c/ru/читать/v', 140),\n",
       " ('/c/ru/валить', 129),\n",
       " ('/c/ru/двигать', 122)]"
      ]
     },
     "execution_count": 638,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "russian_counts.most_common(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1039,
   "id": "aad5e1bf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['/c/ru/полицай',\n",
       " '/c/ru/милиционер',\n",
       " '/c/ru/полицейский',\n",
       " '/c/ru/коп',\n",
       " '/c/ru/мент']"
      ]
     },
     "execution_count": 1039,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "def concept_to_rus(concept, max_size=5, sample=True):\n",
    "    if concept not in russian_synonyms:\n",
    "        return []\n",
    "    items = list(russian_synonyms[concept])\n",
    "    if not sample or len(items) <= max_size:\n",
    "        return sorted(items, key=lambda x: russian_counts[x], reverse=True)[:max_size]\n",
    "    p = np.array([russian_counts[item] for item in items])\n",
    "    p = p / p.sum()\n",
    "    return [items[i] for i in np.random.choice(len(items), size=max_size, replace=False, p=p)]\n",
    "\n",
    "concept_to_rus('/c/en/police_officer/n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 511,
   "id": "06f77bf3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "38369a92df30412bb0571e636c9cd1a6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "cn_translatable = []\n",
    "\n",
    "tq = tqdm(dataset['train'])\n",
    "for i, item in enumerate(tq):\n",
    "    if i % 1000 == 0:\n",
    "        tq.set_description(f'{len(cn_translatable)} / {i} / {item[\"lang\"]}')\n",
    "    if item['arg1'] in russian_synonyms and item['arg2'] in russian_synonyms:\n",
    "        cn_translatable.append(item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 621,
   "id": "fc91e609",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('/r/RelatedTo', 118364),\n",
       " ('/r/Synonym', 61264),\n",
       " ('/r/IsA', 21010),\n",
       " ('/r/HasContext', 18906),\n",
       " ('/r/DerivedFrom', 5645),\n",
       " ('/r/AtLocation', 4860),\n",
       " ('/r/dbpedia/genre', 2134),\n",
       " ('/r/dbpedia/genus', 1929),\n",
       " ('/r/Antonym', 1608),\n",
       " ('/r/EtymologicallyRelatedTo', 1569),\n",
       " ('/r/PartOf', 1351),\n",
       " ('/r/dbpedia/influencedBy', 1237),\n",
       " ('/r/DistinctFrom', 1012),\n",
       " ('/r/InstanceOf', 760),\n",
       " ('/r/UsedFor', 682),\n",
       " ('/r/SimilarTo', 669),\n",
       " ('/r/dbpedia/language', 648),\n",
       " ('/r/dbpedia/occupation', 610),\n",
       " ('/r/dbpedia/field', 561),\n",
       " ('/r/dbpedia/knownFor', 423),\n",
       " ('/r/HasProperty', 330),\n",
       " ('/r/CapableOf', 284),\n",
       " ('/r/FormOf', 280),\n",
       " ('/r/dbpedia/product', 278),\n",
       " ('/r/dbpedia/capital', 248),\n",
       " ('/r/EtymologicallyDerivedFrom', 228),\n",
       " ('/r/Desires', 201),\n",
       " ('/r/HasA', 145),\n",
       " ('/r/NotDesires', 132),\n",
       " ('/r/HasPrerequisite', 123),\n",
       " ('/r/Causes', 119),\n",
       " ('/r/MadeOf', 114),\n",
       " ('/r/HasSubevent', 111),\n",
       " ('/r/MotivatedByGoal', 110),\n",
       " ('/r/dbpedia/leader', 78),\n",
       " ('/r/CausesDesire', 58),\n",
       " ('/r/CreatedBy', 45),\n",
       " ('/r/LocatedNear', 18),\n",
       " ('/r/HasFirstSubevent', 16),\n",
       " ('/r/NotHasProperty', 16)]"
      ]
     },
     "execution_count": 621,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Counter(item['rel'] for item in cn_translatable).most_common(40)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 596,
   "id": "3530f329",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/c/en/earth /c/en/mars\n",
      "{'/c/ru/земля'} {'/c/ru/марс'}\n",
      "\n",
      "/c/en/globular_cluster /c/en/galaxy\n",
      "{'/c/ru/шаровое_звёздное_скопление'} {'/c/ru/галактика'}\n",
      "\n",
      "/c/en/vacuum /c/en/outer_space\n",
      "{'/c/ru/вакуум'} {'/c/ru/космическое_пространство'}\n",
      "\n",
      "/c/en/mosquito /c/en/water\n",
      "{'/c/ru/комары'} {'/c/ru/вода', '/c/ru/вода/n'}\n",
      "\n",
      "/c/en/helium /c/en/star\n",
      "{'/c/ru/гелий'} {'/c/ru/звезда'}\n",
      "\n",
      "/c/en/mosquito /c/en/water\n",
      "{'/c/ru/комары'} {'/c/ru/вода', '/c/ru/вода/n'}\n",
      "\n",
      "/c/en/globular_cluster /c/en/galaxy\n",
      "{'/c/ru/шаровое_звёздное_скопление'} {'/c/ru/галактика'}\n",
      "\n",
      "/c/en/beaver /c/en/dam\n",
      "{'/c/ru/бобры'} {'/c/ru/плотина'}\n",
      "\n",
      "/c/en/squirrel /c/en/tree\n",
      "{'/c/ru/беличьи'} {'/c/ru/дерево'}\n",
      "\n",
      "/c/en/mosquito /c/en/water\n",
      "{'/c/ru/комары'} {'/c/ru/вода', '/c/ru/вода/n'}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "subset = [item for item in cn_translatable if item['rel'] == '/r/LocatedNear']\n",
    "for i in range(10):\n",
    "    item = random.choice(subset)\n",
    "    print(item['arg1'], item['arg2'])\n",
    "    print(russian_synonyms[item['arg1']], russian_synonyms[item['arg2']])\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 996,
   "id": "5ecb328f",
   "metadata": {},
   "outputs": [],
   "source": [
    "relations_filter = {\n",
    "    '/r/RelatedTo': 'связано с',  # произвольные ассоциации\n",
    "    '/r/Synonym': 'синоним', # обычно это одно и то же слово\n",
    "    '/r/IsA': '– это', # гипоним - гипероним\n",
    "    '/r/HasContext': 'относится к теме', # предмет - тематика\n",
    "    '/r/DerivedFrom': None, # этимология; описывает не понятие, а слово \n",
    "    '/r/AtLocation': 'находится в',\n",
    "    '/r/dbpedia/genre': 'относится к жанру', # музыкальные жанры\n",
    "    '/r/dbpedia/genus': 'относитcя к роду', # биологичесий род\n",
    "    '/r/Antonym': 'это антоним',\n",
    "    '/r/EtymologicallyRelatedTo': None,\n",
    "    '/r/PartOf': 'это часть',\n",
    "    '/r/dbpedia/influencedBy': 'испытал влияние', # связи творческих людей\n",
    "    '/r/DistinctFrom': 'отличается от', # ко-гипонимы\n",
    "    '/r/InstanceOf': '– это один из',\n",
    "    '/r/UsedFor': 'используется для',\n",
    "    '/r/SimilarTo': 'похож на',\n",
    "    '/r/dbpedia/language': None,  # объект и его язык - но очень много путаницы\n",
    "    '/r/dbpedia/occupation': 'по профессии', # шумноватая категория\n",
    "    '/r/dbpedia/field': 'знаменит в области', # человек и область\n",
    "    '/r/dbpedia/knownFor': 'известен благодаря',\n",
    "    '/r/HasProperty': 'обладает свойством',\n",
    "    '/r/CapableOf': 'может', # предполагается, что слева - глагол\n",
    "    '/r/FormOf': None, # шумная категория\n",
    "    '/r/dbpedia/product': 'производит', # компания - продукт\n",
    "    '/r/dbpedia/capital': 'имеет в качестве столицы',\n",
    "    '/r/EtymologicallyDerivedFrom': None,\n",
    "    '/r/Desires': 'хочет',\n",
    "    '/r/HasA': 'имеет',\n",
    "    '/r/NotDesires': 'не хочет',\n",
    "    '/r/HasPrerequisite': 'требует', # для Х нужно Y\n",
    "    '/r/Causes': 'причиняет',\n",
    "    '/r/MadeOf': 'сделан из',\n",
    "    '/r/HasSubevent': None, # шумная категория\n",
    "    '/r/MotivatedByGoal': 'можно ради', # глагол, ради которого другой глагол\n",
    "    '/r/dbpedia/leader': None, # не очень информативно, территория - правитель\n",
    "    '/r/CausesDesire': 'вызывает желание',\n",
    "    '/r/CreatedBy': 'создан', # икс создан игреком (или иногда - из игрека)\n",
    "}\n",
    "filtered_relations = {k: v for k, v in relations_filter.items() if v}\n",
    "assert len(filtered_relations) == len(set(filtered_relations.values()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 997,
   "id": "6a3f5b41",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "нормальный связано с производная функции\n",
      "бес связано с демонология\n",
      "beretta ar 70 – это один из автомат\n",
      "лёгкий сон связано с сон\n",
      "досаждать синоним чума\n"
     ]
    }
   ],
   "source": [
    "for i in range(10):\n",
    "    item = random.choice(cn_translatable)\n",
    "    if item['rel'] not in filtered_relations:\n",
    "        continue\n",
    "    a1 = random.choice(list(russian_synonyms[item['arg1']])).split('/')[3].replace('_', ' ')\n",
    "    a2 = random.choice(list(russian_synonyms[item['arg2']])).split('/')[3].replace('_', ' ')\n",
    "    if a1 == a2:\n",
    "        continue\n",
    "    r = filtered_relations[item['rel']]\n",
    "    print(a1, r, a2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1047,
   "id": "27bf9d2c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fa09f870ebfb491a8dc9878805d768e7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1399245 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "15166\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5e8f49245e12414bad3e7672ccd4b80e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1399245 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "168940\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4296de5d01064b11a4bb61b9f508f3e5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/248217 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "543200\n"
     ]
    }
   ],
   "source": [
    "russified_conceptnet = []\n",
    "for item in tqdm(cn_ru):\n",
    "    if item['rel'] in good_russian_relations:\n",
    "        if 'ru' in item['arg1'].split('/') and 'ru' in item['arg2'].split('/'):\n",
    "            russified_conceptnet.append({\n",
    "                'arg1': item['arg1'],\n",
    "                'arg2': item['arg2'],\n",
    "                'rel': item['rel'],\n",
    "                'source': 'original',\n",
    "                'source_triplet': [item['arg1'], item['rel'], item['arg2']],\n",
    "            })\n",
    "print(len(russified_conceptnet))\n",
    "\n",
    "for item in tqdm(cn_ru):\n",
    "    if item['rel'] not in filtered_relations:\n",
    "        continue\n",
    "    for a1 in concept_to_rus(item['arg1']) + ([item['arg1']] if 'ru' in item['arg1'].split('/') else []):\n",
    "        for a2 in concept_to_rus(item['arg2']) + ([item['arg2']] if 'ru' in item['arg2'].split('/') else []):\n",
    "            if a1 == a2:\n",
    "                continue\n",
    "            if a1 == item['arg1'] and a2 == item['arg2']:\n",
    "                continue\n",
    "            russified_conceptnet.append({\n",
    "                'arg1': a1,\n",
    "                'arg2': a2,\n",
    "                'rel': item['rel'],\n",
    "                'source': 'half_translated',\n",
    "                'source_triplet': [item['arg1'], item['rel'], item['arg2']],\n",
    "            })\n",
    "print(len(russified_conceptnet))\n",
    "\n",
    "for item in tqdm(cn_translatable):\n",
    "    if item['rel'] not in filtered_relations:\n",
    "        continue\n",
    "    for a1 in concept_to_rus(item['arg1']):\n",
    "        for a2 in concept_to_rus(item['arg2']):\n",
    "            if a1 == a2:\n",
    "                continue\n",
    "            russified_conceptnet.append({\n",
    "                'arg1': a1,\n",
    "                'arg2': a2,\n",
    "                'rel': item['rel'],\n",
    "                'source': 'translated',\n",
    "                'source_triplet': [item['arg1'], item['rel'], item['arg2']],\n",
    "            })\n",
    "print(len(russified_conceptnet))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1144,
   "id": "ed2038b0",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'arg1': '/c/ru/бес',\n",
       " 'arg2': '/c/ru/сверхъестественное',\n",
       " 'rel': '/r/RelatedTo',\n",
       " 'source': 'translated',\n",
       " 'source_triplet': ['/c/en/demon/n', '/r/RelatedTo', '/c/en/supernatural']}"
      ]
     },
     "execution_count": 1144,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "random.choice(russified_conceptnet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1151,
   "id": "b4c16907",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('/r/RelatedTo', 251251),\n",
       " ('/r/Synonym', 168700),\n",
       " ('/r/IsA', 54001),\n",
       " ('/r/HasContext', 38493),\n",
       " ('/r/AtLocation', 6791),\n",
       " ('/r/Antonym', 4875),\n",
       " ('/r/DistinctFrom', 2862),\n",
       " ('/r/dbpedia/genre', 2270),\n",
       " ('/r/dbpedia/genus', 1949),\n",
       " ('/r/PartOf', 1947),\n",
       " ('/r/SimilarTo', 1363),\n",
       " ('/r/dbpedia/influencedBy', 1289),\n",
       " ('/r/UsedFor', 1185),\n",
       " ('/r/InstanceOf', 910),\n",
       " ('/r/dbpedia/occupation', 751),\n",
       " ('/r/HasProperty', 670),\n",
       " ('/r/CapableOf', 571),\n",
       " ('/r/dbpedia/field', 567),\n",
       " ('/r/dbpedia/knownFor', 435),\n",
       " ('/r/Desires', 330),\n",
       " ('/r/dbpedia/product', 328),\n",
       " ('/r/MotivatedByGoal', 271),\n",
       " ('/r/dbpedia/capital', 259),\n",
       " ('/r/HasA', 224),\n",
       " ('/r/HasPrerequisite', 211),\n",
       " ('/r/MadeOf', 183),\n",
       " ('/r/Causes', 174),\n",
       " ('/r/NotDesires', 165),\n",
       " ('/r/CausesDesire', 114),\n",
       " ('/r/CreatedBy', 61)]"
      ]
     },
     "execution_count": 1151,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Counter(item['rel'] for item in russified_conceptnet).most_common(30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a5b5ff8f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# todo: complete these templates using tools such as spacy/natasha and pymorphy2 and tons of rules\n",
    "\n",
    "def to_instrumental(x):\n",
    "    return x\n",
    "\n",
    "def adjust_short(v, x):\n",
    "    return v\n",
    "\n",
    "def adjust_verb(v, x):\n",
    "    return v\n",
    "\n",
    "def to_genitive(y):\n",
    "    return y\n",
    "\n",
    "def to_locative(y):\n",
    "    return y\n",
    "\n",
    "def to_accusative(y):\n",
    "    return y\n",
    "\n",
    "def to_dative(y):\n",
    "    return y\n",
    "\n",
    "def to_multiple(y):\n",
    "    return y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1224,
   "id": "a2d5db32",
   "metadata": {},
   "outputs": [],
   "source": [
    "verbalizers = {\n",
    "    '/r/RelatedTo': lambda x, y: f'{x} {adjust_short(\"связан\", v)} с {to_instrumental(y)}',\n",
    "    '/r/Synonym': lambda x, y: f'\"{x}\" – синоним \"{to_genitive(y)}\"', # обычно это одно и то же слово\n",
    "    '/r/IsA': lambda x, y: f'{x} – это {y}', # '– это', # гипоним - гипероним\n",
    "    '/r/HasContext': lambda x, y: f'{x} относится к теме \"{y}\"', # предмет - тематика\n",
    "    '/r/AtLocation': lambda x, y: f'{x} находится в {to_locative(y)}',\n",
    "    '/r/dbpedia/genre': lambda x, y: f'{x} относится к жанру {y}', # музыкальные жанры\n",
    "    '/r/dbpedia/genus': lambda x, y: f'{x} относитcя к роду {y}', # биологичесий род\n",
    "    '/r/Antonym': lambda x, y: f'\"{x}\" – это антоним \"{y}\"',\n",
    "    '/r/PartOf': lambda x, y: f'{x} – это часть {to_genitive(y)}',\n",
    "    '/r/dbpedia/influencedBy': lambda x, y: f'{x} испытал влияние {to_genitive(y)}', # связи творческих людей\n",
    "    '/r/DistinctFrom': lambda x, y: f'{x} отличается от {to_genitive(y)}', # ко-гипонимы\n",
    "    '/r/InstanceOf': lambda x, y: f'{x} – это один из {to_genitive(to_multiple(y))}',\n",
    "    '/r/UsedFor': lambda x, y: f'{x} используется для {to_genitive(y)}',\n",
    "    '/r/SimilarTo': lambda x, y: f'{x} {adjust_short(\"похож\", x)} на {to_genitive(y)}',\n",
    "    '/r/dbpedia/occupation': lambda x, y: f'{x} по профессии {y}', # шумноватая категория\n",
    "    '/r/dbpedia/field': lambda x, y: f'{x} {adjust_short(\"знаменит\", x)} в области {to_genitive(y)}', # человек и область\n",
    "    '/r/dbpedia/knownFor': lambda x, y: f'{x} {adjust_short(\"известен\", x)} благодаря {to_dative(y)}',\n",
    "    '/r/HasProperty':  lambda x, y: f'{x} обладает свойством {y}',\n",
    "    '/r/CapableOf': lambda x, y: f'{x} {adjust_verb(\"может\", x)} {y}', # предполагается, что слева - глагол\n",
    "    '/r/dbpedia/product': lambda x, y: f'{x} {adjust_verb(\"производит\", x)} {to_accusative(y)}', # компания - продукт\n",
    "    '/r/dbpedia/capital': lambda x, y: f'{y} – столица {to_genitive(y)}',\n",
    "    '/r/Desires': lambda x, y: f'{x} {adjust_verb(\"хочет\", x)} {to_accusative(y)}',\n",
    "    '/r/HasA': lambda x, y: f'{x} {adjust_verb(\"имеет\", x)} {to_accusative(y)}',\n",
    "    '/r/NotDesires': lambda x, y: f'{x} не {adjust_verb(\"хочет\", x)} {to_accusative(y)}',\n",
    "    '/r/HasPrerequisite': lambda x, y: f'{x} {adjust_verb(\"требует\", x)} {to_accusative(y)}', # для Х нужно Y\n",
    "    '/r/Causes': lambda x, y: f'{x} {adjust_verb(\"причиняет\", x)} {to_accusative(y)}',\n",
    "    '/r/MadeOf': lambda x, y: f'{x} {adjust_short(\"сделан\", x)} из {to_genitive(y)}',\n",
    "    '/r/MotivatedByGoal': lambda x, y: f'{x} можно ради {y}', # глагол, ради которого другой глагол\n",
    "    '/r/CausesDesire': lambda x, y: f'{x} вызывает желание {y}',\n",
    "    '/r/CreatedBy':  lambda x, y: f'{x} {adjust_short(\"создан\", x)} {to_instrumental(y)}', # икс создан игреком (или иногда - из игрека)\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1260,
   "id": "f1328ec7",
   "metadata": {},
   "outputs": [],
   "source": [
    "for item in russified_conceptnet:\n",
    "    text = verbalizers[item['rel']](item['arg1'].split('/')[3].replace('_', ' '), item['arg2'].split('/')[3].replace('_', ' '))\n",
    "    item['sentence_dirty'] = text "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1295,
   "id": "83b04788",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'arg1': '/c/ru/заглавная_страница', 'arg2': '/c/ru/автор/n', 'rel': '/r/RelatedTo', 'source': 'translated', 'source_triplet': ['/c/en/title_page/n', '/r/RelatedTo', '/c/en/author'], 'sentence_dirty': 'заглавная страница связан с автор'}\n",
      "заглавная страница связан с автор\n"
     ]
    }
   ],
   "source": [
    "item = random.choice(russified_conceptnet)\n",
    "print(item)\n",
    "print(verbalizers[item['rel']](item['arg1'].split('/')[3].replace('_', ' '), item['arg2'].split('/')[3].replace('_', ' ')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1296,
   "id": "be9d88f1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0d293709c4ac46219697b0d98cfed963",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/543200 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import json\n",
    "with open('conceptnet5_russified.jsonl', 'w') as f:\n",
    "    for i, item in enumerate(tqdm(russified_conceptnet)):\n",
    "        json.dump(item, f, ensure_ascii=False)\n",
    "        f.write('\\n')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}