Skip to content

Instantly share code, notes, and snippets.

@avidale
Created February 10, 2022 11:43
Show Gist options
  • Save avidale/70045403f6b5b678974956641c86f659 to your computer and use it in GitHub Desktop.
Save avidale/70045403f6b5b678974956641c86f659 to your computer and use it in GitHub Desktop.
conceptnet5_russified.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 507,
"id": "f1018ba7",
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"dataset = load_dataset(\"conceptnet5\", \"conceptnet5\", streaming=True)"
]
},
{
"cell_type": "markdown",
"id": "5e7392f4",
"metadata": {},
"source": [
"Напрямую между русскими понятиями интересных отношений - очень немного. \n",
"\n",
"Но можно для каждого интересного отношения между нерусскими понятиями найти русские синонимы, и таким образом собрать русский concept-net. "
]
},
{
"cell_type": "code",
"execution_count": 148,
"id": "1f7b0c6b",
"metadata": {},
"outputs": [],
"source": [
"from tqdm.auto import tqdm, trange\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 150,
"id": "42d9093c",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "7a0616f04240429ea14a4890eebdf0ea",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"0it [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"cn_ru = []\n",
"\n",
"tq = tqdm(dataset['train'])\n",
"for i, item in enumerate(tq):\n",
" if i % 1000 == 0:\n",
" tq.set_description(f'{len(cn_ru)} / {i} / {item[\"lang\"]}')\n",
" if item['lang'] and 'ru' in item['lang']:\n",
" cn_ru.append(item)"
]
},
{
"cell_type": "code",
"execution_count": 152,
"id": "962bb06f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('ru', 480208),\n",
" ('ru/en.wiktionary.org', 374041),\n",
" ('ru/fr.wiktionary.org', 175954),\n",
" ('ru/en', 158924),\n",
" ('en/ru', 71414),\n",
" ('ru/ru.dbpedia.org', 32302),\n",
" ('ru/fr', 22531),\n",
" ('ja/ru', 13107),\n",
" ('de/ru', 12705),\n",
" ('fr/ru', 9553),\n",
" ('rup', 6654),\n",
" ('mul/ru', 5933),\n",
" ('rup/en', 5029),\n",
" ('rup/en.wiktionary.org', 2758),\n",
" ('rup/la', 2426),\n",
" ('rup/ro', 2085),\n",
" ('en/rup', 1886),\n",
" ('ru/de', 1683),\n",
" ('ru/orv', 1069),\n",
" ('ru/la', 925)]"
]
},
"execution_count": 152,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Counter(item['lang'] for item in cn_ru).most_common(20)"
]
},
{
"cell_type": "code",
"execution_count": 153,
"id": "111139f5",
"metadata": {},
"outputs": [],
"source": [
"import random"
]
},
{
"cell_type": "code",
"execution_count": 502,
"id": "39aa1b05",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'sentence': '',\n",
" 'extra_info': '{\"dataset\": \"/d/wiktionary/en\", \"license\": \"cc:by-sa/4.0\", \"sources\": [{\"contributor\": \"/s/resource/wiktionary/en\", \"process\": \"/s/process/wikiparsec/2\"}], \"weight\": 1.0}\\n',\n",
" 'rel': '/r/Synonym',\n",
" 'weight': 1.0,\n",
" 'arg1': \"/c/en/i_don't_care\",\n",
" 'arg2': '/c/ru/мне_без_разницы',\n",
" 'full_rel': \"/a/[/r/Synonym/,/c/en/i_don't_care/,/c/ru/мне_без_разницы/]\",\n",
" 'lang': 'en/ru'}"
]
},
"execution_count": 502,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"random.choice(cn_ru)"
]
},
{
"cell_type": "code",
"execution_count": 158,
"id": "61ab6fb3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('/r/ExternalURL', 585310),\n",
" ('/r/FormOf', 344356),\n",
" ('/r/RelatedTo', 234778),\n",
" ('/r/Synonym', 155380),\n",
" ('/r/DerivedFrom', 28476),\n",
" ('/r/EtymologicallyRelatedTo', 17807),\n",
" ('/r/HasContext', 12816),\n",
" ('/r/EtymologicallyDerivedFrom', 10355),\n",
" ('/r/SymbolOf', 5886),\n",
" ('/r/IsA', 1682),\n",
" ('/r/Antonym', 1365),\n",
" ('/r/DistinctFrom', 846),\n",
" ('/r/SimilarTo', 171),\n",
" ('/r/PartOf', 16),\n",
" ('/r/MannerOf', 1)]"
]
},
"execution_count": 158,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Counter(item['rel'] for item in cn_ru).most_common(20)"
]
},
{
"cell_type": "code",
"execution_count": 180,
"id": "894edd09",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"793022"
]
},
"execution_count": 180,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subset = [item for item in cn_ru if item['rel'] != '/r/ExternalURL' and 'ru' in item['lang'].split('/')]\n",
"len(subset)"
]
},
{
"cell_type": "code",
"execution_count": 271,
"id": "485ea93b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"480208\n"
]
},
{
"data": {
"text/plain": [
"[('/r/FormOf', 343855),\n",
" ('/r/RelatedTo', 91119),\n",
" ('/r/DerivedFrom', 27581),\n",
" ('/r/Synonym', 11167),\n",
" ('/r/EtymologicallyRelatedTo', 2469),\n",
" ('/r/IsA', 1676),\n",
" ('/r/Antonym', 1310),\n",
" ('/r/DistinctFrom', 842),\n",
" ('/r/SimilarTo', 171),\n",
" ('/r/PartOf', 16),\n",
" ('/r/EtymologicallyDerivedFrom', 1),\n",
" ('/r/MannerOf', 1)]"
]
},
"execution_count": 271,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subset = [item for item in cn_ru if item['rel'] != '/r/ExternalURL' and item['lang'] == 'ru']\n",
"print(len(subset))\n",
"Counter(item['rel'] for item in subset).most_common(20)"
]
},
{
"cell_type": "markdown",
"id": "e15574c8",
"metadata": {},
"source": [
"```\n",
"[('/r/FormOf', 343855), - чисто грамматика\n",
" ('/r/RelatedTo', 91119), - обычно однокоренные слова\n",
" ('/r/DerivedFrom', 27581), - однокоренные слова\n",
" + ('/r/Synonym', 11167), - в основно, нормальные синонимы\n",
" ('/r/EtymologicallyRelatedTo', 2469),\n",
" + ('/r/IsA', 1676), - норм гиперонимы\n",
" + ('/r/Antonym', 1310), - норм антонимы\n",
" + ('/r/DistinctFrom', 842), - норм когипонимы и квази-антонимы\n",
" + ('/r/SimilarTo', 171), - норм когипонимы\n",
" ('/r/PartOf', 16),\n",
" ('/r/EtymologicallyDerivedFrom', 1),\n",
" ('/r/MannerOf', 1)]\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 928,
"id": "2a3c3b6c",
"metadata": {},
"outputs": [],
"source": [
"good_russian_relations = ['/r/Synonym', '/r/IsA', '/r/Antonym', '/r/DistinctFrom', '/r/SimilarTo']"
]
},
{
"cell_type": "code",
"execution_count": 925,
"id": "1705261c",
"metadata": {},
"outputs": [],
"source": [
"subset = [item for item in cn_ru if item['rel'] == '/r/SimilarTo' and item['lang'] == 'ru']"
]
},
{
"cell_type": "code",
"execution_count": 926,
"id": "c3a52fc9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'sentence': '',\n",
" 'extra_info': '{\"dataset\": \"/d/wiktionary/en\", \"license\": \"cc:by-sa/4.0\", \"sources\": [{\"contributor\": \"/s/resource/wiktionary/en\", \"process\": \"/s/process/wikiparsec/2\"}], \"weight\": 1.0}\\n',\n",
" 'rel': '/r/SimilarTo',\n",
" 'weight': 1.0,\n",
" 'arg1': '/c/ru/двоечник/n',\n",
" 'arg2': '/c/ru/четвёрочник',\n",
" 'full_rel': '/a/[/r/SimilarTo/,/c/ru/двоечник/n/,/c/ru/четвёрочник/]',\n",
" 'lang': 'ru'}"
]
},
"execution_count": 926,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"random.choice(subset)"
]
},
{
"cell_type": "code",
"execution_count": 726,
"id": "85c16c5a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"234778\n"
]
},
{
"data": {
"text/plain": [
"[('/r/RelatedTo', 234778)]"
]
},
"execution_count": 726,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subset = [item for item in cn_ru if item['rel'] == ]\n",
"print(len(subset))\n",
"Counter(item['rel'] for item in subset).most_common(20)"
]
},
{
"cell_type": "code",
"execution_count": 504,
"id": "88109998",
"metadata": {},
"outputs": [],
"source": [
"from collections import defaultdict"
]
},
{
"cell_type": "code",
"execution_count": 506,
"id": "68b1b725",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6c6026dd1b284557996281e2172acf35",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1399245 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"104937\n"
]
}
],
"source": [
"russian_synonyms = defaultdict(set)\n",
"\n",
"for item in tqdm(cn_ru):\n",
" if item['rel'] != '/r/Synonym':\n",
" continue\n",
" ru1 = 'ru' in item['arg1'].split('/')\n",
" ru2 = 'ru' in item['arg2'].split('/')\n",
" if ru1 and not ru2:\n",
" russian_synonyms[item['arg2']].add(item['arg1'])\n",
" elif ru2 and not ru1:\n",
" russian_synonyms[item['arg1']].add(item['arg2'])\n",
"print(len(russian_synonyms))"
]
},
{
"cell_type": "code",
"execution_count": 637,
"id": "eb3fffbe",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6096786923a64cf0af1106509afd3bc2",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1399245 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"957314"
]
},
"execution_count": 637,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"russian_counts = Counter()\n",
"for item in tqdm(cn_ru):\n",
" for a in [item['arg1'], item['arg2']]:\n",
" if 'ru' in a.split('/'):\n",
" russian_counts[a] += 1\n",
"len(russian_counts)"
]
},
{
"cell_type": "code",
"execution_count": 638,
"id": "edaf8d1c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('/c/ru/вода/n', 256),\n",
" ('/c/ru/россия/n', 194),\n",
" ('/c/ru/спасибо', 167),\n",
" ('/c/ru/один', 162),\n",
" ('/c/ru/рвать', 155),\n",
" ('/c/ru/ходить/v', 144),\n",
" ('/c/ru/бить/v', 141),\n",
" ('/c/ru/читать/v', 140),\n",
" ('/c/ru/валить', 129),\n",
" ('/c/ru/двигать', 122)]"
]
},
"execution_count": 638,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"russian_counts.most_common(10)"
]
},
{
"cell_type": "code",
"execution_count": 1039,
"id": "aad5e1bf",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['/c/ru/полицай',\n",
" '/c/ru/милиционер',\n",
" '/c/ru/полицейский',\n",
" '/c/ru/коп',\n",
" '/c/ru/мент']"
]
},
"execution_count": 1039,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"\n",
"def concept_to_rus(concept, max_size=5, sample=True):\n",
" if concept not in russian_synonyms:\n",
" return []\n",
" items = list(russian_synonyms[concept])\n",
" if not sample or len(items) <= max_size:\n",
" return sorted(items, key=lambda x: russian_counts[x], reverse=True)[:max_size]\n",
" p = np.array([russian_counts[item] for item in items])\n",
" p = p / p.sum()\n",
" return [items[i] for i in np.random.choice(len(items), size=max_size, replace=False, p=p)]\n",
"\n",
"concept_to_rus('/c/en/police_officer/n')"
]
},
{
"cell_type": "code",
"execution_count": 511,
"id": "06f77bf3",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "38369a92df30412bb0571e636c9cd1a6",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"0it [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"cn_translatable = []\n",
"\n",
"tq = tqdm(dataset['train'])\n",
"for i, item in enumerate(tq):\n",
" if i % 1000 == 0:\n",
" tq.set_description(f'{len(cn_translatable)} / {i} / {item[\"lang\"]}')\n",
" if item['arg1'] in russian_synonyms and item['arg2'] in russian_synonyms:\n",
" cn_translatable.append(item)"
]
},
{
"cell_type": "code",
"execution_count": 621,
"id": "fc91e609",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('/r/RelatedTo', 118364),\n",
" ('/r/Synonym', 61264),\n",
" ('/r/IsA', 21010),\n",
" ('/r/HasContext', 18906),\n",
" ('/r/DerivedFrom', 5645),\n",
" ('/r/AtLocation', 4860),\n",
" ('/r/dbpedia/genre', 2134),\n",
" ('/r/dbpedia/genus', 1929),\n",
" ('/r/Antonym', 1608),\n",
" ('/r/EtymologicallyRelatedTo', 1569),\n",
" ('/r/PartOf', 1351),\n",
" ('/r/dbpedia/influencedBy', 1237),\n",
" ('/r/DistinctFrom', 1012),\n",
" ('/r/InstanceOf', 760),\n",
" ('/r/UsedFor', 682),\n",
" ('/r/SimilarTo', 669),\n",
" ('/r/dbpedia/language', 648),\n",
" ('/r/dbpedia/occupation', 610),\n",
" ('/r/dbpedia/field', 561),\n",
" ('/r/dbpedia/knownFor', 423),\n",
" ('/r/HasProperty', 330),\n",
" ('/r/CapableOf', 284),\n",
" ('/r/FormOf', 280),\n",
" ('/r/dbpedia/product', 278),\n",
" ('/r/dbpedia/capital', 248),\n",
" ('/r/EtymologicallyDerivedFrom', 228),\n",
" ('/r/Desires', 201),\n",
" ('/r/HasA', 145),\n",
" ('/r/NotDesires', 132),\n",
" ('/r/HasPrerequisite', 123),\n",
" ('/r/Causes', 119),\n",
" ('/r/MadeOf', 114),\n",
" ('/r/HasSubevent', 111),\n",
" ('/r/MotivatedByGoal', 110),\n",
" ('/r/dbpedia/leader', 78),\n",
" ('/r/CausesDesire', 58),\n",
" ('/r/CreatedBy', 45),\n",
" ('/r/LocatedNear', 18),\n",
" ('/r/HasFirstSubevent', 16),\n",
" ('/r/NotHasProperty', 16)]"
]
},
"execution_count": 621,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Counter(item['rel'] for item in cn_translatable).most_common(40)"
]
},
{
"cell_type": "code",
"execution_count": 596,
"id": "3530f329",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/c/en/earth /c/en/mars\n",
"{'/c/ru/земля'} {'/c/ru/марс'}\n",
"\n",
"/c/en/globular_cluster /c/en/galaxy\n",
"{'/c/ru/шаровое_звёздное_скопление'} {'/c/ru/галактика'}\n",
"\n",
"/c/en/vacuum /c/en/outer_space\n",
"{'/c/ru/вакуум'} {'/c/ru/космическое_пространство'}\n",
"\n",
"/c/en/mosquito /c/en/water\n",
"{'/c/ru/комары'} {'/c/ru/вода', '/c/ru/вода/n'}\n",
"\n",
"/c/en/helium /c/en/star\n",
"{'/c/ru/гелий'} {'/c/ru/звезда'}\n",
"\n",
"/c/en/mosquito /c/en/water\n",
"{'/c/ru/комары'} {'/c/ru/вода', '/c/ru/вода/n'}\n",
"\n",
"/c/en/globular_cluster /c/en/galaxy\n",
"{'/c/ru/шаровое_звёздное_скопление'} {'/c/ru/галактика'}\n",
"\n",
"/c/en/beaver /c/en/dam\n",
"{'/c/ru/бобры'} {'/c/ru/плотина'}\n",
"\n",
"/c/en/squirrel /c/en/tree\n",
"{'/c/ru/беличьи'} {'/c/ru/дерево'}\n",
"\n",
"/c/en/mosquito /c/en/water\n",
"{'/c/ru/комары'} {'/c/ru/вода', '/c/ru/вода/n'}\n",
"\n"
]
}
],
"source": [
"subset = [item for item in cn_translatable if item['rel'] == '/r/LocatedNear']\n",
"for i in range(10):\n",
" item = random.choice(subset)\n",
" print(item['arg1'], item['arg2'])\n",
" print(russian_synonyms[item['arg1']], russian_synonyms[item['arg2']])\n",
" print()"
]
},
{
"cell_type": "code",
"execution_count": 996,
"id": "5ecb328f",
"metadata": {},
"outputs": [],
"source": [
"relations_filter = {\n",
" '/r/RelatedTo': 'связано с', # произвольные ассоциации\n",
" '/r/Synonym': 'синоним', # обычно это одно и то же слово\n",
" '/r/IsA': '– это', # гипоним - гипероним\n",
" '/r/HasContext': 'относится к теме', # предмет - тематика\n",
" '/r/DerivedFrom': None, # этимология; описывает не понятие, а слово \n",
" '/r/AtLocation': 'находится в',\n",
" '/r/dbpedia/genre': 'относится к жанру', # музыкальные жанры\n",
" '/r/dbpedia/genus': 'относитcя к роду', # биологичесий род\n",
" '/r/Antonym': 'это антоним',\n",
" '/r/EtymologicallyRelatedTo': None,\n",
" '/r/PartOf': 'это часть',\n",
" '/r/dbpedia/influencedBy': 'испытал влияние', # связи творческих людей\n",
" '/r/DistinctFrom': 'отличается от', # ко-гипонимы\n",
" '/r/InstanceOf': '– это один из',\n",
" '/r/UsedFor': 'используется для',\n",
" '/r/SimilarTo': 'похож на',\n",
" '/r/dbpedia/language': None, # объект и его язык - но очень много путаницы\n",
" '/r/dbpedia/occupation': 'по профессии', # шумноватая категория\n",
" '/r/dbpedia/field': 'знаменит в области', # человек и область\n",
" '/r/dbpedia/knownFor': 'известен благодаря',\n",
" '/r/HasProperty': 'обладает свойством',\n",
" '/r/CapableOf': 'может', # предполагается, что слева - глагол\n",
" '/r/FormOf': None, # шумная категория\n",
" '/r/dbpedia/product': 'производит', # компания - продукт\n",
" '/r/dbpedia/capital': 'имеет в качестве столицы',\n",
" '/r/EtymologicallyDerivedFrom': None,\n",
" '/r/Desires': 'хочет',\n",
" '/r/HasA': 'имеет',\n",
" '/r/NotDesires': 'не хочет',\n",
" '/r/HasPrerequisite': 'требует', # для Х нужно Y\n",
" '/r/Causes': 'причиняет',\n",
" '/r/MadeOf': 'сделан из',\n",
" '/r/HasSubevent': None, # шумная категория\n",
" '/r/MotivatedByGoal': 'можно ради', # глагол, ради которого другой глагол\n",
" '/r/dbpedia/leader': None, # не очень информативно, территория - правитель\n",
" '/r/CausesDesire': 'вызывает желание',\n",
" '/r/CreatedBy': 'создан', # икс создан игреком (или иногда - из игрека)\n",
"}\n",
"filtered_relations = {k: v for k, v in relations_filter.items() if v}\n",
"assert len(filtered_relations) == len(set(filtered_relations.values()))"
]
},
{
"cell_type": "code",
"execution_count": 997,
"id": "6a3f5b41",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"нормальный связано с производная функции\n",
"бес связано с демонология\n",
"beretta ar 70 – это один из автомат\n",
"лёгкий сон связано с сон\n",
"досаждать синоним чума\n"
]
}
],
"source": [
"for i in range(10):\n",
" item = random.choice(cn_translatable)\n",
" if item['rel'] not in filtered_relations:\n",
" continue\n",
" a1 = random.choice(list(russian_synonyms[item['arg1']])).split('/')[3].replace('_', ' ')\n",
" a2 = random.choice(list(russian_synonyms[item['arg2']])).split('/')[3].replace('_', ' ')\n",
" if a1 == a2:\n",
" continue\n",
" r = filtered_relations[item['rel']]\n",
" print(a1, r, a2)"
]
},
{
"cell_type": "code",
"execution_count": 1047,
"id": "27bf9d2c",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "fa09f870ebfb491a8dc9878805d768e7",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1399245 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"15166\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5e8f49245e12414bad3e7672ccd4b80e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/1399245 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"168940\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4296de5d01064b11a4bb61b9f508f3e5",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/248217 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"543200\n"
]
}
],
"source": [
"russified_conceptnet = []\n",
"for item in tqdm(cn_ru):\n",
" if item['rel'] in good_russian_relations:\n",
" if 'ru' in item['arg1'].split('/') and 'ru' in item['arg2'].split('/'):\n",
" russified_conceptnet.append({\n",
" 'arg1': item['arg1'],\n",
" 'arg2': item['arg2'],\n",
" 'rel': item['rel'],\n",
" 'source': 'original',\n",
" 'source_triplet': [item['arg1'], item['rel'], item['arg2']],\n",
" })\n",
"print(len(russified_conceptnet))\n",
"\n",
"for item in tqdm(cn_ru):\n",
" if item['rel'] not in filtered_relations:\n",
" continue\n",
" for a1 in concept_to_rus(item['arg1']) + ([item['arg1']] if 'ru' in item['arg1'].split('/') else []):\n",
" for a2 in concept_to_rus(item['arg2']) + ([item['arg2']] if 'ru' in item['arg2'].split('/') else []):\n",
" if a1 == a2:\n",
" continue\n",
" if a1 == item['arg1'] and a2 == item['arg2']:\n",
" continue\n",
" russified_conceptnet.append({\n",
" 'arg1': a1,\n",
" 'arg2': a2,\n",
" 'rel': item['rel'],\n",
" 'source': 'half_translated',\n",
" 'source_triplet': [item['arg1'], item['rel'], item['arg2']],\n",
" })\n",
"print(len(russified_conceptnet))\n",
"\n",
"for item in tqdm(cn_translatable):\n",
" if item['rel'] not in filtered_relations:\n",
" continue\n",
" for a1 in concept_to_rus(item['arg1']):\n",
" for a2 in concept_to_rus(item['arg2']):\n",
" if a1 == a2:\n",
" continue\n",
" russified_conceptnet.append({\n",
" 'arg1': a1,\n",
" 'arg2': a2,\n",
" 'rel': item['rel'],\n",
" 'source': 'translated',\n",
" 'source_triplet': [item['arg1'], item['rel'], item['arg2']],\n",
" })\n",
"print(len(russified_conceptnet))"
]
},
{
"cell_type": "code",
"execution_count": 1144,
"id": "ed2038b0",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"{'arg1': '/c/ru/бес',\n",
" 'arg2': '/c/ru/сверхъестественное',\n",
" 'rel': '/r/RelatedTo',\n",
" 'source': 'translated',\n",
" 'source_triplet': ['/c/en/demon/n', '/r/RelatedTo', '/c/en/supernatural']}"
]
},
"execution_count": 1144,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"random.choice(russified_conceptnet)"
]
},
{
"cell_type": "code",
"execution_count": 1151,
"id": "b4c16907",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('/r/RelatedTo', 251251),\n",
" ('/r/Synonym', 168700),\n",
" ('/r/IsA', 54001),\n",
" ('/r/HasContext', 38493),\n",
" ('/r/AtLocation', 6791),\n",
" ('/r/Antonym', 4875),\n",
" ('/r/DistinctFrom', 2862),\n",
" ('/r/dbpedia/genre', 2270),\n",
" ('/r/dbpedia/genus', 1949),\n",
" ('/r/PartOf', 1947),\n",
" ('/r/SimilarTo', 1363),\n",
" ('/r/dbpedia/influencedBy', 1289),\n",
" ('/r/UsedFor', 1185),\n",
" ('/r/InstanceOf', 910),\n",
" ('/r/dbpedia/occupation', 751),\n",
" ('/r/HasProperty', 670),\n",
" ('/r/CapableOf', 571),\n",
" ('/r/dbpedia/field', 567),\n",
" ('/r/dbpedia/knownFor', 435),\n",
" ('/r/Desires', 330),\n",
" ('/r/dbpedia/product', 328),\n",
" ('/r/MotivatedByGoal', 271),\n",
" ('/r/dbpedia/capital', 259),\n",
" ('/r/HasA', 224),\n",
" ('/r/HasPrerequisite', 211),\n",
" ('/r/MadeOf', 183),\n",
" ('/r/Causes', 174),\n",
" ('/r/NotDesires', 165),\n",
" ('/r/CausesDesire', 114),\n",
" ('/r/CreatedBy', 61)]"
]
},
"execution_count": 1151,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Counter(item['rel'] for item in russified_conceptnet).most_common(30)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "a5b5ff8f",
"metadata": {},
"outputs": [],
"source": [
"# todo: complete these templates using tools such as spacy/natasha and pymorphy2 and tons of rules\n",
"\n",
"def to_instrumental(x):\n",
" return x\n",
"\n",
"def adjust_short(v, x):\n",
" return v\n",
"\n",
"def adjust_verb(v, x):\n",
" return v\n",
"\n",
"def to_genitive(y):\n",
" return y\n",
"\n",
"def to_locative(y):\n",
" return y\n",
"\n",
"def to_accusative(y):\n",
" return y\n",
"\n",
"def to_dative(y):\n",
" return y\n",
"\n",
"def to_multiple(y):\n",
" return y"
]
},
{
"cell_type": "code",
"execution_count": 1224,
"id": "a2d5db32",
"metadata": {},
"outputs": [],
"source": [
"verbalizers = {\n",
" '/r/RelatedTo': lambda x, y: f'{x} {adjust_short(\"связан\", v)} с {to_instrumental(y)}',\n",
" '/r/Synonym': lambda x, y: f'\"{x}\" – синоним \"{to_genitive(y)}\"', # обычно это одно и то же слово\n",
" '/r/IsA': lambda x, y: f'{x} – это {y}', # '– это', # гипоним - гипероним\n",
" '/r/HasContext': lambda x, y: f'{x} относится к теме \"{y}\"', # предмет - тематика\n",
" '/r/AtLocation': lambda x, y: f'{x} находится в {to_locative(y)}',\n",
" '/r/dbpedia/genre': lambda x, y: f'{x} относится к жанру {y}', # музыкальные жанры\n",
" '/r/dbpedia/genus': lambda x, y: f'{x} относитcя к роду {y}', # биологичесий род\n",
" '/r/Antonym': lambda x, y: f'\"{x}\" – это антоним \"{y}\"',\n",
" '/r/PartOf': lambda x, y: f'{x} – это часть {to_genitive(y)}',\n",
" '/r/dbpedia/influencedBy': lambda x, y: f'{x} испытал влияние {to_genitive(y)}', # связи творческих людей\n",
" '/r/DistinctFrom': lambda x, y: f'{x} отличается от {to_genitive(y)}', # ко-гипонимы\n",
" '/r/InstanceOf': lambda x, y: f'{x} – это один из {to_genitive(to_multiple(y))}',\n",
" '/r/UsedFor': lambda x, y: f'{x} используется для {to_genitive(y)}',\n",
" '/r/SimilarTo': lambda x, y: f'{x} {adjust_short(\"похож\", x)} на {to_genitive(y)}',\n",
" '/r/dbpedia/occupation': lambda x, y: f'{x} по профессии {y}', # шумноватая категория\n",
" '/r/dbpedia/field': lambda x, y: f'{x} {adjust_short(\"знаменит\", x)} в области {to_genitive(y)}', # человек и область\n",
" '/r/dbpedia/knownFor': lambda x, y: f'{x} {adjust_short(\"известен\", x)} благодаря {to_dative(y)}',\n",
" '/r/HasProperty': lambda x, y: f'{x} обладает свойством {y}',\n",
" '/r/CapableOf': lambda x, y: f'{x} {adjust_verb(\"может\", x)} {y}', # предполагается, что слева - глагол\n",
" '/r/dbpedia/product': lambda x, y: f'{x} {adjust_verb(\"производит\", x)} {to_accusative(y)}', # компания - продукт\n",
" '/r/dbpedia/capital': lambda x, y: f'{y} – столица {to_genitive(y)}',\n",
" '/r/Desires': lambda x, y: f'{x} {adjust_verb(\"хочет\", x)} {to_accusative(y)}',\n",
" '/r/HasA': lambda x, y: f'{x} {adjust_verb(\"имеет\", x)} {to_accusative(y)}',\n",
" '/r/NotDesires': lambda x, y: f'{x} не {adjust_verb(\"хочет\", x)} {to_accusative(y)}',\n",
" '/r/HasPrerequisite': lambda x, y: f'{x} {adjust_verb(\"требует\", x)} {to_accusative(y)}', # для Х нужно Y\n",
" '/r/Causes': lambda x, y: f'{x} {adjust_verb(\"причиняет\", x)} {to_accusative(y)}',\n",
" '/r/MadeOf': lambda x, y: f'{x} {adjust_short(\"сделан\", x)} из {to_genitive(y)}',\n",
" '/r/MotivatedByGoal': lambda x, y: f'{x} можно ради {y}', # глагол, ради которого другой глагол\n",
" '/r/CausesDesire': lambda x, y: f'{x} вызывает желание {y}',\n",
" '/r/CreatedBy': lambda x, y: f'{x} {adjust_short(\"создан\", x)} {to_instrumental(y)}', # икс создан игреком (или иногда - из игрека)\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 1260,
"id": "f1328ec7",
"metadata": {},
"outputs": [],
"source": [
"for item in russified_conceptnet:\n",
" text = verbalizers[item['rel']](item['arg1'].split('/')[3].replace('_', ' '), item['arg2'].split('/')[3].replace('_', ' '))\n",
" item['sentence_dirty'] = text "
]
},
{
"cell_type": "code",
"execution_count": 1295,
"id": "83b04788",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'arg1': '/c/ru/заглавная_страница', 'arg2': '/c/ru/автор/n', 'rel': '/r/RelatedTo', 'source': 'translated', 'source_triplet': ['/c/en/title_page/n', '/r/RelatedTo', '/c/en/author'], 'sentence_dirty': 'заглавная страница связан с автор'}\n",
"заглавная страница связан с автор\n"
]
}
],
"source": [
"item = random.choice(russified_conceptnet)\n",
"print(item)\n",
"print(verbalizers[item['rel']](item['arg1'].split('/')[3].replace('_', ' '), item['arg2'].split('/')[3].replace('_', ' ')))"
]
},
{
"cell_type": "code",
"execution_count": 1296,
"id": "be9d88f1",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0d293709c4ac46219697b0d98cfed963",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/543200 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import json\n",
"with open('conceptnet5_russified.jsonl', 'w') as f:\n",
" for i, item in enumerate(tqdm(russified_conceptnet)):\n",
" json.dump(item, f, ensure_ascii=False)\n",
" f.write('\\n')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment