{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pymorphy2\n",
"import requests\n",
"import json\n",
"import re\n",
"\n",
"morph = pymorphy2.MorphAnalyzer()\n",
"\n",
"token = \"xxxxxxxxxxxxxxxxxxxxx\"  # MegaIndex API token\n",
"ser_id = 174  # ID of the Yandex (St. Petersburg) search engine"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"Pre-fill a key-value dictionary.\n",
"\n",
"Key   - group name\n",
"Value - list of key phrases that belong to that group\n",
"\n",
"Input: a tab-separated txt file ('data_tz.txt') in the format: key phrase -> group\n",
"\"\"\"\n",
"\n",
"item_dict = {}\n",
"\n",
"with open('data_tz.txt') as file:\n",
"    next(file)  # skip the header row\n",
"\n",
"    for line in file:\n",
"        word, group = line.strip().split('\\t')\n",
"        item_dict.setdefault(group, []).append(word)"
]
},
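{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"Illustration: a minimal sketch of the expected 'data_tz.txt' layout and of the\n",
"item_dict it produces. The key phrases and group names below are made-up examples,\n",
"not data from the real file.\n",
"\"\"\"\n",
"\n",
"sample_tz = (\n",
"    'Ключ\\tГруппа\\n'              # header row, skipped by the loader above\n",
"    'купить телефон\\tтелефоны\\n'\n",
"    'телефон недорого\\tтелефоны\\n'\n",
"    'купить ноутбук\\tноутбуки\\n'\n",
")\n",
"\n",
"sample_dict = {}\n",
"for sample_line in sample_tz.splitlines()[1:]:\n",
"    sample_word, sample_group = sample_line.split('\\t')\n",
"    sample_dict.setdefault(sample_group, []).append(sample_word)\n",
"\n",
"print(sample_dict)\n",
"# {'телефоны': ['купить телефон', 'телефон недорого'], 'ноутбуки': ['купить ноутбук']}"
]
},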
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"end\n"
]
}
],
"source": [
"group_word_count_dict = {}\n",
"\n",
"\"\"\"\n",
"For every group, walk through all of its key phrases, split each phrase into words,\n",
"normalize (lemmatize) each word and count the lemmas in a per-group dictionary.\n",
"\"\"\"\n",
"\n",
"for key, value in item_dict.items():\n",
"    group_word_count_dict.setdefault(key, {})\n",
"\n",
"    for item in value:\n",
"\n",
"        for word in re.findall(\"([А-ЯЁа-яё0-9]+(-[А-ЯЁа-яё0-9]+)*)\", item):\n",
"            word = word[0]\n",
"            word = morph.parse(word)[0].normal_form\n",
"            form = morph.parse(word)[0].tag\n",
"\n",
"            # skip pronouns, prepositions, conjunctions, particles and interjections\n",
"            if 'NPRO' in form or 'PREP' in form or 'CONJ' in form or 'PRCL' in form or 'INTJ' in form:\n",
"                continue\n",
"\n",
"            group_word_count_dict[key][word] = group_word_count_dict[key].get(word, 0) + 1\n",
"\n",
"# sort each group's lemma counts in descending order of frequency\n",
"for key, value in group_word_count_dict.items():\n",
"    group_word_count_dict[key] = dict(sorted(value.items(), key=lambda i: i[1], reverse=True))\n",
"\n",
"# print(group_word_count_dict)\n",
"print('end')"
]
},
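{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"Illustration: a minimal sketch of what the lemmatization step above does to a single\n",
"phrase. The phrase is a made-up example; the exact lemmas and grammatical tags depend\n",
"on the pymorphy2 dictionaries installed.\n",
"\"\"\"\n",
"\n",
"for w in re.findall(\"([А-ЯЁа-яё0-9]+(-[А-ЯЁа-яё0-9]+)*)\", 'купить телефон в спб'):\n",
"    lemma = morph.parse(w[0])[0].normal_form\n",
"    tag = morph.parse(lemma)[0].tag\n",
"    # the same part-of-speech filter as above: drop prepositions, conjunctions, etc.\n",
"    skipped = any(g in tag for g in ('NPRO', 'PREP', 'CONJ', 'PRCL', 'INTJ'))\n",
"    print(w[0], '->', lemma, '| skipped:', skipped)"
]
},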
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Request text_score data from the MegaIndex API and parse the response\n",
"\n",
"def megaindex_text_score(key):\n",
"\n",
"    try:\n",
"        url = 'http://api.megaindex.com/visrep/text_score?key={}&words={}&ser_id={}'.format(token, key, ser_id)\n",
"        r = requests.get(url)\n",
"        json_string = r.text\n",
"        parsed_string = json.loads(json_string)['data']\n",
"        list_base_urls = parsed_string['serps'][0]['base_urls']\n",
"        symbols_median = parsed_string['old_api']['fragments']['long']['symbols_median']\n",
"\n",
"    except Exception as ex_megaindex:\n",
"        print(\"API megaindex request error: {error}\".format(error=ex_megaindex))\n",
"        list_base_urls = ['No data received']\n",
"        symbols_median = 0\n",
"\n",
"    return list_base_urls, symbols_median"
]
},
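{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"Illustration: how megaindex_text_score() is meant to be called. It performs a live\n",
"HTTP request, so it only runs once a real MegaIndex token is configured above; the\n",
"query phrase is a made-up example. On any error the function falls back to\n",
"(['No data received'], 0).\n",
"\"\"\"\n",
"\n",
"if not token.startswith('xxxxx'):  # skip while the placeholder token is still set\n",
"    urls, median = megaindex_text_score('купить телефон')\n",
"    print(median)    # median competitor text length, in characters\n",
"    print(urls[:3])  # first few competitor URLs from the SERP"
]
},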
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Re-create the output file and write the header row\n",
"with open('group_word_lemma.txt', 'w') as f:\n",
"    f.write('Group' + '\\t' + 'Competitors' + '\\t' + 'Median symbols' + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"with open('group_word_lemma.txt', 'a') as f:\n",
"\n",
"    for key_dict, value_dict in group_word_count_dict.items():\n",
"\n",
"        base_urls, symbols_median = megaindex_text_score(key_dict)\n",
"\n",
"        if symbols_median < 8000:  # cap on the median number of characters\n",
"\n",
"            # print(key_dict, base_urls, symbols_median)\n",
"\n",
"            f.write('{}\\t{}\\t{}\\n\\n'.format(key_dict, base_urls, symbols_median))\n",
"            f.write('Lemma' + '\\t' + 'Occurrences' + '\\n')\n",
"\n",
"            for key, value in value_dict.items():\n",
"                # print(key, value)\n",
"                f.write('{}\\t{}\\n'.format(key, value))\n",
"            f.write('\\n' + '\\n' + '\\n')\n",
"\n",
"print('end')"
]
},
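{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"Illustration: peek at the first lines of the generated report. The file is\n",
"tab-separated: a group row (group name, competitor URLs, median symbols), a\n",
"'Lemma / Occurrences' header, one line per lemma, then blank-line separators.\n",
"\"\"\"\n",
"\n",
"with open('group_word_lemma.txt') as report:\n",
"    for report_line in report.readlines()[:15]:\n",
"        print(report_line.rstrip())"
]
},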
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}