{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import json\n",
    "import pymorphy2\n",
    "import re\n",
    "import urllib.request as urlrequest\n",
    "from urllib.parse import urlencode\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You will need:\n",
    " - token - a MegaIndex API token (https://ru.megaindex.com/api)\n",
    " - ser_id - the region the data will be collected for\n",
    " - keywords_list - a list of keywords to fetch data for"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "token = \"xxxxxxxxxxxxxxxxxxx\"\n",
    "ser_id = 174  # search engine ID: Yandex, St. Petersburg\n",
    "keywords_list = ['main marker query for article #1', 'main marker query for article #2', 'main marker query for article #3']\n",
    "\n",
    "morph = pymorphy2.MorphAnalyzer()  # create a pymorphy2 instance; we will need it later for morphological analysis"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To collect keywords for the marker queries we use the url_keywords method of the Serpstat API (https://serpstat.com/ru/api/117-url-adresa-organicheskih-slov-urlkeywords/). This method returns the keyword phrases for which a given URL ranks in the search engine's top results.\n",
    "\n",
    "We take the sample code from the documentation and wrap it in the serpstat_keywords function. Substitute your own values for \"token\" and for the region \"se\" the data will be collected for. The list of available regions is here: https://serpstat.com/ru/api/272-spisok-dostupnih-baz-databasesinfo/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "def serpstat_keywords(url):\n",
    "    # fetch the keyword phrases a URL ranks for via the Serpstat url_keywords method\n",
    "    host = 'http://api.serpstat.com/v3'\n",
    "    method = 'url_keywords'\n",
    "    params = {\n",
    "        'query': url,  # URL to get info for\n",
    "        'se': 'y_213',  # search engine: y_2 - St. Petersburg, y_213 - Moscow\n",
    "        'token': 'xxxxxxxxxxxxxxxxxxx',  # personal API token\n",
    "    }\n",
    "\n",
    "    api_url = \"{host}/{method}?{params}\".format(\n",
    "        host=host,\n",
    "        method=method,\n",
    "        params=urlencode(params)\n",
    "    )\n",
    "\n",
    "    try:\n",
    "        json_data = urlrequest.urlopen(api_url).read()\n",
    "    except Exception as e0:\n",
    "        print(\"API request error: {error}\".format(error=e0))\n",
    "        return None  # bail out here: json_data is undefined after a failed request\n",
    "\n",
    "    return json.loads(json_data)"
   ]
  },
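  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check for serpstat_keywords (a sketch, not part of the original run: the URL is a placeholder, and the result -> hits -> keyword path is the same one megaindex_text_score relies on below):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# data = serpstat_keywords('https://example.com/some-article')  # placeholder URL\n",
    "# if data is not None:  # None means the request failed\n",
    "#     for hit in data['result']['hits']:\n",
    "#         print(hit['keyword'])"
   ]
  },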
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Split the source phrase into words with a regular expression.\n",
    "\n",
    "Lemmatize each word, check its part of speech and add it to the result list. Return the finished list.\n",
    "\n",
    "! Keep in mind that pymorphy2 only works with Russian.\n",
    "\n",
    "Words in other languages are skipped (the regex below only captures Cyrillic words and digits).\n",
    "\"\"\"\n",
    "\n",
    "def morph_word_lemma(key):\n",
    "\n",
    "    # parts of speech to drop: pronouns, prepositions, conjunctions, particles, interjections\n",
    "    # grammeme reference: https://pymorphy2.readthedocs.io/en/latest/user/grammemes.html\n",
    "    skip_pos = ['NPRO', 'PREP', 'CONJ', 'PRCL', 'INTJ']\n",
    "    reswords = []\n",
    "\n",
    "    for word in re.findall(\"([А-ЯЁа-яё0-9]+(-[А-ЯЁа-яё0-9]+)*)\", key):  # split the phrase into words\n",
    "        word = word[0]  # re.findall returns tuples because of the nested group\n",
    "        parsed = morph.parse(word)[0]\n",
    "\n",
    "        if parsed.tag.POS not in skip_pos:\n",
    "            reswords.append(parsed.normal_form)\n",
    "\n",
    "    return reswords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# example: mixed-language input; only Russian content words survive\n",
    "# result = morph_word_lemma('В этом texts слова на разных языках')\n",
    "# print(result)\n",
    "\n",
    "# ['слово', 'разный', 'язык']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "Build a dictionary of the form \"lemma: number of mentions\", sorted by frequency.\n",
    "\"\"\"\n",
    "\n",
    "def counter_dict_list(list_values):\n",
    "\n",
    "    all_lemmas = []\n",
    "\n",
    "    for phrase in list_values:\n",
    "        all_lemmas.extend(morph_word_lemma(phrase))\n",
    "\n",
    "    # most_common() yields (lemma, count) pairs sorted by count, descending\n",
    "    return dict(Counter(all_lemmas).most_common())"
   ]
  },
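  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "What counter_dict_list returns, sketched on two made-up phrases (the phrases and counts are illustrative, not from the original notebook):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# lemmas = counter_dict_list(['купить зимние шины', 'шины зимние недорого'])\n",
    "# print(lemmas)\n",
    "\n",
    "# {'шина': 2, 'зимний': 2, 'купить': 1, 'недорого': 1}"
   ]
  },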
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# truncate the output file and write the header row\n",
    "with open('api.txt', 'w') as f:\n",
    "    f.write(\"key\" + '\\t' + \"base_urls\" + '\\t' + 'symbols_median' + '\\t' + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch data from the MegaIndex API and parse the response\n",
    "\n",
    "def megaindex_text_score(key):\n",
    "\n",
    "    keyword_list = []\n",
    "\n",
    "    try:\n",
    "        url = 'http://api.megaindex.com/visrep/text_score?key={}&words={}&ser_id={}'.format(token, key, ser_id)\n",
    "        r = requests.get(url)\n",
    "        parsed_string = json.loads(r.text)['data']\n",
    "        list_base_urls = parsed_string['serps'][0]['base_urls']\n",
    "        symbols_median = parsed_string['old_api']['fragments']['long']['symbols_median']\n",
    "    except Exception as ex_megaindex:\n",
    "        print(\"API megaindex request error: {error}\".format(error=ex_megaindex))\n",
    "        list_base_urls = []\n",
    "        symbols_median = 'No data received'\n",
    "\n",
    "    for url in list_base_urls:\n",
    "        url = url.replace('http:', 'https:')\n",
    "        data = serpstat_keywords(url)\n",
    "\n",
    "        try:\n",
    "            for keyword in data['result']['hits']:\n",
    "                keyword_list.append(keyword['keyword'])\n",
    "        except (TypeError, KeyError):  # no response or unexpected response shape\n",
    "            pass\n",
    "\n",
    "    uniq_keyword_list = list(set(keyword_list))  # deduplicate the collected keywords\n",
    "    count_lemma = counter_dict_list(uniq_keyword_list)\n",
    "\n",
    "    return list_base_urls, symbols_median, count_lemma"
   ]
  },
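  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To inspect the three return values for a single query before running the full loop below (a sketch; the query string is a placeholder):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# urls, median, lemmas = megaindex_text_score('main marker query for article #1')  # placeholder query\n",
    "# print(urls)\n",
    "# print(median)\n",
    "# print(list(lemmas.items())[:10])  # ten most frequent lemmas"
   ]
  },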
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total briefs to be generated: 7\n",
      "морфологический анализ ИМЯ СУЩЕСТВИТЕЛЬНОЕ\n",
      "морфологический анализ ГЛАГОЛ\n",
      "морфологический анализ ДЕЕПРИЧАСТИЕ\n",
      "морфологический анализ ИМЯ ПРИЛАГАТЕЛЬНОЕ\n",
      "морфологический анализ МЕСТОИМЕНИЕ-СУЩЕСТВИТЕЛЬНОЕ\n",
      "морфологический анализ НАРЕЧИЕ\n",
      "морфологический анализ ПРИЧАСТИЕ\n",
      "end\n"
     ]
    }
   ],
   "source": [
    "print('Total briefs to be generated:', len(keywords_list))\n",
    "\n",
    "for keywords in keywords_list:\n",
    "    print(keywords)\n",
    "\n",
    "    try:\n",
    "        list_base_urls, symbols_median, count_lemma = megaindex_text_score(keywords)\n",
    "    except Exception as ex:\n",
    "        print(f'Error: {ex}')\n",
    "        continue  # skip this keyword: the variables below would be undefined\n",
    "\n",
    "    with open('api.txt', 'a') as f:\n",
    "        f.write('{}\\t{}\\t{}\\t\\n\\n'.format(keywords, list_base_urls, symbols_median))\n",
    "        f.write('Lemma' + '\\t' + 'Mention count' + '\\n')\n",
    "\n",
    "        for key, value in count_lemma.items():\n",
    "            f.write('{}\\t{}\\n'.format(key, value))\n",
    "        f.write('\\n' + '\\n' + '\\n')\n",
    "\n",
    "print('end')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}