{
"cells": [
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import json\n",
"import pymorphy2\n",
"import re\n",
"import urllib.request as urlrequest\n",
"from urllib.parse import urlencode\n",
"from collections import Counter"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Для работы понадобятся:\n",
" - token - токен api megaindex (https://ru.megaindex.com/api)\n",
" - ser_id - регион, по которому будут сниматься данные\n",
" - keywords_list - словарь ключевых слов, для которых ьудем получать данные"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"token = \"xxxxxxxxxxxxxxxxxxx\"\n",
"ser_id = 174 #ID поисковой системы яндекс_спб\n",
"keywords_list = ['основной маркерный запрос статьи №1', 'основной маркерный запрос статьи №2', 'основной маркерный запрос статьи №3']\n",
"\n",
"morph = pymorphy2.MorphAnalyzer() # создаем экземпляр pymorphy2, понадобится нам дальше для морфологического анализа"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Для получения ключевых слов по нужным нам маркерным запросам будем использовать метод url_keywords API Serpstat (https://serpstat.com/ru/api/117-url-adresa-organicheskih-slov-urlkeywords/). Данный метод возвращает ключевые фразы в топе поисковой системы по заданному URL.\n",
"\n",
"Для работы берем пример кода из документации и оборачиваем его в функцию serpstat_keywords. Подставляем свои значения для \"token\" и региону \"se\", по которому будем получать данные. Получить список регионов можно здесь https://serpstat.com/ru/api/272-spisok-dostupnih-baz-databasesinfo/"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"def serpstat_keywords(url):\n",
" \n",
" host = 'http://api.serpstat.com/v3'\n",
" method = 'url_keywords'\n",
" params = {\n",
" 'query': '{}'.format(url), # string for get info\n",
" 'se': 'y_213', # string search engine, y_2 - спб, y_213 - мск\n",
" 'token': 'xxxxxxxxxxxxxxxxxxx', # string personal token\n",
" }\n",
"\n",
" api_url = \"{host}/{method}?{params}\".format(\n",
" host=host,\n",
" method=method,\n",
" params=urlencode(params)\n",
" )\n",
"\n",
" try:\n",
" json_data = urlrequest.urlopen(api_url).read()\n",
" except Exception as e0:\n",
" print(\"API request error: {error}\".format(error=e0))\n",
" pass\n",
" \n",
" data = json.loads(json_data)\n",
"\n",
" return data"
]
},
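{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick, hypothetical sanity check of serpstat_keywords: the URL below is a placeholder, and the response shape ('result' -> 'hits' -> 'keyword') is the one the parsing code further down relies on."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# data = serpstat_keywords('https://example.com/some-article/')\n",
"# for hit in data.get('result', {}).get('hits', []):\n",
"#     print(hit['keyword'])"
]
},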
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"Используя регулярное выражение разбиваем исходную фразу на слова. \n",
"\n",
"Каждое слово лемматизируем, проверяем на часть речи и добавляем в результирующий список. Возвращаем готовый список.\n",
"\n",
"! Не забываем что pymorphy2 работает только с русским языком. \n",
"\n",
"Если в словосочетаниях будут фразы на другом языке, он их пропустит.\n",
"\n",
"\"\"\"\n",
"\n",
"def morph_word_lemma(key):\n",
" \n",
" meaningfullPoSes=['NPRO', 'PREP', 'CONJ', 'PRCL', 'INTJ'] # фильтруем граммемы https://pymorphy2.readthedocs.io/en/latest/user/grammemes.html\n",
" reswords=[]\n",
" \n",
" for word in re.findall(\"([А-ЯЁа-яё0-9]+(-[А-ЯЁа-яё0-9]+)*)\", key): # фразу бьем на слова\n",
" word = word[0]\n",
" word_normal_form = morph.parse(word)[0].normal_form\n",
" form = morph.parse(word)[0].tag\n",
" \n",
" if form.POS in meaningfullPoSes:\n",
" continue\n",
" else:\n",
" reswords.append(word_normal_form)\n",
" \n",
" return reswords"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# result = morph_word_lemma('В этом texts слова на разных языках')\n",
"# print(result)\n",
"\n",
"# ['слово', 'разный', 'язык']"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"\n",
"Составляем словарь вида \"Лемма: [количество упоминаний леммы]\"\n",
"\n",
"\"\"\"\n",
"\n",
"def counter_dict_list(list_values):\n",
" \n",
" list_values_all=[]\n",
" \n",
" for item in list_values:\n",
" list_values_word_lemma = morph_word_lemma(item)\n",
" \n",
" for item in list_values_word_lemma:\n",
" list_values_all.append(item)\n",
" dict_values_word_lemma = dict(Counter(list_values_all))\n",
" \n",
" sorted_dict_values_word_lemma = list(dict_values_word_lemma.items())\n",
" sorted_dict_values_word_lemma.sort(key=lambda i: i[1], reverse=True)\n",
" sorted_dict_values_word_lemma = dict(sorted_dict_values_word_lemma)\n",
"\n",
" return (sorted_dict_values_word_lemma)\n"
]
},
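{
"cell_type": "markdown",
"metadata": {},
"source": [
"A hypothetical check of counter_dict_list: the two phrases are made-up examples, and the expected output in the comment assumes pymorphy2 lemmatizes them as shown."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# result = counter_dict_list(['купить велосипед', 'велосипед детский'])\n",
"# print(result)\n",
"\n",
"# {'велосипед': 2, 'купить': 1, 'детский': 1}"
]
},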
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# чистим файл и записываем строку заголовка\n",
"f = open('api.txt', 'w')\n",
"f.write(\"key\"+'\\t' + \"base_urls\"+ '\\t' + 'symbols_median' + '\\t' + '\\n')\n",
"f.close()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"#Получаем данные по api megaindex и парсим полученный текст\n",
"\n",
"def megaindex_text_score(key):\n",
" \n",
" keyword_list = []\n",
" uniq_keyword_list = []\n",
" \n",
" try:\n",
" url = 'http://api.megaindex.com/visrep/text_score?key={}&words={}&ser_id={}'.format(token, key, ser_id)\n",
" r = requests.get(url)\n",
" json_string = r.text\n",
" parsed_string = json.loads(json_string)['data'] \n",
" list_base_urls = parsed_string['serps'][0]['base_urls'] \n",
" symbols_median = parsed_string['old_api']['fragments']['long']['symbols_median']\n",
" except Exception as ex_megaindex:\n",
" print(\"API megaindex request error: {error}\".format(error=ex_megaindex))\n",
" list_base_urls = []\n",
" symbols_median = 'Данные не получены'\n",
" \n",
" for url in list_base_urls:\n",
" url = url.replace('http:', 'https:')\n",
" data = serpstat_keywords(url)\n",
" \n",
" try:\n",
" for keyword in data['result']['hits']:\n",
" keyword_list.append(keyword['keyword'])\n",
" except:\n",
" pass\n",
" \n",
" for item in set(keyword_list):\n",
" uniq_keyword_list.append(item)\n",
" \n",
" count_lemma = counter_dict_list(uniq_keyword_list)\n",
" \n",
" return (list_base_urls, symbols_median, count_lemma)"
]
},
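{
"cell_type": "markdown",
"metadata": {},
"source": [
"A hypothetical single-keyword run of megaindex_text_score, handy for checking the token and ser_id before starting the full loop."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# base_urls, median, lemmas = megaindex_text_score(keywords_list[0])\n",
"# print(median)  # median text length in characters across the top results\n",
"# print(base_urls[:3])  # first three competitor URLs\n",
"# print(list(lemmas.items())[:10])  # the ten most frequent lemmas"
]
},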
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Всего будет сгенерировано ТЗ: 7\n",
"морфологический анализ ИМЯ СУЩЕСТВИТЕЛЬНОЕ\n",
"морфологический анализ ГЛАГОЛ\n",
"морфологический анализ ДЕЕПРИЧАСТИЕ\n",
"морфологический анализ ИМЯ ПРИЛАГАТЕЛЬНОЕ\n",
"морфологический анализ МЕСТОИМЕНИЕ-СУЩЕСТВИТЕЛЬНОЕ\n",
"морфологический анализ НАРЕЧИЕ\n",
"морфологический анализ ПРИЧАСТИЕ\n",
"end\n"
]
}
],
"source": [
"print ('Всего будет сгенерировано ТЗ: ', len(keywords_list))\n",
"\n",
"for keywords in keywords_list:\n",
" print(keywords)\n",
" \n",
" try:\n",
" list_base_urls, symbols_median, count_lemma = megaindex_text_score(keywords)\n",
" except Exception as ex:\n",
" pass\n",
" print(f'Errow: {ex}')\n",
" \n",
" \n",
" with open('api.txt', 'a') as f:\n",
" f.write('{}\\t{}\\t{}\\t\\n\\n'.format(keywords, list_base_urls, symbols_median)) \n",
" f.write('Лемма' +'\\t' + 'Количество повторений' + '\\n')\n",
"\n",
" for key, value in count_lemma.items():\n",
" f.write('{}\\t{}\\n'.format(key, value))\n",
" f.write('\\n'+'\\n'+'\\n')\n",
"\n",
"print ('end')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}