Skip to content

Instantly share code, notes, and snippets.

@ischurov
Created September 30, 2016 21:58
Show Gist options
  • Save ischurov/8b5a231255bd0fff08b860fabcdf69f0 to your computer and use it in GitHub Desktop.
Save ischurov/8b5a231255bd0fff08b860fabcdf69f0 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import re\n",
"from time import sleep\n",
"\n",
"from bs4 import BeautifulSoup\n",
"from selenium import webdriver\n",
"from selenium.common.exceptions import NoSuchElementException\n",
"from selenium.webdriver.common.by import By"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"browser = webdriver.Chrome()\n",
"browser.implicitly_wait(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Подготовка\n",
"Здесь собирается список районов и список улиц и строится словарь, позволяющий искать район по названию улицы "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"browser.get(\"http://www.cikrf.ru/services/lk_address/?do=address\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# find_element(By..., ...) is used because the find_element_by_* helpers were removed in Selenium 4.3;\n",
"# this form is also available in older Selenium releases.\n",
"lnk = browser.find_element(By.PARTIAL_LINK_TEXT, \"Севастополь\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"lnk.click()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Name the parser explicitly: without it BeautifulSoup picks whatever parser is installed,\n",
"# emits a UserWarning, and may parse the page differently on another machine.\n",
"bs = BeautifulSoup(browser.page_source, 'lxml')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"sevastopol = bs.find(text=re.compile(\".*Севастополь.*\"))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"rayons = [a.text for a in sevastopol.parent.parent.find_all(\"a\")[1:]]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['Балаклавский район',\n",
" 'Гагаринский район',\n",
" 'Ленинский район',\n",
" 'Нахимовский район']"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"rayons"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Click every district link; the page is parsed afterwards to read the streets listed under each one.\n",
"for rayon in rayons:\n",
"    browser.find_element(By.PARTIAL_LINK_TEXT, rayon).click()\n",
"# Single pause after all clicks, presumably to let the expanded lists finish loading.\n",
"sleep(2)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"bs = BeautifulSoup(browser.page_source, 'lxml')"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Map every district to the link texts found next to it in the parsed page.\n",
"rayon_to_street = {}\n",
"for rayon in rayons:\n",
"    rayon_node = bs.find(text=rayon)\n",
"    # All links inside the grandparent of the district's text node, except the first one\n",
"    # (presumably the district's own link), are taken as street names.\n",
"    rayon_to_street[rayon] = [a.text for a in rayon_node.parent.parent.find_all(\"a\")[1:]]\n",
"\n",
"# Reverse lookup: street name -> district.\n",
"# NOTE: a street name that occurs in several districts keeps only the last district seen.\n",
"street_to_rayon = {}\n",
"for rayon, streets in rayon_to_street.items():\n",
"    for street in streets:\n",
"        street_to_rayon[street] = rayon"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Главная функция"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def traverse(browser, path, delay=0.5):\n",
"    \"\"\"\n",
"    Walk the address tree on the page by clicking the links named in ``path`` one after another.\n",
"\n",
"    Example: with ``path = ['Город Севастополь', 'Балаклавский район']`` the link\n",
"    «Город Севастополь» is clicked first; once it has expanded, «Балаклавский район»\n",
"    is looked up inside that city and clicked as well.\n",
"    The path may be as detailed as needed, down to a house number; in that case the last\n",
"    click makes the page show the information about the precinct election commission (UIK)\n",
"    serving that house.\n",
"\n",
"    Parameters:\n",
"        browser: Selenium WebDriver showing the address page.\n",
"        path: sequence of exact link texts, outermost first; an empty path does nothing.\n",
"        delay: seconds to sleep after every click (default 0.5, the previously hard-coded value).\n",
"\n",
"    Raises whatever Selenium raises when a link from ``path`` cannot be found.\n",
"    \"\"\"\n",
"    for depth in range(1, len(path) + 1):\n",
"        # The whole prefix is resolved again from the top of the page for every step,\n",
"        # presumably because the page changes after a click; each link is searched only\n",
"        # inside the grandparent element of the previous link.\n",
"        scope = browser\n",
"        for link_text in path[:depth]:\n",
"            lnk = scope.find_element(By.LINK_TEXT, link_text)\n",
"            scope = lnk.find_element(By.XPATH, \"../..\")\n",
"        # Only the last link of the current prefix is clicked.\n",
"        lnk.click()\n",
"        sleep(delay)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Тут начинается самое содержательное"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"addresses = [\"Аграрная, 6\", \"Балашова, 2\", \"Полевая, 2\", \"Гончарная, 4\"]\n",
"# Адреса должны быть в формате \"Улица, дом\""
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"uik_data = {}\n",
"for address in addresses:\n",
" m = re.search(r\"(.+),\\s*(\\d+)\", address)\n",
" if m is None:\n",
" print(\"Incorrect address string\", address)\n",
" continue\n",
" street, house = m.groups()\n",
" rayon = street_to_rayon.get(street)\n",
" if rayon is None:\n",
" print(\"Cannot find rayon for street\", street)\n",
" continue\n",
" browser.get(\"http://www.cikrf.ru/services/lk_address/?do=address\")\n",
" traverse(browser, [\"Город Севастополь\", rayon, street, house])\n",
" m = re.search(r\"Участковая избирательная комиссия №(\\d+)\", browser.page_source)\n",
" if m is None:\n",
" print(\"Cannot find UIK number for address\", address)\n",
" uik = \"\"\n",
" else:\n",
" uik = m.group(1)\n",
" m = re.search(r\"Адрес помещения для голосования: ([^<]+)\", browser.page_source)\n",
" if m is None:\n",
" print(\"Cannot find UIK address for address\", address)\n",
" addr = \"\"\n",
" else:\n",
" addr = m.group(1)\n",
" uik_data[address] = (uik, addr)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'Аграрная, 6': ('22',\n",
" 'Город Севастополь, Балаклавский район, Разъездная, 1Б'),\n",
" 'Балашова, 2': ('10',\n",
" 'Город Севастополь, Балаклавский район, Благодатная, 16'),\n",
" 'Гончарная, 4': ('8', 'Город Севастополь, Балаклавский район, Коммунаров, 2'),\n",
" 'Полевая, 2': ('14', 'Город Севастополь, Балаклавский район, Тимирязева, 23')}"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"uik_data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.0"
},
"toc": {
"toc_cell": false,
"toc_number_sections": true,
"toc_threshold": 6,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment