Skip to content

Instantly share code, notes, and snippets.

@osya
Last active August 29, 2015 14:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save osya/c80b22e6f89f9d1ff856 to your computer and use it in GitHub Desktop.
Save osya/c80b22e6f89f9d1ff856 to your computer and use it in GitHub Desktop.
Парсер избиркома на Python urllib, urllib2 & lxml
# -*- coding: utf-8 -*-
from django.core.management.base import AppCommand
from commission.izbirkom_parser import parser
from commission.models import Comission
import lxml.html as html
import urllib2, urllib
import json
import re
import time
class parser(object):
    """Scraper that stores election commissions from the izbirkom site.

    Instantiating the class runs the whole crawl (see ``__init__``): the
    root commission page is parsed first, then every top-level node of the
    commission tree, then the children of each of those nodes. Each parsed
    page is saved as a ``Comission`` record.
    """

    # Base address of the site; the request paths below are relative to it.
    site = 'http://www.moscow_city.vybory.izbirkom.ru/'
    # Path of the JSON commission tree; ``%s`` is a node id ('' for the top).
    tree_url = 'moscow_city/ik_tree/?first=1&id2=%s'
    # Path of a single commission page; ``%s`` is the commission id.
    page_url = 'moscow_city/ik/%s'
    # Id of the commission parsed first; it becomes the parent of 'tik' records.
    root_id = '2772000301927'
    # Compiled once for the class instead of on every parse_page() call.
    _digits = re.compile(r'\d+')

    def _absolute_url(self, path):
        """Join ``site`` and *path* with exactly one slash between them."""
        return '%s/%s' % (self.site.rstrip('/'), path.lstrip('/'))

    @staticmethod
    def _download(url):
        """Return the raw body at *url*, closing the response even on error."""
        response = urllib2.urlopen(url)
        try:
            return response.read()
        finally:
            response.close()

    def _geocode(self, address):
        """Return the text of the first ``pos`` element Yandex reports for *address*.

        Raises IndexError when the geocoder response has no ``pos`` element.
        """
        query = urllib.urlencode({'geocode': address.encode('utf-8'), 'type': 'json'})
        body = self._download('http://geocode-maps.yandex.ru/1.x/?' + query)
        return html.document_fromstring(body).cssselect('pos')[0].text_content()

    def get_page(self, page_url):
        """Fetch *page_url* (a path relative to ``site``) and decode it from cp1251."""
        return self._download(self._absolute_url(page_url)).decode('cp1251')

    def read_tree(self, id=''):
        """Return ids read from the commission tree.

        With the default ``id=''`` the ids of all top-level nodes are
        returned. Otherwise the tree is requested for that node and the ids
        of the children of the node whose id equals *id* are returned (an
        empty list if no node matches).
        """
        json_tree = json.loads(self.get_page(self.tree_url % id))
        elements = json_tree[0]['children']
        if id == '':
            return [element['id'] for element in elements]
        ids = []
        for element in elements:
            if element['id'] == id:
                ids.extend(uik['id'] for uik in element['children'])
        return ids

    def parse_page(self, text, pid=False, type='gik'):
        """Parse one commission page, geocode its address and save the record.

        text -- HTML of the commission page.
        pid  -- parent ``Comission``; when truthy it is assigned to
                ``parent_comission`` before saving.
        type -- value stored in the record's ``type`` field.

        Returns the saved ``Comission`` instance. Raises IndexError when an
        expected element is missing from the page or the geocoder response.
        """
        page = html.document_fromstring(text)
        name = page.cssselect('div.center-colm h2')[0].text_content()
        # NOTE: an earlier draft looked the commission up by name here and
        # returned the stored record; no such de-duplication is done now.
        address = page.cssselect('div.center-colm p b span')[0].attrib['rel']
        # The phone is kept as the bare digits of the third paragraph.
        phone_text = page.cssselect('div.center-colm p')[2].text_content()
        phone = ''.join(self._digits.findall(phone_text))
        info = html.tostring(page.cssselect('div.table')[0])
        geo = self._geocode(address)
        comission = Comission(type=type, name=name, address=address,
                              phone=phone, info=info, geo=geo)
        if pid:
            comission.parent_comission = pid
        comission.save()
        print(name + "- OK")
        # Pause after each record -- presumably to throttle requests to the
        # remote services; confirm before removing.
        time.sleep(1)
        return comission

    def __init__(self):
        """Run the full crawl: root page, then 'tik' nodes, then their 'uik' children."""
        pid = self.parse_page(self.get_page(self.page_url % self.root_id))
        for tik_id in self.read_tree():
            tik = self.parse_page(self.get_page(self.page_url % tik_id), pid, 'tik')
            for uik_id in self.read_tree(tik_id):
                self.parse_page(self.get_page(self.page_url % uik_id), tik, 'uik')
class Command(AppCommand):
    """Management command that runs the commission import."""

    help = u'Автодобавление ТИК/УИК'

    def handle(self, *args, **options):
        """Run the import; positional arguments and options are not used."""
        # Constructing ``parser`` is what performs the work; the instance
        # itself is discarded.
        parser()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment