Last active
August 29, 2015 14:12
-
-
Save osya/c80b22e6f89f9d1ff856 to your computer and use it in GitHub Desktop.
Парсер избиркома на Python urllib, urllib2 & lxml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
import json
import re
import time
import urllib
import urllib2
from contextlib import closing

import lxml.html as html
from django.core.management.base import AppCommand

from commission.izbirkom_parser import parser
from commission.models import Comission
class parser(object):
    """Crawler that copies election commissions from the izbirkom site into ``Comission`` rows.

    Instantiating the class runs the whole crawl: the root commission page is
    parsed first, then every id listed at the top level of the tree (saved with
    type ``'tik'``), then every child id of each of those (saved with type
    ``'uik'``).
    """

    site = 'http://www.moscow_city.vybory.izbirkom.ru/'
    tree_url = 'moscow_city/ik_tree/?first=1&id2=%s'
    page_url = 'moscow_city/ik/%s'
    # Id of the commission page the crawl starts from (saved with the default type 'gik').
    root_id = '2772000301927'
    # Compiled once; used to keep only the digits of the phone paragraph.
    _digits = re.compile(r'\d+')

    def __init__(self):
        """Run the full crawl (kept in the constructor so ``parser()`` works as before)."""
        self._crawl()

    def get_page(self, page_url):
        """Download ``page_url`` (relative to ``site``) and return it decoded from cp1251."""
        # `site` already ends with '/', so trim the seam to avoid '//' in the path.
        url = '%s/%s' % (self.site.rstrip('/'), page_url.lstrip('/'))
        # closing() guarantees the connection is released even if read() raises.
        with closing(urllib2.urlopen(url)) as response:
            return response.read().decode('cp1251')

    def read_tree(self, id=''):
        """Return commission ids from the JSON tree served at ``tree_url``.

        With the default ``id=''`` the ids of the root node's children are
        returned. Otherwise the ids of the children of the root-level element
        whose ``'id'`` equals ``id`` are returned.

        An empty tree, or a node without a ``'children'`` entry, yields an
        empty list instead of raising.
        """
        tree = json.loads(self.get_page(self.tree_url % id))
        if not tree:
            return []
        elements = tree[0].get('children') or []
        if id == '':
            return [element['id'] for element in elements]
        ids = []
        for element in elements:
            if element['id'] == id:
                ids.extend(child['id'] for child in element.get('children') or [])
        return ids

    def parse_page(self, text, pid=False, type='gik'):
        """Parse one commission page, geocode its address, save and return the ``Comission``.

        :param text: HTML of the commission page.
        :param pid: parent ``Comission``; when falsy no parent is assigned.
        :param type: value stored in the ``type`` field of the new record.
        :raises IndexError: if an expected element is missing from the page.
        """
        page = html.document_fromstring(text)
        name = page.cssselect('div.center-colm h2')[0].text_content()
        address = page.cssselect('div.center-colm p b span')[0].attrib['rel']
        # The third paragraph holds the phone; only its digits are kept.
        phone_text = page.cssselect('div.center-colm p')[2].text_content()
        phone = ''.join(self._digits.findall(phone_text))
        info = html.tostring(page.cssselect('div.table')[0])
        geo = self._geocode(address)

        # NOTE(review): no lookup for an existing record is done here, so a
        # repeated crawl presumably stores the same commission again - confirm.
        comission = Comission(type=type, name=name, address=address,
                              phone=phone, info=info, geo=geo)
        if pid:
            comission.parent_comission = pid
        comission.save()
        print(name + "- OK")
        # Pause between pages so the remote services are not hit back-to-back.
        time.sleep(1)
        return comission

    def _geocode(self, address):
        """Return the text of the first ``pos`` element of the Yandex geocoder answer."""
        query = urllib.urlencode({'geocode': address.encode('utf-8'), 'type': 'json'})
        with closing(urllib2.urlopen('http://geocode-maps.yandex.ru/1.x/?' + query)) as response:
            answer = response.read()
        # NOTE(review): the answer is parsed as markup and a <pos> element is
        # read, so the 'type': 'json' parameter apparently has no effect - confirm.
        return html.document_fromstring(answer).cssselect('pos')[0].text_content()

    def _crawl(self):
        """Walk root page -> top-level tree ids -> their child ids, saving each page."""
        root = self.parse_page(self.get_page(self.page_url % self.root_id))
        for tik_id in self.read_tree():
            tik = self.parse_page(self.get_page(self.page_url % tik_id), root, 'tik')
            for uik_id in self.read_tree(tik_id):
                self.parse_page(self.get_page(self.page_url % uik_id), tik, 'uik')
class Command(AppCommand):
    """Management command that starts the commission crawl."""

    help = u'Автодобавление ТИК/УИК'

    def handle(self, *args, **options):
        """Run the crawl; positional arguments and options are not used."""
        # Constructing `parser` is what performs the crawl; the instance itself is not needed.
        parser()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment