Skip to content

Instantly share code, notes, and snippets.

@e9t

e9t/crawler.py Secret

Created May 14, 2013 04:41
Show Gist options
  • Save e9t/551f9647f58800273025 to your computer and use it in GitHub Desktop.
Save e9t/551f9647f58800273025 to your computer and use it in GitHub Desktop.
#! /usr/bin/python2.7
# -*- coding: utf-8 -*-
import os
import re
import gevent
from gevent import monkey; monkey.patch_all()
import lxml
import utils
def extract(columns):
    """Flatten one table row's cells into a list of string fields.

    Cell index 1 yields three fields (the numeric id embedded in the
    img src, the anchor text, and the third word-token of the anchor
    href); cell index 6 yields a '1'/'0' flag for the presence of an
    onclick image; every other cell contributes its stripped text.
    """
    fields = []
    for idx, cell in enumerate(columns):
        if idx == 1:
            # Numeric id embedded in the image source path.
            fields.append(re.findall(r'[0-9]+', cell.xpath('img/@src')[0])[0])
            # Link text, then the third \w+ token of the link target.
            fields.append(cell.xpath('a/text()')[0])
            fields.append(re.findall(r'\w+', cell.xpath('a/@href')[0])[2])
        elif idx == 6:
            # '1' when the cell carries a clickable image, else '0'.
            fields.append('1' if cell.xpath('img/@onclick') else '0')
        else:
            fields.append(cell.xpath('text()')[0].strip())
    return fields
def get_data(i, f):
    """Fetch result-list page *i* and append its rows to *f* as quoted CSV.

    Relies on the module-level BASEURL, NUM_PAGES, LIST_PAGE and X
    settings, and on the external `utils` helper module.
    """
    query = 'PAGE=%d&PAGE_SIZE=%d' % (i * 10 + 1, NUM_PAGES)
    url = BASEURL['list'] + query
    print(url)
    # Download the page into the local cache file, then parse it back.
    utils.get_webpage(url, LIST_PAGE)
    page = utils.read_webpage(LIST_PAGE)
    for row in utils.get_elems(page, X['table']):
        cells = row.xpath(X['columns'])
        # Only fully populated rows (8 cells) are real data rows.
        if len(cells) == 8:
            f.write('"')
            f.write('","'.join(extract(cells)).encode('utf-8'))
            f.write('"\n')
    print('done')
if __name__ == '__main__':
    NUM_PAGES = 10
    # Pages of results to crawl. (A dead `MAX_PAGE = 65535` test value
    # that was immediately overwritten has been removed.)
    MAX_PAGE = 65
    LIST_PAGE = 'list.html'  # local cache file for the downloaded HTML
    LIST_DATA = 'list.csv'   # CSV output
    BASEURL = {
        'list': 'http://likms.assembly.go.kr/bill/jsp/BillSearchResult.jsp?AGE_FROM=19&AGE_TO=19&',
        'summary': 'http://likms.assembly.go.kr/bill/jsp/SummaryPopup.jsp?bill_id=',
        'specific': 'http://likms.assembly.go.kr/bill/jsp/BillDetail.jsp?bill_id='
    }
    # XPaths: the data table rows (skipping the header / separator rows)
    # and the cells within each row.
    X = {
        'table': '//table[@width="970"]//table[@width="100%"]//table[@width="100%"]//tr[not(@bgcolor="#DBDBDB")][position()>1]',
        'columns': 'descendant::td',
    }
    # Fix: mode was 'wa', which is not a valid mode string (Python 3
    # rejects it; Python 2 passed it to fopen, where glibc happened to
    # treat it as 'w'). 'w' truncates any previous run's output.
    with open(LIST_DATA, 'w') as f:
        # Fix: the original ran a serial `for i: get_data(i, f)` loop AND
        # then spawned the same calls as gevent jobs, crawling every page
        # twice and duplicating every CSV row. Keep only the concurrent
        # version (gevent + monkey.patch_all is imported for this).
        # NOTE(review): greenlets share one file handle, so rows from
        # different pages may interleave; each row's three writes are
        # small enough that this matched the original's serial pass only
        # by luck — verify ordering does not matter downstream.
        jobs = [gevent.spawn(get_data, i, f)
                for i in range(MAX_PAGE // NUM_PAGES + 1)]
        gevent.joinall(jobs)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment