Skip to content

Instantly share code, notes, and snippets.

Created August 28, 2010 20:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/555510 to your computer and use it in GitHub Desktop.
Save anonymous/555510 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
from BeautifulSoup import BeautifulSoup
import csv, os, sys, urllib2
from bsoupxpath import Path
import chardet
import html5lib
from html5lib import treebuilders
# Root directory of the data set, plus its "raw" and "csv" sub-directories.
# All three values end with a slash.
DATAPATH = '/vol/grants/data/'
RAWDATAPATH = '%sraw/' % DATAPATH
CSVDATAPATH = '%scsv/' % DATAPATH
def extract_simple_html_table(url, xpath, csvfilename, encoding=None, skip_top=1):
    """Download an HTML page and write one of its tables to a CSV file.

    The page at ``url`` is parsed with html5lib (BeautifulSoup tree builder),
    ``xpath`` is applied to the resulting tree and the first match is used as
    the table.  Every ``tr`` after the first ``skip_top`` ones becomes one CSV
    row made of the text of its ``td`` cells (newlines replaced by spaces,
    surrounding whitespace stripped, encoded as UTF-8).  Rows whose cells are
    all empty are not written.

    Arguments:
    url         -- address of the page to fetch.
    xpath       -- expression handed to ``bsoupxpath.Path``; only the first
                   match is used.
    csvfilename -- output file; it is created/truncated before the page is
                   fetched, so it exists (empty) even when nothing matches.
    encoding    -- passed to the html5lib parser when truthy; otherwise the
                   parser is called without an explicit encoding.
    skip_top    -- number of leading table rows to skip.

    Returns None.  Prints the total number of ``tr`` rows found in the table.
    """
    # The Python 2 csv module expects a file opened in binary mode; the
    # ``with`` block guarantees the file is closed on the early return and
    # on any exception raised while fetching or parsing.
    with open(csvfilename, 'wb') as bc:
        bcw = csv.writer(bc, dialect=csv.excel)
        tp = Path(xpath)

        response = urllib2.urlopen(url)
        try:
            data = response.read()
        finally:
            # Release the connection as soon as the body has been read.
            response.close()

        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
        if encoding:
            soup = parser.parse(data, encoding=encoding)
        else:
            soup = parser.parse(data)

        objs = tp.apply(soup)
        if not objs:
            # Nothing matched the xpath: leave the (empty) CSV file behind.
            return

        table = objs[0]
        rows = table.findAll('tr')
        print(len(rows))

        for row in rows[skip_top:]:
            rdata = []
            for col in row.findAll('td'):
                # Collect every unicode node below the cell, at any depth.
                pieces = [e for e in col.recursiveChildGenerator() if isinstance(e, unicode)]
                text = u''.join(pieces).replace(u'\n', u' ').strip()
                rdata.append(text.encode('utf8'))
            # Skip rows that have no cells or only empty cells.
            if not any(rdata):
                continue
            bcw.writerow(rdata)
def run_all():
    """Run extract_simple_html_table once for each hard-coded source page.

    The jobs are processed in the order listed; an exception raised by one
    job propagates and prevents the remaining ones from running.
    """
    # Each entry: (url, xpath, csvfilename, encoding, skip_top)
    jobs = (
        ('http://www.gosclub.ru/images/content/tablk.html',
         '//table', "data/csv/gosclub_2009.csv", 'KOI8-R', 6),
        ('http://nbfond.ru/konkurs-grantov/granty-2009/spisok-pobeditelej/',
         "//table[@border='1']", 'data/csv/nbfond_2009.csv', "utf8", 1),
        ('http://nbfond.ru/konkurs-grantov/granty-2008/spisok-pobeditelej/',
         "//div[@id='content']/table", 'data/csv/nbfond_2008.csv', "utf8", 1),
        ('http://nbfond.ru/konkurs-grantov/granty-2007/spisok-pobeditelej/',
         "//div[@id='content']/table", 'data/csv/nbfond_2007.csv', "utf8", 1),
        ('http://www.soprotivlenie.org/?id=124',
         "//div[@class='kr-plan']/table", 'data/csv/soprotivlenie_2009.csv', 'windows-1251', 1),
        ('http://www.inpgo.ru/495/496/579/',
         "//table[@class='border']", "data/csv/inpgo_2009.csv", 'windows-1251', 1),
        ('http://www.inop.ru/page143/page298/page84/page576/',
         "//table[@width='823']", 'data/csv/inop_2009.csv', 'windows-1251', 1),
        ('http://www.inop.ru/page143/page530/page382/',
         "//table[@width='648']", 'data/csv/inop_2008.csv', 'windows-1251', 1),
        ('http://www.inop.ru/page143/page531/page505/',
         "//table[@align='center']", 'data/csv/inop_2007.csv', 'windows-1251', 1),
        ('http://znaniesvet.com/index.php?option=com_content&task=view&id=221',
         "//table[@class='MsoNormalTable']", 'data/csv/znanie_2008.csv', 'windows-1251', 1),
        ('http://znaniesvet.com/content/view/135/80/',
         "//table[@width='957']", 'data/csv/znanie_2007.csv', 'windows-1251', 1),
    )
    for page_url, table_xpath, out_name, page_encoding, header_rows in jobs:
        extract_simple_html_table(
            page_url,
            xpath=table_xpath,
            csvfilename=out_name,
            encoding=page_encoding,
            skip_top=header_rows,
        )
# Run every extraction only when the file is executed as a script;
# importing it as a module triggers no downloads.
if __name__ == "__main__":
    run_all()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment