Created
October 10, 2013 05:12
-
-
Save nickjevershed/6913355 to your computer and use it in GitHub Desktop.
Example of expenses PDF scraping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import scraperwiki | |
import urllib2 | |
import lxml.etree | |
# Every expenses report sits in the same directory on finance.gov.au; only the
# file stem differs from one PDF to the next, so the shared prefix is kept once.
BASE_URL = "http://www.finance.gov.au/publications/parliamentarians-reporting/docs/T30/"

# File stems of the PDFs to scrape, in the order they were originally listed.
# The politician's display name is later derived from the stem, so the
# spelling and underscores must match the remote file names exactly.
PDF_STEMS = [
    "TROOD_Russell",
    "TRUSS_Warren",
    "VAN_MANEN_Bert",
    "VASTA_Ross",
    "WATERS_Larissa",
    "ADAMS_Judith",
    "BACK_Chris",
    "BISHOP_Julie",
    "BISHOP_Mark",
    "CASH_Michaelia",
    "CORMANN_Mathias",
    "CROOK_Tony",
    "EGGLESTON_Alan",
    "EVANS_Chris",
    "GRAY_Gary",
    "HAASE_Barry",
    "IRONS_Steve",
    "JENSEN_Dennis",
    "JOHNSTON_David",
    "KEENAN_Michael",
    "LUDLAM_Scott",
    "MARINO_Nola",
    "MOYLAN_Judi",
    "PARKE_Melissa",
    "PRATT_Louise",
    "RANDALL_Don",
    "SIEWERT_Rachel",
    "SIMPKINS_Luke",
    "SMITH_Dean",
    "SMITH_Stephen",
    "STERLE_Glenn",
    "WASHER_Mal",
    "WYATT_Ken",
    "BERNARDI_Cory",
    "BIRMINGHAM_Simon",
    "BRIGGS_Jamie",
    "BUTLER_Mark",
    "CHAMPION_Nick",
    "EDWARDS_Sean",
    "ELLIS_Kate",
    "FARRELL_Don",
    "FAWCETT_David",
    "FISHER_Mary_Jo",
    "GALLACHER_Alex",
    "GEORGANAS_Steve",
    "HANSON-YOUNG_Sarah",
    "HURLEY_Annette",
    "MCEWEN_Anne",
    "MINCHIN_Nick",
    "PYNE_Christopher",
    "RAMSEY_Rowan",
    "RISHWORTH_Amanda",
    "SECKER_Patrick",
    "SOUTHCOTT_Andrew",
    "WONG_Penny",
    "WORTLEY_Dana",
    "WRIGHT_Penny",
    "XENOPHON_Nick",
    "ZAPPIA_Tony",
    "ABETZ_Eric",
    "ADAMS_Dick",
    "BARNETT_Guy",
    "BILYK_Catryna",
    "BROWN_Bob2",
    "BROWN_Carol",
    "BUSHBY_David",
    "COLBECK_Richard",
    "COLLINS_Julie",
    "LYONS_Geoff",
    "MILNE_Christine",
    "O'BRIEN_Kerry",
    "PARRY_Stephen",
    "POLLEY_Helen",
    "SHERRY_Nick",
    "SIDEBOTTOM_Sid",
    "SINGH_Lisa",
    "THORP_Lin",
    "URQUHART_Anne",
    "WHISH-WILSON_Peter",
    "WILKIE_Andrew",
    "CROSSIN_Trish",
    "GRIGGS_Natasha",
    "SCULLION_Nigel",
    "SNOWDON_Warren",
    "BRODTMANN_Gai",
    "HUMPHRIES_Gary",
    "LEIGH_Andrew",
    "LUNDY_Kate",
]

# Full download URLs, one per PDF, in the same order as PDF_STEMS.
urls = [BASE_URL + stem + ".pdf" for stem in PDF_STEMS]
# Text (as rendered by gettext_with_bi_tags) that marks a travel expense page.
TRAVEL_HEADING = "<b>Parliamentary Travelling Allowance</b>"
# Only this many leading elements of each page are checked for the heading.
HEADING_SCAN_LIMIT = 10
# A <text> element with exactly this "left" value and a "top" value greater
# than ROW_MIN_TOP is treated as the first cell of a table row.
ROW_LEFT = 46
ROW_MIN_TOP = 229


def gettext_with_bi_tags(el):
    """Return the content of *el* as text, keeping nested tags as markup.

    Works recursively because the content may be nested, e.g.
    "<b>Part1 <i>part 2</i></b>".  Text that follows a child element (the
    child's ``tail``) is emitted right after that child's closing tag.  The
    tail of *el* itself is not included: it lies outside the element.
    """
    res = []
    if el.text:
        res.append(el.text)
    for lel in el:
        res.append("<%s>" % lel.tag)
        res.append(gettext_with_bi_tags(lel))
        res.append("</%s>" % lel.tag)
        # The tail belongs to the child, not to the enclosing element.
        if lel.tail:
            res.append(lel.tail)
    return "".join(res)


def politician_from_url(url):
    """Derive the display name from the PDF file name ("A_B.pdf" -> "A B")."""
    return url.split('/')[-1].split('.pdf')[0].replace('_', ' ')


def find_travel_pages(pages):
    """Return the indexes of the pages that carry the travel heading.

    Only the first HEADING_SCAN_LIMIT elements of each page are inspected.
    A page is listed once even when the heading matches more than once.
    """
    found = []
    for pageno, page in enumerate(pages):
        leading = list(page)[:HEADING_SCAN_LIMIT]
        if any(gettext_with_bi_tags(el).strip() == TRAVEL_HEADING
               for el in leading):
            found.append(pageno)
    return found


def _row_starts(elements):
    """Return the indexes in *elements* at which a table row begins."""
    return [
        index for index, el in enumerate(elements)
        if el.tag == "text"
        and int(el.attrib['left']) == ROW_LEFT
        and int(el.attrib['top']) > ROW_MIN_TOP
    ]


def _row_spans(starts, total):
    """Turn row start indexes into (start, end) slices, end exclusive.

    Each row runs up to the start of the next one.  The last row has no
    successor, so it is given the same width as the row before it, clamped to
    *total* (the number of elements on the page).  With fewer than two rows
    no width can be inferred and no span is returned.
    """
    if len(starts) < 2:
        return []
    spans = list(zip(starts[:-1], starts[1:]))
    last_width = starts[-1] - starts[-2]
    spans.append((starts[-1], min(starts[-1] + last_width, total)))
    return spans


def page_records(elements, url, pageno, politician):
    """Build one record per table row found in *elements*.

    Each record holds the cell texts under the keys "0", "1", ... plus
    "politician", "url" and "key".  The key is built from the row's end index,
    which is unique per row on a page, so no two rows share a key.
    """
    records = []
    for start, end in _row_spans(_row_starts(elements), len(elements)):
        data = {
            'politician': politician,
            'key': str(end) + url + str(pageno) + politician,
            'url': url,
        }
        for column, el in enumerate(elements[start:end]):
            data[str(column)] = gettext_with_bi_tags(el)
        records.append(data)
    return records


def scrape_pdf(url):
    """Download one PDF, extract its travel expense rows and save them."""
    response = urllib2.urlopen(url)
    try:
        pdfdata = response.read()
    finally:
        # Release the connection even if the download fails part-way.
        response.close()

    xmldata = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(xmldata)
    pages = list(root)

    politician = politician_from_url(url)
    print(politician)

    # check through the PDF to find the travel expense pages
    travel_pages = find_travel_pages(pages)
    if not travel_pages:
        print("No expenses available")
        return
    print("Travel expense pages: " + str(travel_pages))

    for pageno in travel_pages:
        elements = list(pages[pageno])
        starts = _row_starts(elements)
        print(starts)
        if len(starts) == 1:
            # A lone row gives nothing to infer its width from.
            print("Single row on page, width unknown: row skipped")
        for data in page_records(elements, url, pageno, politician):
            print(data)
            scraperwiki.sqlite.save(unique_keys=["key"], data=data)


def main():
    """Scrape every PDF listed in the module-level ``urls``."""
    for url in urls:
        scrape_pdf(url)


# Run the scrape only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment