Skip to content

Instantly share code, notes, and snippets.

@nickjevershed
Created October 10, 2013 05:12
Show Gist options
  • Save nickjevershed/6913355 to your computer and use it in GitHub Desktop.
Save nickjevershed/6913355 to your computer and use it in GitHub Desktop.
Example of expenses PDF scraping
#!/usr/bin/env python
import scraperwiki
import urllib2
import lxml.etree
# Every expenses PDF lives under the same directory; only the file stem
# (SURNAME_Given) differs, so the stems are listed once and the full URLs are
# built from them.
PDF_BASE_URL = "http://www.finance.gov.au/publications/parliamentarians-reporting/docs/T30/"

# File stems of the PDFs to scrape, in the order they are processed.
PDF_NAMES = (
    "TROOD_Russell",
    "TRUSS_Warren",
    "VAN_MANEN_Bert",
    "VASTA_Ross",
    "WATERS_Larissa",
    "ADAMS_Judith",
    "BACK_Chris",
    "BISHOP_Julie",
    "BISHOP_Mark",
    "CASH_Michaelia",
    "CORMANN_Mathias",
    "CROOK_Tony",
    "EGGLESTON_Alan",
    "EVANS_Chris",
    "GRAY_Gary",
    "HAASE_Barry",
    "IRONS_Steve",
    "JENSEN_Dennis",
    "JOHNSTON_David",
    "KEENAN_Michael",
    "LUDLAM_Scott",
    "MARINO_Nola",
    "MOYLAN_Judi",
    "PARKE_Melissa",
    "PRATT_Louise",
    "RANDALL_Don",
    "SIEWERT_Rachel",
    "SIMPKINS_Luke",
    "SMITH_Dean",
    "SMITH_Stephen",
    "STERLE_Glenn",
    "WASHER_Mal",
    "WYATT_Ken",
    "BERNARDI_Cory",
    "BIRMINGHAM_Simon",
    "BRIGGS_Jamie",
    "BUTLER_Mark",
    "CHAMPION_Nick",
    "EDWARDS_Sean",
    "ELLIS_Kate",
    "FARRELL_Don",
    "FAWCETT_David",
    "FISHER_Mary_Jo",
    "GALLACHER_Alex",
    "GEORGANAS_Steve",
    "HANSON-YOUNG_Sarah",
    "HURLEY_Annette",
    "MCEWEN_Anne",
    "MINCHIN_Nick",
    "PYNE_Christopher",
    "RAMSEY_Rowan",
    "RISHWORTH_Amanda",
    "SECKER_Patrick",
    "SOUTHCOTT_Andrew",
    "WONG_Penny",
    "WORTLEY_Dana",
    "WRIGHT_Penny",
    "XENOPHON_Nick",
    "ZAPPIA_Tony",
    "ABETZ_Eric",
    "ADAMS_Dick",
    "BARNETT_Guy",
    "BILYK_Catryna",
    "BROWN_Bob2",
    "BROWN_Carol",
    "BUSHBY_David",
    "COLBECK_Richard",
    "COLLINS_Julie",
    "LYONS_Geoff",
    "MILNE_Christine",
    "O'BRIEN_Kerry",
    "PARRY_Stephen",
    "POLLEY_Helen",
    "SHERRY_Nick",
    "SIDEBOTTOM_Sid",
    "SINGH_Lisa",
    "THORP_Lin",
    "URQUHART_Anne",
    "WHISH-WILSON_Peter",
    "WILKIE_Andrew",
    "CROSSIN_Trish",
    "GRIGGS_Natasha",
    "SCULLION_Nigel",
    "SNOWDON_Warren",
    "BRODTMANN_Gai",
    "HUMPHRIES_Gary",
    "LEIGH_Andrew",
    "LUNDY_Kate",
)

# Full download URLs, one per entry in PDF_NAMES (same order).
urls = [PDF_BASE_URL + _pdf_name + ".pdf" for _pdf_name in PDF_NAMES]
# Rendered text of the element that marks a page as a travelling-allowance page.
TRAVEL_HEADING = "<b>Parliamentary Travelling Allowance</b>"

# A "text" element begins a table row when its ``left`` attribute equals
# ROW_LEFT and its ``top`` attribute is greater than ROW_MIN_TOP.
ROW_LEFT = 46
ROW_MIN_TOP = 229


# this function has to work recursively because we might have "<b>Part1 <i>part 2</i></b>"
def gettext_with_bi_tags(el):
    """Return the text of ``el`` with its child tags written out inline.

    Each child element is rendered as ``<tag>`` + its own recursively
    rendered content + ``</tag>``, so markup such as
    ``<b>Part1 <i>part 2</i></b>`` survives in the returned string.

    NOTE: ``el.tail`` is appended at the end of every call, including the
    recursive ones, so a child's tail text is emitted *before* its closing
    tag. This is kept deliberately so that the strings compared against
    TRAVEL_HEADING and the cell values written to the datastore do not change.
    """
    res = []
    if el.text:
        res.append(el.text)
    for lel in el:
        res.append("<%s>" % lel.tag)
        res.append(gettext_with_bi_tags(lel))
        res.append("</%s>" % lel.tag)
    if el.tail:
        res.append(el.tail)
    return "".join(res)


def _find_travel_pages(pages):
    """Return indexes of pages whose first 10 elements contain TRAVEL_HEADING.

    Each page index appears at most once, even if the heading matches more
    than one of the inspected elements.
    """
    return [
        pageno
        for pageno, page in enumerate(pages)
        if any(gettext_with_bi_tags(el).strip() == TRAVEL_HEADING
               for el in list(page)[:10])
    ]


def _row_starts(elements):
    """Return the indexes in ``elements`` of the elements that begin a row."""
    return [
        idx
        for idx, el in enumerate(elements)
        if el.tag == "text"
        and int(el.attrib['left']) == ROW_LEFT
        and int(el.attrib['top']) > ROW_MIN_TOP
    ]


def _row_bounds(starts, total):
    """Return ``(start, end)`` element-index pairs, one per table row.

    A row runs from its own start index up to (not including) the start of
    the next row. The last row has no successor, so it is given the same
    number of elements as the row before it, clamped to ``total`` (the number
    of elements on the page) so the slice can never run off the page. When
    there is only one row its width is unknown and the pair is empty
    (``start == end``).

    Returns an empty list when ``starts`` is empty.
    """
    bounds = list(zip(starts, starts[1:]))
    if starts:
        last = starts[-1]
        width = last - starts[-2] if len(starts) > 1 else 0
        bounds.append((last, min(last + width, total)))
    return bounds


def _save_row(cells, end, pageno, url, politician):
    """Print and store one table row.

    ``cells`` are saved under the column names "0", "1", ... together with
    the politician's name, the source URL and a unique key. The key is built
    from ``end`` (the index one past the row's last element), which is
    different for every row on a page; this prevents the last row of a page
    from overwriting the row before it. Rows without cells are reported and
    not saved.
    """
    print(cells)
    data = {}
    if cells:
        data['politician'] = politician
        for col, cell in enumerate(cells):
            data[str(col)] = cell
        data['key'] = str(end) + url + str(pageno) + politician
        data['url'] = url
    print(data)
    if not data:
        print("it's empty")
    else:
        scraperwiki.sqlite.save(unique_keys=["key"], data=data)


def _scrape_pdf(url):
    """Download one expenses PDF and save every travel-allowance row in it."""
    response = urllib2.urlopen(url)
    try:
        pdfdata = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()

    xmldata = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(xmldata)
    pages = list(root)

    # Get politician's name from the file name, e.g. "SMITH_Dean.pdf" -> "SMITH Dean"
    politician = url.split('/')[-1].split('.pdf')[0].replace('_', ' ')
    print(politician)

    # check through the PDF to find the travel expense pages
    travel_pages = _find_travel_pages(pages)
    if not travel_pages:
        print("No expenses available")
        return
    print("Travel expense pages: " + str(travel_pages))

    # Get the rows on each travel expense page
    for pageno in travel_pages:
        # Materialise the page once instead of once per cell.
        elements = list(pages[pageno])
        starts = _row_starts(elements)
        print(starts)
        for start, end in _row_bounds(starts, len(elements)):
            cells = [gettext_with_bi_tags(el) for el in elements[start:end]]
            _save_row(cells, end, pageno, url, politician)


for url in urls:
    _scrape_pdf(url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment