Skip to content

Instantly share code, notes, and snippets.

@nickjevershed
Created October 14, 2013 22:08
Show Gist options
  • Save nickjevershed/6983049 to your computer and use it in GitHub Desktop.
Save nickjevershed/6983049 to your computer and use it in GitHub Desktop.
Another scraper for older expense PDFs
#!/usr/bin/env python
import urllib
import urllib2

import lxml.etree
import scraperwiki
# Every report PDF lives in the same directory; only the file stem differs.
BASE_URL = "http://www.finance.gov.au/publications/parliamentarians-reporting/docs/T27/"

# File stems ("SURNAME_Given") of the PDFs to scrape, in their original order.
# Stems are kept exactly as they appear in the URLs, including percent-escapes
# such as %20.
PDF_NAMES = [
    "ABBOTT_Tony", "ALBANESE_Anthony", "ALEXANDER_John", "ARBIB_Mark",
    "BALDWIN_Bob", "BISHOP_Bronwyn", "BOWEN_Chris", "BRADBURY_David",
    "CAMERON_Doug", "CLARE_Jason", "COMBET_Greg", "COONAN_Helen",
    "ELLIOT_Justine", "FARMER_Pat", "FAULKNER_John", "FERGUSON_Laurie",
    "FIERRAVANTI-WELLS_Concetta", "FITZGIBBON_Joel", "FORSHAW_Michael", "GARRETT_Peter",
    "GEORGE_Jennie", "GRIERSON_Sharon", "HARTSUYKER_Luke", "HEFFERNAN_Bill",
    "HUTCHINS_Steve", "KELLY_Mike", "MARKUS_Louise", "MATHESON_Russell",
    "MCCLELLAND_Robert", "MCCORMACK_Michael", "MCKEW_Maxine", "MORRISON_Scott",
    "MURPHY_John", "NASH_Fiona", "NEAL_Belinda", "OAKESHOTT_Robert",
    "PAYNE_Marise", "PLIBERSEK_Tanya", "PRICE_Roger", "ROWLAND_Michelle",
    "RUDDOCK_Philip", "SAFFIN_Janelle", "STEPHENS_Ursula", "TURNBULL_Malcolm",
    "VALE_Danna", "WILLIAMS_John", "ANDREWS_Kevin", "BAILEY_Fran",
    "BILLSON_Bruce", "BROADBENT_Russell", "BYRNE_Anthony", "CARR_Kim",
    "CHEESEMAN_Darren", "CHESTER_Darren", "COLLINS_Jacinta", "CONROY_Stephen",
    "CREAN_Simon", "DREYFUS_Mark", "FEENEY_David", "FERGUSON_Martin",
    "FIELDING_Steve", "FIFIELD_Mitch", "FRYDENBERG_Josh", "GEORGIOU_Petro",
    "GILLARD_Julia", "GRIFFIN_Alan", "HAWKER_David", "KING_Catherine",
    "KROGER_Helen", "MACKLIN_Jenny", "MARLES_Richard", "MARSHALL_Gavin",
    "MCGAURAN_Julian", "MIRABELLA_Sophie", "MITCHELL_Robert", "O'CONNOR_Brendan",
    "PEARCE_Chris", "ROBB_Andrew", "RONALDSON_Michael", "ROXON_Nicola",
    "RYAN_Scott", "SHORTEN_Bill", "STONE_Sharman", "TANNER_Lindsay",
    "THOMSON_Kelvin", "TROETH_Judith", "VAMVAKINOU_Maria", "ANDREWS_Karen",
    "BEVIS_Arch", "BIDGOOD_James", "BOSWELL_Ronald", "BRANDIS_George",
    "BUCHHOLZ_Scott", "CHRISTENSEN_George", "DUTTON_Peter", "EMERSON_Craig",
    "ENTSCH_Warren", "FURNER_Mark", "GAMBARO_Teresa", "HOGG_John",
    "JOHNSON_Michael", "JOYCE_Barnaby", "LINDSAY_Peter", "LIVERMORE_Kirsten",
    "LUDWIG_Joe", "MACDONALD_Ian", "MACFARLANE_Ian", "MASON_Brett",
    "MAY_Margaret", "MCLUCAS_Jan", "MOORE_Claire", "NEUMANN_Shayne",
    "PERRETT_Graham", "PRENTICE_Jane", "RAGUSE_Brett", "SCOTT_Bruce",
    "SLIPPER_Peter", "SOMLYAY_Alexander", "SULLIVAN_Jon", "TREVOR_Chris",
    "TROOD_Russell", "TRUSS_Warren", "VAN%20MANEN_Bert", "ADAMS_Judith",
    "BACK_Chris", "BISHOP_Julie", "BISHOP_Mark", "CASH_Michaelia",
    "CORMANN_Mathias", "EGGLESTON_Alan", "EVANS_Christopher", "JACKSON_Sharryn",
    "JOHNSTON_David", "KEENAN_Michael", "LUDLAM_Scott", "MOYLAN_Judi",
    "PRATT_Louise", "SIEWERT_Rachel", "SMITH_Stephen", "STERLE_Glenn",
    "TUCKEY_Wilson", "BERNARDI_Cory", "BIRMINGHAM_Simon", "BUTLER_Mark",
    "FARRELL_Don", "FERGUSON_Alan", "FISHER_Mary%20Jo", "GEORGANAS_Steve",
    "HANSON-YOUNG_Sarah", "HURLEY_Annette", "MCEWEN_Anne", "MINCHIN_Nick",
    "PYNE_Christopher", "RISHWORTH_Amanda", "SECKER_Patrick", "SOUTHCOTT_Andrew",
    "WONG_Penny", "WORTLEY_Dana", "XENOPHON_Nick", "ABETZ_Eric",
    "BARNETT_Guy", "BILYK_Catryna", "BROWN_Carol", "BUSHBY_David",
    "CAMPBELL_Jodie", "COLBECK_Richard", "COLLINS_Julie", "KERR_Duncan",
    "MILNE_Christine", "O'BRIEN_Kerry", "PARRY_Stephen", "POLLEY_Helen",
    "SHERRY_Nick", "SIDEBOTTOM_Sid", "CROSSIN_Trish", "GRIGGS_Natasha",
    "SCULLION_Nigel", "SNOWDON_Warren", "ELLIS_Annette", "HUMPHRIES_Gary",
    "LUNDY_Kate", "MCMULLAN_Bob",
]

# Full URL of every PDF the main loop downloads.
urls = [BASE_URL + _stem + ".pdf" for _stem in PDF_NAMES]

# Single-URL list; the main loop iterates over `urls`, not this.
# NOTE(review): presumably kept for quick one-file test runs - confirm.
urls2 = [BASE_URL + "LUNDY_Kate.pdf"]
# Main loop: download each expense PDF and turn it into an XML tree.
# NOTE(review): indentation was lost in this copy of the script; the statements
# from here to the end of the file appear to form this loop's body - confirm
# when re-indenting.
for url in urls:
# Fetch the raw PDF bytes for this URL.
pdfdata = urllib2.urlopen(url).read()
print "The pdf file has %d bytes" % len(pdfdata)
# Convert the PDF bytes to XML text with scraperwiki.pdftoxml.
xmldata = scraperwiki.pdftoxml(pdfdata)
#print "After converting to xml it has %d bytes" % len(pdfdata)
#print "The first 2000 characters are: ", pdfdata
# Parse with recover=True (lxml's option for parsing through broken XML).
parser = lxml.etree.XMLParser(recover=True)
root = lxml.etree.XML(xmldata, parser)
# The direct children of the root element are treated as the pages.
pages = list(root)
#print "The pages are numbered:", [ page.attrib.get("number") for page in pages ]
# This function has to work recursively because we might have
# "<b>Part1 <i>part 2</i></b>".
def gettext_with_bi_tags(el):
    """Return the text content of *el* with nested tags kept as markup.

    The element's own text comes first; each child is then rendered as
    "<tag>" + its own rendering + "</tag>", followed by the text that
    trails that child inside *el*.

    The tail of *el* itself is not included: it sits outside *el* and is
    emitted by the caller that is rendering el's parent.

    el -- an element exposing .text, .tail, .tag and iteration over its
          children (as lxml / ElementTree elements do).
    """
    res = []
    if el.text:
        res.append(el.text)
    for lel in el:
        res.append("<%s>" % lel.tag)
        res.append(gettext_with_bi_tags(lel))
        res.append("</%s>" % lel.tag)
        # Text following a child's closing tag is stored on the child
        # (its tail), not on the parent, so it has to be emitted here.
        if lel.tail:
            res.append(lel.tail)
    return "".join(res)
# Get the politician's name from the PDF file name,
# e.g. ".../VAN%20MANEN_Bert.pdf" -> "VAN MANEN Bert".
# The file stem is percent-decoded first so URL escapes such as %20 do not
# end up in the stored name or in the record key built from it.
pdf_stem = url.split('/')[-1].split('.pdf')[0]
politician = urllib.unquote(pdf_stem).replace('_', ' ')
print(politician)
# Check through the PDF to find the travel expense pages and store their
# page indexes. A page qualifies when one of its elements renders (with
# surrounding whitespace stripped and spaces removed) as the bold-italic
# "Travelling Allowance" heading.
travelHeading = "<i><b>TravellingAllowance</b></i>"
travelExpPg = []
for pageno, page in enumerate(pages):
    # any() stops at the first match, so a page is listed once even when
    # the heading occurs more than once on it (otherwise the page would be
    # processed again for every extra match).
    if any(gettext_with_bi_tags(el).strip().replace(' ', '') == travelHeading
           for el in page):
        travelExpPg.append(pageno)
print(travelExpPg)
if not travelExpPg:
    print("No expenses available")
else:
    print("Travel expense pages: " + str(travelExpPg))
# Get the table rows on each travel expense page and save one record per row.
for pageno in travelExpPg:
    elements = list(pages[pageno])

    # Indexes of the elements that start a table row: <text> nodes whose
    # "left" attribute is 67-69 and whose "top" attribute is above 226.
    # NOTE(review): these look like layout coordinates of the first table
    # column below the page header - confirm against a sample PDF.
    rows = [
        idx for idx, el in enumerate(elements)
        if el.tag == "text"
        and 66 < int(el.attrib['left']) < 70
        and int(el.attrib['top']) > 226
    ]
    print(rows)
    if not rows:
        print("rows empty")
        continue

    # Every row except the last runs from its own start index up to (but
    # not including) the start index of the next row.
    table = [(start, elements[start:end]) for start, end in zip(rows, rows[1:])]

    # The last row has no following row start to delimit it, so take every
    # <text> node whose "top" equals that of the last row's first element.
    lastrowheight = int(elements[rows[-1]].attrib['top'])
    lastcells = [
        el for el in elements
        if el.tag == "text" and int(el.attrib['top']) == lastrowheight
    ]
    table.append((rows[-1], lastcells))

    for start, cells in table:
        rangerows = [gettext_with_bi_tags(el) for el in cells]
        print(rangerows)
        # The key is built from the row's OWN start index. Keying regular
        # rows by the next row's start index made the final regular row and
        # the last row share one key, so the save of the last row replaced
        # the second-last row.
        data = {
            'politician': politician,
            'key': str(start) + url + str(pageno) + politician,
            'url': url,
        }
        # Cell texts are stored under their position in the row: "0", "1", ...
        for z, celltext in enumerate(rangerows):
            data[str(z)] = celltext
        print(data)
        scraperwiki.sqlite.save(unique_keys=["key"], data=data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment