Skip to content

Instantly share code, notes, and snippets.

@caseycesari
Created February 23, 2012 05:09
Show Gist options
  • Save caseycesari/1890458 to your computer and use it in GitHub Desktop.
Scraperwiki lobbying data
Run the code below here: https://scraperwiki.com/scrapers/new/python?template=advanced-scraping-pdfs#
# Scrape the Philadelphia Ethics Board lobbyist-registration PDF and turn it
# into XML for parsing.  Runs on ScraperWiki's Python 2 environment.
import scraperwiki
import urllib2
import lxml.etree
# Source PDF: 2012 lobbying registrations through 2012-01-27 (lobbyists list).
url = "http://www.phila.gov/ethicsboard/pdfs/lobbying%20registrations%20through%201.27.12_lobbyists.pdf"
pdfdata = urllib2.urlopen(url).read()
print "The pdf file has %d bytes" % len(pdfdata)
# Convert the PDF to an XML representation; the printed output further down
# shows it yields <text> elements carrying font/width/top/height/left
# layout attributes, grouped under numbered <page> elements.
xmldata = scraperwiki.pdftoxml(pdfdata)
print "After converting to xml it has %d bytes" % len(xmldata)
print "The first 2000 characters are: ", xmldata[:2000]
root = lxml.etree.fromstring(xmldata)
# The root's direct children are the <page> elements.
pages = list(root)
# NOTE(review): in Python 2 the comprehension variable `page` leaks into
# module scope after this line (it is referenced again near the bottom of
# the script).
print "The pages are numbered:", [ page.attrib.get("number") for page in pages ]
# This function has to work recursively because we might have
# "<b>Part1 <i>part 2</i></b>".
def gettext_with_bi_tags(el):
    """Return the text content of `el` with inline child tags (e.g. <b>,
    <i>) rendered as literal "<tag>...</tag>" markers.

    Recurses into nested children so constructs like
    "<b>Part1 <i>part 2</i></b>" are reproduced faithfully.  A child's
    tail text is emitted *after* its closing tag; the naive version
    appended the tail inside the recursive result, producing e.g.
    "<b>boldtail</b>" for "<x>a<b>bold</b>tail</x>".
    """
    parts = []
    if el.text:
        parts.append(el.text)
    for child in el:
        parts.append("<%s>" % child.tag)
        parts.append(gettext_with_bi_tags(child))
        parts.append("</%s>" % child.tag)
        # Text that follows the child belongs after its closing tag,
        # not inside it.
        if child.tail:
            parts.append(child.tail)
    return "".join(parts)
page0 = pages[0]
for el in list(page)[:100]:
if el.tag == "text":
print el.attrib, gettext_with_bi_tags(el)
#######
This outputs:
{'font': '0', 'width': '100', 'top': '128', 'height': '12', 'left': '75'} <b>Lobbyist Name </b>
{'font': '0', 'width': '65', 'top': '128', 'height': '12', 'left': '221'} <b>Address 1</b>
{'font': '0', 'width': '65', 'top': '128', 'height': '12', 'left': '394'} <b>Address 2</b>
{'font': '0', 'width': '65', 'top': '128', 'height': '12', 'left': '518'} <b>Address 3</b>
{'font': '0', 'width': '26', 'top': '128', 'height': '12', 'left': '638'} <b>City</b>
{'font': '0', 'width': '33', 'top': '128', 'height': '12', 'left': '730'} <b>State</b>
{'font': '0', 'width': '20', 'top': '128', 'height': '12', 'left': '786'} <b>Zip</b>
{'font': '0', 'width': '45', 'top': '128', 'height': '12', 'left': '850'} <b>Phone </b>
{'font': '0', 'width': '36', 'top': '129', 'height': '12', 'left': '924'} <b>Email</b>
{'font': '2', 'width': '620', 'top': '86', 'height': '15', 'left': '31'} <b>2012 Lobbying Registrati...more
{'font': '1', 'width': '87', 'top': '163', 'height': '12', 'left': '30'} Wilson, David
{'font': '1', 'width': '109', 'top': '163', 'height': '12', 'left': '221'} 3141 Chestnut ST
{'font': '1', 'width': '52', 'top': '163', 'height': '12', 'left': '394'} STE 228
{'font': '1', 'width': '74', 'top': '163', 'height': '12', 'left': '638'} Philadelphia
{'font': '1', 'width': '18', 'top': '163', 'height': '12', 'left': '741'} PA
{'font': '1', 'width': '260', 'top': '163', 'height': '13', 'left': '780'} 19104 215-895-2109 dew39@...more
{'font': '1', 'width': '108', 'top': '197', 'height': '12', 'left': '30'} Wojdak, Stephen
{'font': '1', 'width': '96', 'top': '197', 'height': '12', 'left': '221'} 200 S Broad ST
{'font': '1', 'width': '52', 'top': '197', 'height': '12', 'left': '394'} STE 850
{'font': '1', 'width': '74', 'top': '197', 'height': '12', 'left': '638'} Philadelphia
{'font': '1', 'width': '18', 'top': '197', 'height': '12', 'left': '741'} PA
{'font': '1', 'width': '277', 'top': '197', 'height': '13', 'left': '780'} 19102 215-735-6660 swojda...more
{'font': '1', 'width': '104', 'top': '232', 'height': '12', 'left': '30'} Zalenski, Andrew
{'font': '1', 'width': '79', 'top': '232', 'height': '12', 'left': '221'} 230 State ST
{'font': '1', 'width': '63', 'top': '232', 'height': '12', 'left': '638'} Harrisburg
{'font': '1', 'width': '18', 'top': '232', 'height': '12', 'left': '741'} PA
{'font': '1', 'width': '333', 'top': '232', 'height': '13', 'left': '780'} 17101 215-545-4980 andrew...more
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment