Created
February 23, 2012 05:09
-
-
Save caseycesari/1890458 to your computer and use it in GitHub Desktop.
Scraperwiki lobbying data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Run the code below here: https://scraperwiki.com/scrapers/new/python?template=advanced-scraping-pdfs#
import scraperwiki | |
import urllib2 | |
import lxml.etree | |
url = "http://www.phila.gov/ethicsboard/pdfs/lobbying%20registrations%20through%201.27.12_lobbyists.pdf" | |
pdfdata = urllib2.urlopen(url).read() | |
print "The pdf file has %d bytes" % len(pdfdata) | |
xmldata = scraperwiki.pdftoxml(pdfdata) | |
print "After converting to xml it has %d bytes" % len(xmldata) | |
print "The first 2000 characters are: ", xmldata[:2000] | |
root = lxml.etree.fromstring(xmldata) | |
pages = list(root) | |
print "The pages are numbered:", [ page.attrib.get("number") for page in pages ] | |
# this function has to work recursively because we might have "<b>Part1 <i>part 2</i></b>"
def gettext_with_bi_tags(el):
    """Return the text of *el* with its nested child tags written inline.

    The element's own text comes first, then every child rendered as
    ``<tag>...</tag>`` (recursively), each followed by that child's tail
    text.  The tail of *el* itself is appended at the very end, so callers
    that relied on it being included keep getting it.

    :param el: an ElementTree-style element (``text``, ``tail``, ``tag``
        attributes, iterable over its children).
    :returns: the flattened string, e.g. ``"a<b>bold</b>tail"``.
    """
    def render_inner(node):
        # Text that precedes the first child (may be None for empty nodes).
        parts = [node.text or ""]
        for child in node:
            parts.append("<%s>" % child.tag)
            parts.append(render_inner(child))
            parts.append("</%s>" % child.tag)
            # A child's tail is the text that follows its closing tag, so it
            # has to be emitted after "</tag>", not inside the child's markup.
            if child.tail:
                parts.append(child.tail)
        return "".join(parts)

    # The top-level element's tail is kept for backward compatibility.
    return render_inner(el) + (el.tail or "")
page0 = pages[0] | |
for el in list(page)[:100]: | |
if el.tag == "text": | |
print el.attrib, gettext_with_bi_tags(el) | |
####### | |
This outputs: | |
{'font': '0', 'width': '100', 'top': '128', 'height': '12', 'left': '75'} <b>Lobbyist Name </b> | |
{'font': '0', 'width': '65', 'top': '128', 'height': '12', 'left': '221'} <b>Address 1</b> | |
{'font': '0', 'width': '65', 'top': '128', 'height': '12', 'left': '394'} <b>Address 2</b> | |
{'font': '0', 'width': '65', 'top': '128', 'height': '12', 'left': '518'} <b>Address 3</b> | |
{'font': '0', 'width': '26', 'top': '128', 'height': '12', 'left': '638'} <b>City</b> | |
{'font': '0', 'width': '33', 'top': '128', 'height': '12', 'left': '730'} <b>State</b> | |
{'font': '0', 'width': '20', 'top': '128', 'height': '12', 'left': '786'} <b>Zip</b> | |
{'font': '0', 'width': '45', 'top': '128', 'height': '12', 'left': '850'} <b>Phone </b> | |
{'font': '0', 'width': '36', 'top': '129', 'height': '12', 'left': '924'} <b>Email</b> | |
{'font': '2', 'width': '620', 'top': '86', 'height': '15', 'left': '31'} <b>2012 Lobbying Registrati...more | |
{'font': '1', 'width': '87', 'top': '163', 'height': '12', 'left': '30'} Wilson, David | |
{'font': '1', 'width': '109', 'top': '163', 'height': '12', 'left': '221'} 3141 Chestnut ST | |
{'font': '1', 'width': '52', 'top': '163', 'height': '12', 'left': '394'} STE 228 | |
{'font': '1', 'width': '74', 'top': '163', 'height': '12', 'left': '638'} Philadelphia | |
{'font': '1', 'width': '18', 'top': '163', 'height': '12', 'left': '741'} PA | |
{'font': '1', 'width': '260', 'top': '163', 'height': '13', 'left': '780'} 19104 215-895-2109 dew39@...more | |
{'font': '1', 'width': '108', 'top': '197', 'height': '12', 'left': '30'} Wojdak, Stephen | |
{'font': '1', 'width': '96', 'top': '197', 'height': '12', 'left': '221'} 200 S Broad ST | |
{'font': '1', 'width': '52', 'top': '197', 'height': '12', 'left': '394'} STE 850 | |
{'font': '1', 'width': '74', 'top': '197', 'height': '12', 'left': '638'} Philadelphia | |
{'font': '1', 'width': '18', 'top': '197', 'height': '12', 'left': '741'} PA | |
{'font': '1', 'width': '277', 'top': '197', 'height': '13', 'left': '780'} 19102 215-735-6660 swojda...more | |
{'font': '1', 'width': '104', 'top': '232', 'height': '12', 'left': '30'} Zalenski, Andrew | |
{'font': '1', 'width': '79', 'top': '232', 'height': '12', 'left': '221'} 230 State ST | |
{'font': '1', 'width': '63', 'top': '232', 'height': '12', 'left': '638'} Harrisburg | |
{'font': '1', 'width': '18', 'top': '232', 'height': '12', 'left': '741'} PA | |
{'font': '1', 'width': '333', 'top': '232', 'height': '13', 'left': '780'} 17101 215-545-4980 andrew...more |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment