Created
January 28, 2012 10:50
-
-
Save altmind/1693933 to your computer and use it in GitHub Desktop.
BelstatPDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scraperwiki | |
import urllib | |
import tempfile | |
import os | |
from xml import * | |
import re | |
def pdftoxml(pdfdata):
    """Convert raw PDF data to the XML produced by ``pdftohtml -xml``.

    The PDF bytes are written to a temporary ``.pdf`` file, the external
    ``/usr/bin/pdftohtml`` tool is run on it, and the XML it writes is read
    back from a temporary ``.xml`` file.

    Args:
        pdfdata: The PDF document contents, as returned by reading the
            download response.

    Returns:
        The contents of the XML output file. This is the empty string when
        the converter is missing or fails to produce output; failures are
        deliberately not raised (best-effort conversion).
    """
    # Local import so the file's import header does not need to change.
    import subprocess

    # The context managers guarantee both temporary files are closed (and so
    # deleted) even if writing the PDF or running the converter raises.
    with tempfile.NamedTemporaryFile(suffix='.pdf') as pdffout:
        pdffout.write(pdfdata)
        pdffout.flush()  # make sure pdftohtml sees the full document on disk

        with tempfile.NamedTemporaryFile(mode='r', suffix='.xml') as xmlin:
            # pdftohtml is given the output name without its extension; the
            # result is read back from the '.xml' temporary file's path.
            outbase = os.path.splitext(xmlin.name)[0]
            cmd = [
                '/usr/bin/pdftohtml',
                '-xml',
                '-nodrm',
                '-zoom', '1.5',
                '-enc', 'UTF-8',
                '-noframes',
                pdffout.name,
                outbase,
            ]
            # The tool's output can't be turned off, so both stdout and
            # stderr are thrown away. Passing an argument list (no shell)
            # keeps unusual characters in the temp paths from being
            # interpreted by a shell.
            with open(os.devnull, 'w') as devnull:
                try:
                    subprocess.call(cmd, stdout=devnull, stderr=devnull)
                except OSError:
                    # Converter not installed / not executable: keep the
                    # previous best-effort behavior and return whatever the
                    # (empty) output file contains instead of crashing.
                    pass
            return xmlin.read()
def scraper_pdf(pdfurl):
    """Download the PDF at ``pdfurl`` and return it converted to XML.

    The document is fetched with ``urllib.urlopen`` and the downloaded data
    is handed to ``pdftoxml``; that function's result is returned unchanged.
    """
    return pdftoxml(urllib.urlopen(pdfurl).read())
# Script entry: convert the Belstat PDF at the URL below and print the XML.
print(scraper_pdf("http://belstat.gov.by/homep/ru/indicators/regions_current_data_2011/06.pdf"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment