Skip to content

Instantly share code, notes, and snippets.

@altmind
Created January 28, 2012 10:50
Show Gist options
  • Save altmind/1693933 to your computer and use it in GitHub Desktop.
BelstatPDF
import scraperwiki
import urllib
import tempfile
import os
from xml import *
import re
def pdftoxml(pdfdata):
    """Convert raw PDF content to XML text using the external ``pdftohtml`` tool.

    The PDF content is written to a temporary ``.pdf`` file,
    ``/usr/bin/pdftohtml`` is run against it in ``-xml`` mode, and the
    contents of a temporary ``.xml`` file are read back and returned.

    Args:
        pdfdata: The PDF document content to write to the temporary file.

    Returns:
        The text read from the temporary XML file.  The converter's exit
        status is not checked, so the result is an empty string when the
        converter is unavailable or wrote nothing.
    """
    import subprocess

    # Context managers guarantee both temporary files are closed even if
    # writing the PDF or launching the converter raises.
    with tempfile.NamedTemporaryFile(suffix='.pdf') as pdffout, \
            tempfile.NamedTemporaryFile(mode='r', suffix='.xml') as xmlin:
        pdffout.write(pdfdata)
        # The converter opens the file by name, so buffered data must be
        # flushed to disk before it runs.
        pdffout.flush()

        # The output argument is the temp XML path without its extension.
        # NOTE(review): this relies on pdftohtml appending ".xml" itself so
        # that the output lands in the file already held open as `xmlin` --
        # confirm against the installed pdftohtml version.
        cmd = [
            '/usr/bin/pdftohtml', '-xml', '-nodrm', '-zoom', '1.5',
            '-enc', 'UTF-8', '-noframes',
            pdffout.name, os.path.splitext(xmlin.name)[0],
        ]

        # An argument list avoids shell quoting problems with the temp paths.
        # pdftohtml's chatter cannot be switched off, so stdout and stderr
        # are both discarded.
        with open(os.devnull, 'wb') as devnull:
            try:
                subprocess.call(cmd, stdout=devnull, stderr=devnull)
            except OSError:
                # Previously a missing or unlaunchable converter went
                # unnoticed because the shell's exit status was ignored;
                # keep that best-effort contract and return what was
                # written (nothing).
                pass

        # NOTE(review): on Python 3 this text-mode handle decodes with the
        # platform default encoding although pdftohtml is asked for UTF-8 --
        # confirm if the script is ported.
        return xmlin.read()
def scraper_pdf(pdfurl):
    """Download the PDF at ``pdfurl`` and return its XML conversion.

    Args:
        pdfurl: URL of the PDF document to fetch.

    Returns:
        Whatever ``pdftoxml`` returns for the downloaded content.
    """
    from contextlib import closing

    # ``urlopen`` lives in ``urllib.request`` on Python 3 and directly in
    # ``urllib`` on Python 2; try the former and fall back to the latter.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    # ``closing`` releases the connection as soon as the body has been read,
    # instead of leaving the response object open.
    with closing(urlopen(pdfurl)) as response:
        pdfdata = response.read()
    return pdftoxml(pdfdata)
# Fetch the Belstat PDF and print its XML conversion.  The parenthesised
# single-argument form of print is valid on both Python 2 and Python 3.
print(scraper_pdf("""http://belstat.gov.by/homep/ru/indicators/regions_current_data_2011/06.pdf"""))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment