Skip to content

Instantly share code, notes, and snippets.

@thequbit
Created July 23, 2013 02:18
Show Gist options
  • Save thequbit/6059362 to your computer and use it in GitHub Desktop.
Save thequbit/6059362 to your computer and use it in GitHub Desktop.
def _decodepdf(filename):
    """Extract the text of a PDF file and put each known field label on its own line.

    The PDF at *filename* is run through ``process_pdf`` with a
    ``TextConverter`` that writes into an in-memory ``StringIO`` buffer.
    In the extracted text every occurrence of a known label (``'Book Dt:'``,
    ``'Bail:'``, ...) is then rewritten as a newline, the label, and a
    space, after which runs of spaces and runs of newlines are each
    collapsed to a single character.

    Args:
        filename: Path of the PDF file to read; opened in binary mode.

    Returns:
        A ``(text, True)`` tuple. The second element is always ``True``;
        failures surface as exceptions rather than as a ``False`` flag.

    Raises:
        Whatever ``open`` or the PDF-processing calls raise; nothing is
        caught here.
    """
    # Labels that should each start a new line in the extracted text.
    labels = (
        'Book Dt:', 'Book Typ:', 'Cus Typ:', 'Bail:', 'Bond:', 'Court:',
        'Judge:', 'Arr Agy:', 'Arr Typ:', 'ROC:', 'Chg:', 'Indict:',
        'Adj Dt:', 'Term:',
    )

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            # ``open`` instead of the Python 2-only ``file`` builtin; the
            # ``with`` block closes the handle even if processing fails.
            with open(filename, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp)
        finally:
            device.close()
        # Read the buffer only after the device is closed, preserving the
        # original call order.
        pdfstr = retstr.getvalue()
    finally:
        retstr.close()

    for label in labels:
        pdfstr = pdfstr.replace(label, "\n{0} ".format(label))

    # Collapse the extra whitespace left by extraction and by the label
    # rewriting above.
    pdfstr = re.sub(r' +', ' ', pdfstr)
    pdfstr = re.sub(r'\n+', '\n', pdfstr)
    return pdfstr, True
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment