Created
July 23, 2013 02:18
-
-
Save thequbit/6059362 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _decodepdf(filename):
    """Extract the text of a PDF and put each known field label on its own line.

    The PDF at ``filename`` is rendered to text through a ``TextConverter``.
    Every occurrence of a known label (``'Book Dt:'``, ``'Bail:'``, ...) is
    then rewritten as a newline, the label, and a single space, after which
    runs of spaces and runs of newlines are each collapsed to one character.

    Args:
        filename: Path of the PDF file to read; it is opened in binary mode.

    Returns:
        A ``(text, True)`` tuple. The second element is always ``True``.

    Raises:
        IOError/OSError: If ``filename`` cannot be opened. Errors raised by
            the PDF processing call propagate unchanged.
    """
    labels = (
        'Book Dt:', 'Book Typ:', 'Cus Typ:', 'Bail:', 'Bond:', 'Court:',
        'Judge:', 'Arr Agy:', 'Arr Typ:', 'ROC:', 'Chg:', 'Indict:',
        'Adj Dt:', 'Term:',
    )

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            # open() instead of the Python-2-only file() builtin; the with
            # block closes the handle even if processing fails.
            with open(filename, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp)
        finally:
            device.close()
        # Read the buffer only after the device is closed, as before.
        pdfstr = retstr.getvalue()
    finally:
        retstr.close()

    # Start every labelled field on a fresh line, with one space after it.
    for label in labels:
        pdfstr = pdfstr.replace(label, "\n{0} ".format(label))

    # NOTE(review): whitespace is collapsed once, after all labels have been
    # inserted -- confirm this matches the intended scope of the cleanup.
    pdfstr = re.sub(' +', ' ', pdfstr)
    pdfstr = re.sub('\n+', '\n', pdfstr)
    return pdfstr, True
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment