Skip to content

Instantly share code, notes, and snippets.

@terencezl
Created April 20, 2017 04:39
Show Gist options
  • Save terencezl/61fe3f28c44a763dd1e9f060b8ff6f2e to your computer and use it in GitHub Desktop.
Save terencezl/61fe3f28c44a763dd1e9f060b8ff6f2e to your computer and use it in GitHub Desktop.
Use pdfminer to extract the contents of a PDF as text, HTML, or XML
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter, XMLConverter, HTMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO
def convert_pdf(path, format='text', codec='utf-8', password=''):
    """Convert the PDF at *path* and return the result as a string.

    Args:
        path: Filesystem path of the PDF to read.
        format: Output flavour: ``'text'``, ``'html'`` or ``'xml'``.
        codec: Encoding handed to the converter; the same encoding is used
            to decode the collected bytes back into the returned string.
        password: Password passed through to ``PDFPage.get_pages``.

    Returns:
        The converted document as a ``str``.

    Raises:
        ValueError: If *format* is not one of the supported values.
    """
    # Validate up front so a bad format never opens the file or builds
    # any pdfminer object.
    if format not in ('text', 'html', 'xml'):
        raise ValueError('provide format, either text, html or xml!')

    converters = {
        'text': TextConverter,
        'html': HTMLConverter,
        'xml': XMLConverter,
    }

    # Context managers guarantee the input file and the output buffer are
    # released even if page processing raises.
    with open(path, 'rb') as fp, BytesIO() as retstr:
        rsrcmgr = PDFResourceManager()
        device = converters[format](
            rsrcmgr, retstr, codec=codec, laparams=LAParams()
        )
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            pages = PDFPage.get_pages(
                fp,
                set(),  # empty set: no page-number filter
                maxpages=0,  # 0: no page limit
                password=password,
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        finally:
            # The device must be closed BEFORE the buffer is read: the
            # HTML and XML converters emit their closing markup from
            # close(), so reading first would return a truncated document.
            device.close()

        # Decode with the codec the converter wrote with, not the
        # interpreter default.
        return retstr.getvalue().decode(codec)
@SLadovir
Copy link

SLadovir commented Nov 12, 2021

image
image
image
We need to add the last line:
image
image
After which we get this:
image

Using a workaround, we can fix it this way:
image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment