Skip to content

Instantly share code, notes, and snippets.

@terencezl
Created April 20, 2017 04:39
Show Gist options
  • Save terencezl/61fe3f28c44a763dd1e9f060b8ff6f2e to your computer and use it in GitHub Desktop.
Save terencezl/61fe3f28c44a763dd1e9f060b8ff6f2e to your computer and use it in GitHub Desktop.
use pdfminer to extract pdf
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter, XMLConverter, HTMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO
def convert_pdf(path, format='text', codec='utf-8', password=''):
    """Extract the contents of a PDF file as text, HTML or XML.

    Args:
        path: Filesystem path of the PDF to read (opened in binary mode).
        format: Output flavour, one of ``'text'``, ``'html'`` or ``'xml'``.
            (The name shadows the builtin but is kept so existing keyword
            callers keep working.)
        codec: Encoding name handed to the pdfminer converter.
        password: Password forwarded to ``PDFPage.get_pages``.

    Returns:
        The converted document as a ``str``.

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    # Validate first so a bad argument fails before anything is allocated.
    if format not in ('text', 'html', 'xml'):
        raise ValueError('provide format, either text, html or xml!')

    converters = {
        'text': TextConverter,
        'html': HTMLConverter,
        'xml': XMLConverter,
    }

    rsrcmgr = PDFResourceManager()
    with BytesIO() as retstr:
        device = converters[format](
            rsrcmgr, retstr, codec=codec, laparams=LAParams()
        )
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                # An empty page set with maxpages=0 means "process every page".
                pages = PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password=password,
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
        finally:
            # Close the device BEFORE reading the buffer: pdfminer's HTML and
            # XML converters emit their closing markup in close(), so reading
            # earlier returns a truncated document.
            device.close()

        # NOTE(review): the buffer is decoded as UTF-8 regardless of `codec`,
        # as in the original; confirm against the installed pdfminer version
        # before passing a non-UTF-8 codec.
        return retstr.getvalue().decode()
@DevilBoy007
Copy link

DevilBoy007 commented Aug 12, 2021

`__init__()` got an unexpected keyword argument 'codec' ... ??

EDIT: class TextConverter(PDFConverter) does not take a codec argument

def __init__(self, rsrcmgr, outfp, pageno=1, laparams=None,
showpageno=False, imagewriter=None):
PDFConverter.__init__(self, rsrcmgr, outfp, pageno=pageno, laparams=laparams)

@NaveenTanguduTR
Copy link

While running the above code, I get this error:

Traceback (most recent call last):
File "C:/Users//AppData/Roaming/JetBrains/PyCharmCE2021.1/scratches/scratch_10.py", line 41, in
if name == main():
File "C:/Users/
/AppData/Roaming/JetBrains/PyCharmCE2021.1/scratches/scratch_10.py", line 38, in main
out = convert_pdf(fileName, codec)
File "C:/Users//AppData/Roaming/JetBrains/PyCharmCE2021.1/scratches/scratch_10.py", line 18, in convert_pdf
device = XMLConverter(rsrcmgr, retstr, laparams=laparams)
File "C:\Users*
\PycharmProjects\gcs-authoring_authoring-service\venv\lib\site-packages\pdfminer\converter.py", line 407, in init
self.write_header()
File "C:\Users*
****\PycharmProjects\gcs-authoring_authoring-service\venv\lib\site-packages\pdfminer\converter.py", line 411, in write_header
self.outfp.write('\n')
TypeError: a bytes-like object is required, not 'str'

@SLadovir
Copy link

SLadovir commented Nov 12, 2021

image
image
image
We need to add the last line:
image
image
After which we get this:
image

As a workaround, we can fix it this way:
image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment