Skip to content

Instantly share code, notes, and snippets.

@shimo164
Last active July 3, 2019 11:37
Show Gist options
  • Save shimo164/7ace31f6f9a4740e95fad6ff0743362c to your computer and use it in GitHub Desktop.
Save shimo164/7ace31f6f9a4740e95fad6ff0743362c to your computer and use it in GitHub Desktop.
import re
from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
# Matches runs of one or more of the characters listed in the class;
# convert_pdf_to_txt removes every match from the text it returns.
space = re.compile(r"[  ]+")
def convert_pdf_to_txt(path, txtname, buf=True):
    """Extract the text of a PDF file using pdfminer.

    Args:
        path: Path of the PDF file to read.
        txtname: Path of the text file to write. Only used when ``buf`` is
            false.
        buf: When true (the default), collect the extracted text in memory
            and return it with every match of the module-level ``space``
            pattern removed. When false, write the extracted text,
            unmodified, to ``txtname``.

    Returns:
        The extracted text with ``space`` matches removed when ``buf`` is
        true; ``None`` when the text was written to ``txtname``.
    """
    codec = 'utf-8'
    laparams = LAParams()
    laparams.detect_vertical = True
    rsrcmgr = PDFResourceManager()

    # Open the output file with the same codec that is handed to the
    # converter instead of relying on the platform's default encoding.
    outfp = StringIO() if buf else open(txtname, 'w', encoding=codec)
    # The context managers guarantee that both the output stream and the
    # input PDF are closed even if page processing raises.
    with outfp:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp):
                    interpreter.process_page(page)
        finally:
            device.close()
        if buf:
            # Read the buffer before the ``with`` block closes it.
            return space.sub("", outfp.getvalue())
    return None
# Run the sample conversion only when this file is executed as a script,
# so that importing the module does not touch the filesystem.
if __name__ == "__main__":
    convert_pdf_to_txt("TEST.pdf", "from_pdf.txt", False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment