Skip to content

Instantly share code, notes, and snippets.

@vmesel
Last active October 18, 2018 14:03
Show Gist options
  • Save vmesel/38a0e8c19635e4e9ad798e9a4e41b88e to your computer and use it in GitHub Desktop.
Save vmesel/38a0e8c19635e4e9ad798e9a4e41b88e to your computer and use it in GitHub Desktop.
import io
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Each page yielded by ``PDFPage.get_pages`` is run through a pdfminer
    ``TextConverter`` that writes into an in-memory buffer; the buffer's
    contents are returned once all pages have been processed.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The extracted text, or ``None`` when no text was extracted.
        The text is round-tripped through Latin-1 with ``'replace'``, so
        any character that cannot be encoded as Latin-1 comes back as
        ``'?'``.

    Raises:
        OSError: If ``pdf_path`` cannot be opened.
        Errors raised by pdfminer while parsing are propagated unchanged.
    """
    resource_manager = PDFResourceManager()

    # The buffer and the converter are both released even when opening or
    # parsing the PDF fails part-way through.
    with io.StringIO() as text_buffer:
        converter = TextConverter(
            resource_manager, text_buffer, laparams=LAParams()
        )
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            with open(pdf_path, 'rb') as fh:
                pages = PDFPage.get_pages(
                    fh, caching=True, check_extractable=True
                )
                for page in pages:
                    page_interpreter.process_page(page)

            # Read the buffer before it is closed. The Latin-1 round trip
            # swaps every non-Latin-1 character for '?'.
            text = (
                text_buffer.getvalue()
                .encode('latin-1', 'replace')
                .decode('latin-1')
            )
        finally:
            converter.close()

    # Callers get None (not '') for an empty extraction, as before.
    return text if text else None
if __name__ == '__main__':
    # Imported here because only the script entry point needs sys.
    import sys

    # Take the PDF path from the first command-line argument; fall back to
    # the sample file so running with no arguments behaves as before.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else 'ex4.pdf'
    print(extract_text_from_pdf(pdf_path))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment