Last active
October 18, 2018 14:03
-
-
Save vmesel/38a0e8c19635e4e9ad798e9a4e41b88e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io | |
from pdfminer.converter import TextConverter | |
from pdfminer.pdfinterp import PDFPageInterpreter | |
from pdfminer.pdfinterp import PDFResourceManager | |
from pdfminer.layout import LAParams | |
from pdfminer.pdfpage import PDFPage | |
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file using pdfminer.

    Each page returned by ``PDFPage.get_pages`` is rendered through a
    ``TextConverter`` into an in-memory text buffer, and the accumulated
    buffer contents are returned as one string.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The extracted text, with every character that cannot be encoded as
        Latin-1 replaced by ``?``. Returns ``None`` when no text was
        extracted.

    Raises:
        OSError: If ``pdf_path`` cannot be opened.
    """
    resource_manager = PDFResourceManager()

    # The buffer and the converter are released even when opening the file or
    # processing a page raises, so a failed extraction does not leak them.
    with io.StringIO() as fake_file_handle:
        converter = TextConverter(
            resource_manager, fake_file_handle, laparams=LAParams()
        )
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            with open(pdf_path, 'rb') as fh:
                for page in PDFPage.get_pages(fh,
                                              caching=True,
                                              check_extractable=True):
                    page_interpreter.process_page(page)
            # Read the buffer before it is closed on leaving the ``with``.
            raw_text = fake_file_handle.getvalue()
        finally:
            converter.close()

    # Round-trip through Latin-1 so the result only contains characters in
    # that charset; anything else becomes '?'.
    text = raw_text.encode('latin-1', 'replace').decode('latin-1')

    # Keep the historical contract: empty output is reported as None.
    return text if text else None
if __name__ == '__main__':
    # Script entry point: extract the text of the sample PDF and print it.
    extracted = extract_text_from_pdf('ex4.pdf')
    print(extracted)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment