Created
August 16, 2018 16:05
-
-
Save lobstrio/b6aa541c141d44b8e93325562ce18171 to your computer and use it in GitHub Desktop.
Python 3 script to convert .pdf file into .txt output using PDFMiner
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
# coding: utf-8 | |
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter | |
from pdfminer.converter import TextConverter | |
from pdfminer.pdfpage import PDFPage | |
from io import BytesIO | |
import argparse | |
def pdf2xt(path):
    """
    Extract the text of a PDF file with PDFMiner.

    Every page of the document is run through a ``TextConverter`` that
    writes into an in-memory binary buffer; the buffer's content is returned.

    :param path: (str) path to the .pdf file
    :return: (bytes) the extracted text as written by the converter
        (presumably UTF-8, PDFMiner's default codec -- confirm against the
        installed PDFMiner version)
    :raises OSError: if ``path`` cannot be opened for reading
    """
    rsrcmgr = PDFResourceManager()
    # The buffer is managed by ``with`` so it is released even on failure.
    with BytesIO() as retstr:
        device = TextConverter(rsrcmgr, retstr)
        try:
            with open(path, "rb") as fp:  # 'rb' mode: a PDF is read as raw bytes
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, check_extractable=True):
                    interpreter.process_page(page)
        finally:
            # Close the converter whether or not page processing succeeded.
            device.close()
        # Read the buffer before the ``with`` block closes it.
        return retstr.getvalue()
if __name__ == '__main__':
    # Command-line entry point: extract the text of the PDF given with
    # -s/--source, print it, and optionally save it to the path given with
    # -f/--file.

    # ARGUMENTS FOR THE EXTRACTOR
    argparser = argparse.ArgumentParser()
    argparser.add_argument("-s", "--source", required=True, help="path to input .pdf to be converted")
    argparser.add_argument("-f", "--file", required=False, help="path to the ouput .txt file")
    args = argparser.parse_args()
    source = args.source
    output_path = args.file  # None when -f/--file is not supplied

    # EXTRACTING TEXT
    print('-- Extracting')
    pdf_text = pdf2xt(source)
    # pdf2xt returns bytes; decode for display so the console shows the text
    # itself rather than a b'...' repr. Presumably UTF-8 (PDFMiner's default
    # codec); errors="replace" keeps unexpected byte sequences from raising.
    print('-- Output: {}'.format(pdf_text.decode("utf-8", errors="replace")))

    # SAVING TEXT IN .TXT FILE
    if output_path:
        print('-- Writing file')
        # Binary mode: the extracted bytes are written unchanged.
        with open(output_path, "wb") as f:
            f.write(pdf_text)
        print('-- File written here: {}'.format(output_path))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment