Last active
May 11, 2017 17:55
-
-
Save eledroos/78f4963c6137efcd69058fcfeb31e022 to your computer and use it in GitHub Desktop.
I had a folder of hundreds of emails in the form of PDFs, and I needed to be able to parse them. So, the goal of this gist is to convert all of the PDFs into text files, which I could then parse with regular expressions into a CSV file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from cStringIO import StringIO | |
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter | |
from pdfminer.converter import TextConverter | |
from pdfminer.layout import LAParams | |
from pdfminer.pdfpage import PDFPage | |
import os | |
import sys, getopt | |
def convert(fname, pages=None):
    """Extract the text of one PDF file with pdfminer and return it as a string.

    fname -- path of the PDF file to read (opened in binary mode).
    pages -- optional iterable of page numbers, handed to
             PDFPage.get_pages() as a set. When it is None or empty an
             empty set is passed instead, which pdfminer treats as
             "no page filter" (see the pdfminer docs for the numbering).

    Returns whatever the TextConverter wrote into the in-memory buffer.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    try:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # open() instead of the Python-2-only file() builtin; the with
            # block closes the PDF even if pdfminer raises while parsing.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer only after the converter is closed, as before.
        return output.getvalue()
    finally:
        # The original wrote `output.close` without parentheses, which never
        # actually closed the buffer.
        output.close()
def convertMultiple(pdfDirectory, textDirectory):
    """Convert all PDFs in pdfDirectory, saving the text files to textDirectory.

    pdfDirectory  -- directory scanned (non-recursively) for PDF files; an
                     empty value means the current working directory.
    textDirectory -- directory receiving one "<name>.pdf.txt" file per PDF.

    Prints a progress line per converted PDF and a final count.
    """
    if not pdfDirectory:
        pdfDirectory = os.getcwd()  # Fall back to the current working directory.

    written = 0
    for pdf in os.listdir(pdfDirectory):  # Iterate through entries in pdfDirectory
        # splitext() only matches a real ".pdf" suffix (a file literally named
        # "pdf" is not a PDF); lower() also accepts names such as "SCAN.PDF".
        if os.path.splitext(pdf)[1].lower() != ".pdf":
            continue

        print("Processing File: " + str(pdf))
        # os.path.join works whether or not the directory arguments end with a
        # path separator, and on every platform.
        pdfFilename = os.path.join(pdfDirectory, pdf)
        text = convert(pdfFilename)  # Send PDF file to convert()

        textFilename = os.path.join(textDirectory, pdf + ".txt")
        # The with block guarantees the text is flushed and the handle closed.
        with open(textFilename, "w") as textFile:
            textFile.write(text)
        written += 1

    print("Successfully wrote " + str(written) + " files to disk.")
def main(argv):
    """Parse command-line options and run the batch conversion.

    argv -- list of arguments without the program name (e.g. sys.argv[1:]).

    Options:
      -i        print the usage line and exit
      -p <dir>  pdfDirectory: directory containing the source PDFs
      -t <dir>  textDirectory: directory for the converted text files

    An unrecognised option prints the usage line and exits with status 2.
    """
    # Single definition of the usage text (the original repeated it and
    # misspelled the first placeholder as "<pdfDirectoryectory>").
    usage = "pdfToT.py -p <pdfDirectory> -t <textdirectory>"

    pdfDirectory = "../Documents/pdf//"  # Put your source directory of PDFs here
    textDirectory = "../Documents/text//"  # Where you want your converted files to go.

    try:
        opts, args = getopt.getopt(argv, "ip:t:")
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    for opt, arg in opts:
        if opt == "-i":
            print(usage)
            sys.exit()
        elif opt == "-p":
            pdfDirectory = arg
        elif opt == "-t":
            textDirectory = arg

    convertMultiple(pdfDirectory, textDirectory)
# Script entry point: forward the command-line arguments (minus the program
# name) to main() only when this file is executed directly.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment