Skip to content

Instantly share code, notes, and snippets.

@eledroos
Last active May 11, 2017 17:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save eledroos/78f4963c6137efcd69058fcfeb31e022 to your computer and use it in GitHub Desktop.
Save eledroos/78f4963c6137efcd69058fcfeb31e022 to your computer and use it in GitHub Desktop.
I had a folder of hundreds of emails in the form of PDFs, and I needed to be able to parse them. The goal of this gist is to convert all of the PDFs into text files, which I could then parse with regular expressions into a CSV file.
from cStringIO import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
import os
import sys, getopt
def convert(fname, pages=None):
    """Extract the text of one PDF file with pdfminer.

    Args:
        fname: Path of the PDF file to read (opened in binary mode).
        pages: Optional iterable of page numbers to extract. When omitted
            or empty, an empty set is handed to ``PDFPage.get_pages``
            (presumably pdfminer treats that as "every page" -- confirm
            against the pdfminer documentation).

    Returns:
        The text that pdfminer wrote into the in-memory buffer.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    try:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # open() replaces the Python 2-only file() builtin, and the
            # with-block closes the PDF even if a page fails to parse.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer before it is closed below.
        return output.getvalue()
    finally:
        # The original wrote ``output.close`` without parentheses, which
        # never actually closed the buffer.
        output.close()
def convertMultiple(pdfDirectory, textDirectory):
    """Convert every PDF in ``pdfDirectory`` to a text file in ``textDirectory``.

    Each ``example.pdf`` is passed to ``convert()`` and the returned text is
    written to ``example.pdf.txt`` inside ``textDirectory``. A progress line
    is printed per converted file and a summary count at the end.

    Args:
        pdfDirectory: Directory to scan for PDFs. An empty string means the
            current working directory.
        textDirectory: Directory that receives the generated ``.txt`` files.
            It is not created here, so it has to exist already.
    """
    if pdfDirectory == "":
        pdfDirectory = os.getcwd()

    written = 0
    for pdf in os.listdir(pdfDirectory):
        pdfFilename = os.path.join(pdfDirectory, pdf)
        # splitext() avoids mistaking an extension-less file called "pdf"
        # for a PDF, and lower() also accepts ".PDF".
        if os.path.splitext(pdf)[1].lower() != ".pdf":
            continue
        # Skip directories that merely happen to end in ".pdf".
        if not os.path.isfile(pdfFilename):
            continue

        print("Processing File: " + str(pdf))
        text = convert(pdfFilename)

        # os.path.join works whether or not the directory argument ends
        # with a path separator, on any platform.
        textFilename = os.path.join(textDirectory, pdf + ".txt")
        # The with-block flushes and closes the file; the original left
        # every output handle open.
        with open(textFilename, "w") as textFile:
            textFile.write(text)
        written += 1

    print("Successfully wrote " + str(written) + " files to disk.")
def main(argv):
    """Parse command-line options and run the batch conversion.

    Options:
        -i        print usage information and exit
        -p <dir>  directory containing the source PDFs
        -t <dir>  directory that receives the converted text files

    Args:
        argv: Command-line arguments without the program name
            (i.e. ``sys.argv[1:]``).

    Exits with status 2 when the options cannot be parsed.
    """
    usage = "pdfToT.py -p <pdfDirectory> -t <textdirectory>"

    pdfDirectory = "../Documents/pdf//"  # Put your source directory of PDFs here
    textDirectory = "../Documents/text//"  # Where you want your converted files to go.

    try:
        opts, _unused_args = getopt.getopt(argv, "ip:t:")
    except getopt.GetoptError as err:
        # Show why parsing failed, not just the usage line.
        print(str(err))
        print(usage)
        sys.exit(2)

    for opt, arg in opts:
        if opt == "-i":
            print(usage)
            sys.exit()
        elif opt == "-p":
            # Guarantee a trailing separator so the directory can safely be
            # prefixed to a file name; an empty value is left empty.
            pdfDirectory = os.path.join(arg, "")
        elif opt == "-t":
            textDirectory = os.path.join(arg, "")

    convertMultiple(pdfDirectory, textDirectory)
# Script entry point: forward everything after the program name to main().
if __name__ == "__main__":
    main(argv=sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment