Created
October 20, 2016 17:50
-
-
Save christopherkullenberg/6b40491e7af2edf5b8d22b24a27f9788 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def pdfparser(fn):
    """Convert an uploaded PDF to text and return the extracted text.

    Runs ``python3 upload/pdf2txt.py -o <base>.txt upload/<fn>`` and then
    reads the generated ``<base>.txt`` back as UTF-8, where ``<base>`` is
    ``upload/<fn>`` with its final extension removed. Both the ``upload/``
    directory and the converter script location are hard-coded relative to
    the current working directory; change them to fit your deployment.

    NOTE(review): the converter is presumably pdfminer's ``pdf2txt.py``
    copied into ``upload/`` -- confirm against the deployment.

    Args:
        fn: Name of the PDF file inside ``upload/``.

    Returns:
        The full contents of the generated text file as a string.

    Raises:
        FileNotFoundError: If the converter produced no output file (or
            ``python3`` cannot be found on ``PATH``).
    """
    import os
    import subprocess

    f = "upload/" + fn  # fn is the filename. upload/ is a directory, pls change.

    # Strip only the last extension. Splitting on the first '.' made
    # "a.v1.pdf" and "a.v2.pdf" both write to "upload/a.txt" and broke on
    # dotted directory names.
    fileastextfile = os.path.splitext(f)[0] + ".txt"

    # Pass an argument list without a shell, so spaces or shell
    # metacharacters in the filename cannot break or inject commands.
    cmd = ["python3", "upload/pdf2txt.py", "-o", fileastextfile, f]
    run = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # Converter output is captured only to keep it off the console; a failed
    # conversion surfaces below when the output file cannot be opened.
    run.communicate()

    # The context manager closes the handle, which was previously leaked.
    with open(fileastextfile, "r", encoding="utf-8") as textfile:
        return textfile.read()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment