Skip to content

Instantly share code, notes, and snippets.

@mh-github
Created September 12, 2019 14:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mh-github/2a2efb80cce5e2b4adf4da2e6489e623 to your computer and use it in GitHub Desktop.
Save mh-github/2a2efb80cce5e2b4adf4da2e6489e623 to your computer and use it in GitHub Desktop.
import PyPDF2, docx, textract, os
def getOutfileName(infile):
    """Return the name of the text file that corresponds to *infile*.

    Everything after the last ``.`` in *infile* is replaced with ``txt``,
    so ``report.pdf`` becomes ``report.txt`` and ``a.b.docx`` becomes
    ``a.b.txt``.  A name that contains no ``.`` at all gets ``.txt``
    appended rather than being thrown away.

    Args:
        infile: Name (or path) of the document being converted.

    Returns:
        The output file name, ending in ``.txt``.
    """
    stem, dot, _extension = infile.rpartition('.')
    if not dot:
        # No extension to strip: keep the whole name instead of returning a
        # bare 'txt', which would make every such input share one output.
        return infile + '.txt'
    return stem + '.txt'
def process_pdf(infile):
    """Extract the text of every page of a PDF into a ``.txt`` file.

    The output file name is whatever ``getOutfileName(infile)`` returns;
    the file is written as UTF-8 and overwritten if it already exists.

    Args:
        infile: Path of the PDF file to read.
    """
    # NOTE(review): PdfFileReader / numPages / getPage / extractText look like
    # the legacy PyPDF2 spellings -- confirm against the installed version.
    with open(infile, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        outFile = getOutfileName(infile)
        # The PDF must stay open while pages are read, hence the nesting.
        with open(outFile, 'w', encoding='utf-8') as outfile:
            # Page indices are zero-based; starting the range at 1 skipped
            # the first page (and produced nothing for one-page documents).
            for pageNum in range(pdfReader.numPages):
                pageObj = pdfReader.getPage(pageNum)
                outfile.write(pageObj.extractText())
def process_docx(infile):
    """Write the paragraph text of a .docx document to a ``.txt`` file.

    The text of each entry in ``document.paragraphs`` is joined with
    newlines and written once to the file named by
    ``getOutfileName(infile)``.

    Args:
        infile: Path of the .docx file to read.
    """
    document = docx.Document(infile)
    doc_string = '\n'.join(para.text for para in document.paragraphs)
    outFile = getOutfileName(infile)
    # Explicit UTF-8 so non-ASCII text does not depend on the platform's
    # default encoding (and matches what process_pdf writes).
    with open(outFile, 'w', encoding='utf-8') as outfile:
        # Write the joined text exactly once; writing it inside a loop over
        # the paragraphs duplicated the whole document per paragraph.
        outfile.write(doc_string)
def process_doc(infile):
    """Convert a legacy .doc file to text via textract.

    Whatever ``textract.process`` returns is written unchanged, in binary
    mode, to the file named by ``getOutfileName(infile)``.

    Args:
        infile: Path of the .doc file to read.
    """
    # Extraction happens first, so a failure leaves no empty output behind.
    extracted = textract.process(infile)
    # Binary mode: the payload is presumably bytes -- no re-encoding is done.
    with open(getOutfileName(infile), 'wb') as sink:
        sink.write(extracted)
# Convert every supported document found in the current working directory.
# Suffix matching is case-sensitive; the first matching suffix wins.
_CONVERTERS = (
    ('.pdf', process_pdf),
    ('.docx', process_docx),
    ('.doc', process_doc),
)

for filename in os.listdir('.'):
    convert = next(
        (func for suffix, func in _CONVERTERS if filename.endswith(suffix)),
        None,
    )
    if convert is not None:
        convert(filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment