# mh-github/documents_to_text.py

## documents_to_text.py
import PyPDF2, docx, textract, os

def getOutfileName(infile):
    """Return the path of the text file that corresponds to *infile*.

    Only the final extension is replaced, so ``a.b.pdf`` becomes ``a.b.txt``.
    A name without any extension simply gets ``.txt`` appended, and dots in
    directory components are left alone.

    Args:
        infile: Path or file name of the source document.

    Returns:
        The same path with its last extension swapped for ``.txt``.
    """
    # splitext only inspects the last path component, unlike a plain
    # split('.'), which would cut the path at a dot in a directory name.
    base, _ext = os.path.splitext(infile)
    return base + '.txt'

def process_pdf(infile):
    """Extract the text of every page of a PDF into a ``.txt`` file.

    The output path is derived from *infile* with ``getOutfileName`` and the
    text is written UTF-8 encoded, one page after another with no separator.

    Args:
        infile: Path of the PDF file to read.
    """
    with open(infile, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        outFile = getOutfileName(infile)
        with open(outFile, 'w', encoding='utf-8') as outfile:
            # Page indices are zero-based: start at 0 so the first page is
            # not silently dropped (a one-page PDF used to yield no text).
            for pageNum in range(pdfReader.numPages):
                pageObj = pdfReader.getPage(pageNum)
                outfile.write(pageObj.extractText())

def process_docx(infile):
    """Write the paragraph text of a ``.docx`` file to a ``.txt`` file.

    The text of each entry in ``document.paragraphs`` is joined with newlines
    and written once, UTF-8 encoded, to the path given by
    ``getOutfileName(infile)``.

    Args:
        infile: Path of the ``.docx`` file to read.
    """
    document = docx.Document(infile)
    doc_string = '\n'.join(para.text for para in document.paragraphs)

    outFile = getOutfileName(infile)
    # Explicit UTF-8 (as process_pdf does) so non-ASCII text does not depend
    # on the platform's default encoding.
    with open(outFile, 'w', encoding='utf-8') as outfile:
        # Write the joined text exactly once; writing it inside a loop over
        # the paragraphs duplicated the whole document once per paragraph.
        outfile.write(doc_string)

def process_doc(infile):
    """Convert a legacy ``.doc`` file to a ``.txt`` file via textract.

    Whatever ``textract.process`` returns is written unchanged, in binary
    mode, to the path produced by ``getOutfileName(infile)``.

    Args:
        infile: Path of the ``.doc`` file to read.
    """
    # Extract first so that no output file is created if extraction fails.
    extracted = textract.process(infile)

    with open(getOutfileName(infile), 'wb') as sink:
        sink.write(extracted)


# Convert every supported document in the current working directory.
# The first suffix that matches decides which converter handles the file.
for filename in os.listdir('.'):
    for _suffix, _convert in (
        ('.pdf', process_pdf),
        ('.docx', process_docx),
        ('.doc', process_doc),
    ):
        if filename.endswith(_suffix):
            _convert(filename)
            break
	import PyPDF2, docx, textract, os

	def getOutfileName(infile):
		"""Return the path of the text file that corresponds to *infile*.

		Only the final extension is replaced, so ``a.b.pdf`` becomes
		``a.b.txt``. A name without any extension simply gets ``.txt``
		appended, and dots in directory components are left alone.

		Args:
			infile: Path or file name of the source document.

		Returns:
			The same path with its last extension swapped for ``.txt``.
		"""
		# splitext only inspects the last path component, unlike a plain
		# split('.'), which would cut the path at a dot in a directory name.
		base, _ext = os.path.splitext(infile)
		return base + '.txt'

	def process_pdf(infile):
		"""Extract the text of every page of a PDF into a ``.txt`` file.

		The output path is derived from *infile* with ``getOutfileName`` and
		the text is written UTF-8 encoded, page after page, no separator.

		Args:
			infile: Path of the PDF file to read.
		"""
		with open(infile, 'rb') as pdfFileObj:
			pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
			outFile = getOutfileName(infile)
			with open(outFile, 'w', encoding='utf-8') as outfile:
				# Page indices are zero-based: start at 0 so the first
				# page is not silently dropped.
				for pageNum in range(pdfReader.numPages):
					pageObj = pdfReader.getPage(pageNum)
					outfile.write(pageObj.extractText())

	def process_docx(infile):
		"""Write the paragraph text of a ``.docx`` file to a ``.txt`` file.

		The text of each entry in ``document.paragraphs`` is joined with
		newlines and written once, UTF-8 encoded, to the path given by
		``getOutfileName(infile)``.

		Args:
			infile: Path of the ``.docx`` file to read.
		"""
		document = docx.Document(infile)
		doc_string = '\n'.join(para.text for para in document.paragraphs)

		outFile = getOutfileName(infile)
		# Explicit UTF-8 (as process_pdf does) so non-ASCII text does not
		# depend on the platform's default encoding.
		with open(outFile, 'w', encoding='utf-8') as outfile:
			# Write the joined text exactly once; writing it inside a loop
			# over the paragraphs duplicated the document per paragraph.
			outfile.write(doc_string)

	def process_doc(infile):
		"""Convert a legacy ``.doc`` file to a ``.txt`` file via textract.

		Whatever ``textract.process`` returns is written unchanged, in binary
		mode, to the path produced by ``getOutfileName(infile)``.

		Args:
			infile: Path of the ``.doc`` file to read.
		"""
		# Extract first so that no output file is created if extraction fails.
		extracted = textract.process(infile)

		with open(getOutfileName(infile), 'wb') as sink:
			sink.write(extracted)


	# Convert every supported document in the current working directory.
	# The first suffix that matches decides which converter handles the file.
	for filename in os.listdir('.'):
		for _suffix, _convert in (
			('.pdf', process_pdf),
			('.docx', process_docx),
			('.doc', process_doc),
		):
			if filename.endswith(_suffix):
				_convert(filename)
				break