Skip to content

Instantly share code, notes, and snippets.

@mz1991
Last active December 4, 2017 14:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mz1991/97ee3f7045c8fd0e6f21ab14f9e588c7 to your computer and use it in GitHub Desktop.
Save mz1991/97ee3f7045c8fd0e6f21ab14f9e588c7 to your computer and use it in GitHub Desktop.
import PyPDF2 # pip install PyPDF2
import docx # pip install python-docx
def get_pdf_text(file_path):
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A single string: the extracted text of each page, in page order,
        separated by ``"\n"``. An empty string if the PDF has no pages.
    """
    # NOTE(review): PdfFileReader / numPages / getPage / extractText are the
    # legacy PyPDF2 names -- confirm the installed PyPDF2 still provides them.
    # The context manager guarantees the file handle is released even if
    # PyPDF2 raises while parsing; all text is extracted before it closes
    # because the reader pulls page data from the open file.
    with open(file_path, 'rb') as pdf_file_obj:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file_obj)
        text = [
            pdf_reader.getPage(i).extractText()
            for i in range(pdf_reader.numPages)
        ]
    return "\n".join(text)
def get_docx_text(file_path):
    """Return the text of a .docx file, one paragraph per line.

    Args:
        file_path: Path of the Word document to open with python-docx.

    Returns:
        The ``text`` of each entry in the document's ``paragraphs``, in
        order, joined with ``"\n"``.
    """
    document = docx.Document(file_path)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
def get_txt_text(file_path, encoding=None):
    """Return the full contents of a plain-text file.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding passed to ``open``. The default ``None``
            keeps the platform's default encoding; pass e.g. ``"utf-8"``
            to read the file the same way on every platform.

    Returns:
        The entire file contents as a single string.
    """
    with open(file_path, "r", encoding=encoding) as f:
        return f.read()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment