Skip to content

Instantly share code, notes, and snippets.

@kenzo0107
Created May 28, 2024 11:57
Show Gist options
  • Save kenzo0107/456439de57b3640c053cf369ca42f358 to your computer and use it in GitHub Desktop.
Save kenzo0107/456439de57b3640c053cf369ca42f358 to your computer and use it in GitHub Desktop.
import os
import sys
from docx import Document
from pypdf import PdfReader
# .docx
def read_docx(filepath):
    """Return the text of a .docx file, one paragraph per line.

    Opens ``filepath`` with ``Document`` and joins the ``text`` of each
    entry in ``doc.paragraphs`` with newline characters.
    """
    document = Document(filepath)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
# .pdf
def read_pdf(filepath):
    """Return the extracted text of every page of a PDF as one string.

    Page texts are concatenated in page order with no separator inserted
    between pages, exactly as each page's ``extract_text()`` returns them.
    """
    reader = PdfReader(filepath)
    # join() assembles the result in a single pass; repeated ``+=`` would
    # re-copy the accumulated text once per page.
    return "".join(page.extract_text() for page in reader.pages)
# .txt, .md, and any other extension (read as plain text)
def read_txt(filepath):
    """Return the full contents of a text file decoded as UTF-8.

    The encoding is passed explicitly so the result does not depend on the
    platform's locale default (which is not UTF-8 on every system).

    Raises:
        OSError: if ``filepath`` cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    with open(filepath, "r", encoding="utf-8") as file:
        return file.read()
def main(filepath):
    """Print the text content of ``filepath`` to standard output.

    The reader is selected from the file extension: ``.docx`` uses
    ``read_docx``, ``.pdf`` uses ``read_pdf``, and every other extension
    (including none) is read as plain text with ``read_txt``.
    """
    _, ext = os.path.splitext(filepath)
    # Match case-insensitively so names such as "REPORT.PDF" or "Notes.Docx"
    # reach the right reader instead of falling through to the text reader.
    ext = ext.lower()
    if ext == ".docx":
        text = read_docx(filepath)
    elif ext == ".pdf":
        text = read_pdf(filepath)
    else:
        text = read_txt(filepath)
    print(text)
if __name__ == "__main__":
    # Exit with a usage message (written to stderr, non-zero status) rather
    # than an IndexError traceback when the path argument is missing.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} FILE")
    main(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment