-
-
Save kenzo0107/456439de57b3640c053cf369ca42f358 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
from docx import Document | |
from pypdf import PdfReader | |
# .docx (Word documents)
def read_docx(filepath):
    """Return the text of a .docx file as a single newline-joined string.

    Only the document's ``paragraphs`` collection is read; each paragraph's
    ``text`` becomes one line of the result.
    """
    document = Document(filepath)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)
def read_pdf(filepath):
    """Return the extracted text of every page of a PDF, in page order.

    The per-page results of ``extract_text()`` are concatenated directly,
    with no separator inserted between pages.
    """
    reader = PdfReader(filepath)
    # Join once instead of growing a string with += on every page.
    return "".join(page.extract_text() for page in reader.pages)
# .txt, .md, and any other plain-text format
def read_txt(filepath):
    """Return the entire content of a plain-text file as a string.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's locale encoding.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        return file.read()
def main(filepath):
    """Print the text content of the file at *filepath* to standard output.

    The reader is chosen from the file extension: ``.docx`` uses
    ``read_docx``, ``.pdf`` uses ``read_pdf``, and every other extension
    (or none) is read as plain text with ``read_txt``.
    """
    _, ext = os.path.splitext(filepath)
    # Compare case-insensitively so names like "REPORT.PDF" are not
    # mistakenly handed to the plain-text reader.
    ext = ext.lower()
    if ext == '.docx':
        t = read_docx(filepath)
    elif ext == '.pdf':
        t = read_pdf(filepath)
    else:
        t = read_txt(filepath)
    print(t)
if __name__ == "__main__":
    # Exit with a usage message (written to stderr, non-zero status) instead
    # of an IndexError traceback when no path argument is supplied.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <filepath>")
    filepath = sys.argv[1]
    main(filepath)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment