Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Extract text contents of PDF files recursively
from tika import parser
import os
def extract_text_from_pdfs_recursively(dir):
    """Write a ``.txt`` file next to every PDF found under *dir*.

    Walks *dir* and all of its subdirectories. Every file whose extension is
    ``.pdf`` (compared case-insensitively) is handed to ``parser.from_file``
    and the extracted text is written to a file with the same stem and a
    ``.txt`` extension, overwriting any file already at that path.

    Args:
        dir: Root directory to search. The name shadows the ``dir`` builtin
            but is kept so that existing keyword callers keep working.

    Returns:
        None. Results are written to disk and progress is printed to stdout.
    """
    for root, _subdirs, filenames in os.walk(dir):
        for filename in filenames:
            path_to_pdf = os.path.join(root, filename)
            stem, ext = os.path.splitext(path_to_pdf)
            # Case-insensitive so that files such as "REPORT.PDF" are included.
            if ext.lower() != '.pdf':
                continue

            print("Processing " + path_to_pdf)
            pdf_contents = parser.from_file(path_to_pdf)
            # The parser result may carry no text at all (missing or None
            # 'content', e.g. for image-only PDFs); write an empty file rather
            # than crashing inside write().
            text = pdf_contents.get('content') or ''

            path_to_txt = stem + '.txt'
            # Explicit UTF-8: extracted text is frequently non-ASCII and the
            # platform default encoding may be unable to represent it.
            with open(path_to_txt, 'w', encoding='utf-8') as txt_file:
                print("Writing contents to " + path_to_txt)
                txt_file.write(text)
# Script entry point: convert every PDF beneath the current working directory.
if __name__ == "__main__":
    working_directory = os.getcwd()
    extract_text_from_pdfs_recursively(working_directory)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.