Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Extract text contents of PDF files recursively
import os
import sys

from tika import parser
def extract_text_from_pdfs_recursively(dir):
    """Extract the text of every PDF under ``dir`` into sibling ``.txt`` files.

    Walks the directory tree rooted at ``dir`` with ``os.walk``. Each file
    whose extension is ``.pdf`` (compared case-insensitively) is parsed with
    ``parser.from_file`` and the extracted text is written next to the PDF,
    using the same path with the extension replaced by ``.txt``. The output
    file is opened in ``'w'`` mode, so an existing ``.txt`` is overwritten.

    Args:
        dir: Root directory to search. The name shadows the ``dir`` builtin
            but is kept so existing keyword callers keep working.

    Returns:
        None. Progress is reported with ``print``.
    """
    for root, _dirs, files in os.walk(dir):
        for file in files:
            path_to_pdf = os.path.join(root, file)
            stem, ext = os.path.splitext(path_to_pdf)
            if ext.lower() != '.pdf':
                continue
            print("Processing " + path_to_pdf)
            pdf_contents = parser.from_file(path_to_pdf)
            # NOTE(review): the parser result is expected to be a dict whose
            # 'content' entry may be None when no text could be extracted;
            # fall back to an empty string so the write below cannot fail.
            text = pdf_contents.get('content') or ''
            path_to_txt = stem + '.txt'
            # Explicit UTF-8 so non-ASCII text does not depend on the locale.
            with open(path_to_txt, 'w', encoding='utf-8') as txt_file:
                print("Writing contents to " + path_to_txt)
                txt_file.write(text)
if __name__ == "__main__":
    # Root directory comes from the first command-line argument; when none is
    # given, fall back to the current working directory.
    extract_text_from_pdfs_recursively(
        sys.argv[1] if len(sys.argv) > 1 else os.getcwd()
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.