Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Extract text contents of PDF files recursively
from tika import parser
import os
def extract_text_from_pdfs_recursively(dir):
    """Write the text of every PDF found under *dir* to a sibling ``.txt`` file.

    The tree rooted at *dir* is traversed with ``os.walk``. Every file whose
    extension is ``.pdf`` (compared case-insensitively) is handed to
    ``parser.from_file`` and the extracted text is written next to the PDF,
    using the same path stem with a ``.txt`` extension. The output file is
    opened in ``'w'`` mode, so an existing file of that name is overwritten.

    Args:
        dir: Path of the directory to scan recursively. (The name shadows
            the ``dir`` builtin; it is kept so keyword callers keep working.)

    Returns:
        None. Progress is reported on stdout via ``print``.
    """
    for root, _subdirs, files in os.walk(dir):
        for file_name in files:
            path_to_pdf = os.path.join(root, file_name)
            stem, ext = os.path.splitext(path_to_pdf)
            # Compare case-insensitively so "REPORT.PDF" is not skipped.
            if ext.lower() != '.pdf':
                continue
            print("Processing " + path_to_pdf)
            pdf_contents = parser.from_file(path_to_pdf)
            # The parser can report no text at all (e.g. image-only PDFs);
            # fall back to an empty string so write() does not raise TypeError.
            text = pdf_contents.get('content') or ''
            path_to_txt = stem + '.txt'
            # Explicit UTF-8: the platform default encoding may be unable to
            # represent the extracted characters (UnicodeEncodeError).
            with open(path_to_txt, 'w', encoding='utf-8') as txt_file:
                print("Writing contents to " + path_to_txt)
                txt_file.write(text)
# Script entry point: when run directly (not imported), convert every PDF
# found beneath the current working directory.
if "__main__" == __name__:
    start_dir = os.getcwd()
    extract_text_from_pdfs_recursively(start_dir)
@yuripiffer

This comment has been minimized.

Copy link

@yuripiffer yuripiffer commented Nov 19, 2020

Thank you so much!!!

@adindarizky99

This comment has been minimized.

Copy link

@adindarizky99 adindarizky99 commented Aug 13, 2021

Thank you
It really helped me.

@KTBL-JaschaJung

This comment has been minimized.

Copy link

@KTBL-JaschaJung KTBL-JaschaJung commented Nov 15, 2021

Thank you, this was very helpful.
I ran into a UnicodeEncodeError, but could resolve it by specifying the encoding:
with open(path_to_txt, 'w', encoding="utf-8") as txt_file:

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment