Skip to content

Instantly share code, notes, and snippets.

@rkjiwa
Last active January 3, 2020 15:52
Show Gist options
  • Save rkjiwa/08fee7f22728b972c88470326c39e8bf to your computer and use it in GitHub Desktop.
Save rkjiwa/08fee7f22728b972c88470326c39e8bf to your computer and use it in GitHub Desktop.
Script to recursively extract the text of every PDF in a directory tree using Apache Tika.
# imports
from tika import parser
import os
import datetime
# class for extracting tika files
class TikaExtract(object):
    """Convert every PDF below a source directory into a plain-text file.

    Text is extracted with Apache Tika (``tika.parser.from_file``) and one
    ``.txt`` file per PDF is written into a target directory that lives
    under the current working directory.

    Attributes:
        dir: Root directory that is walked recursively looking for PDFs.
        target: Target directory name exactly as supplied, coerced to ``str``.
    """

    def __init__(self, source_directory, target_directory_name):
        """Remember where to read PDFs from and where to write text files.

        Args:
            source_directory: Directory searched recursively for PDFs.
            target_directory_name: Output directory name, resolved relative
                to the current working directory. Leading and trailing path
                separators are accepted but not required.
        """
        self.dir = source_directory
        self.target = str(target_directory_name)

    @staticmethod
    def _clean_text(content):
        """Return *content* with each newline and tab replaced by a space.

        ``None`` (no extractable text) becomes an empty string rather than
        the literal text ``"None"``.
        """
        if content is None:
            return ""
        # Project specific: flatten the text onto a single line. Runs of
        # newlines are NOT collapsed; each one becomes its own space.
        return str(content).replace("\n", " ").replace("\t", " ")

    def _target_directory(self):
        """Return the absolute output directory below the working directory."""
        # Strip separators so '/out/', 'out/' and 'out' all resolve to
        # <cwd>/out instead of depending on the caller's slashes.
        relative = self.target.strip("/" + os.sep)
        return os.path.join(os.getcwd(), relative)

    def extract_text_from_pdfs_recursively(self):
        """Walk ``self.dir`` and write a ``.txt`` file for every PDF found.

        Each output file keeps the PDF's base name with a ``.txt``
        extension. All output goes into one flat target directory, so PDFs
        that share a base name in different sub-directories overwrite each
        other. The target directory is created only once a PDF is found.

        Returns:
            None. Progress is reported with ``print``.
        """
        target_dir = self._target_directory()
        for root, _dirs, files in os.walk(self.dir):
            for file_name in files:
                base_name, ext = os.path.splitext(file_name)
                # Accept '.PDF', '.Pdf', ... as well as '.pdf'.
                if ext.lower() != '.pdf':
                    continue

                path_to_pdf = os.path.join(root, file_name)
                print("Processing " + path_to_pdf)

                # Use Tika to parse the contents of the file. 'content' may
                # be None or absent when no text could be extracted.
                pdf_contents = parser.from_file(path_to_pdf)
                raw_text = self._clean_text(pdf_contents.get('content'))

                # exist_ok avoids the check-then-create race.
                os.makedirs(target_dir, exist_ok=True)

                path_to_txt = os.path.join(target_dir, base_name + ".txt")
                # Explicit UTF-8 so non-ASCII text does not depend on the
                # platform's default encoding.
                with open(path_to_txt, 'w', encoding='utf-8') as txt_file:
                    print("Writing contents to " + path_to_txt)
                    txt_file.write(raw_text)
# Driver: read PDFs from <cwd>/source_directory/ and write the extracted
# text into <cwd>/target_directory/.
tikaextract = TikaExtract(
    target_directory_name='/target_directory/',
    source_directory=os.getcwd() + '/source_directory/',
)
# Kick off the recursive conversion.
tikaextract.extract_text_from_pdfs_recursively()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment