# rkjiwa/pdf_extraction_tika.py

## pdf_extraction_tika.py
# imports
from tika import parser
import os
import datetime

# class for extracting text from PDF files with Apache Tika
class TikaExtract(object):
    """Convert every PDF below a source directory into a flat text file.

    Each PDF is parsed with ``tika.parser.from_file``; newlines and tabs in
    the extracted text are replaced by single spaces and the result is
    written to ``<target directory>/<pdf base name>.txt``.

    All outputs land in one flat target directory, so PDFs that share a base
    name in different sub-directories overwrite each other's output.
    """

    # initialize the object
    def __init__(self, source_directory, target_directory_name):
        """Store the source and target locations.

        Args:
            source_directory: Directory that is walked recursively for PDFs.
            target_directory_name: Output directory, resolved relative to the
                current working directory. A leading slash (as in
                ``'/target_directory/'``) is accepted and ignored.
        """
        self.dir = source_directory
        self.target = str(target_directory_name)

    def _target_directory(self):
        """Return the output directory resolved against the working directory."""
        # Callers pass the target with a leading slash because it used to be
        # glued onto os.getcwd() by string concatenation. Strip it so that
        # os.path.join keeps the result below the working directory, and so
        # that a target without the slash no longer fuses with the cwd name.
        return os.path.join(os.getcwd(), self.target.lstrip("/\\"))

    @staticmethod
    def _flatten_text(content):
        """Return ``content`` with newlines and tabs replaced by spaces.

        ``None`` yields an empty string instead of the literal text "None".
        """
        if content is None:
            return ""
        # project specific - one space per newline/tab; runs are not collapsed
        return str(content).replace("\n", " ").replace("\t", " ")

    # walk through the source directory and convert every pdf found
    def extract_text_from_pdfs_recursively(self):
        """Extract the text of every PDF under ``self.dir`` into ``.txt`` files.

        The target directory is created on demand, i.e. only once the first
        PDF has been parsed. Progress is reported on stdout.

        Returns:
            None
        """
        target_dir = self._target_directory()
        for root, _dirs, files in os.walk(self.dir):
            for file in files:
                stem, ext = os.path.splitext(file)
                # accept '.PDF' as well as '.pdf'
                if ext.lower() != '.pdf':
                    continue
                path_to_pdf = os.path.join(root, file)
                print("Processing " + path_to_pdf)
                # use tika to parse contents from file
                pdf_contents = parser.from_file(path_to_pdf)
                # 'content' may be absent or None when Tika extracts no text
                raw_text = self._flatten_text(pdf_contents.get('content'))
                # exist_ok avoids the check-then-create race
                os.makedirs(target_dir, exist_ok=True)
                # names of the files stay the same, except for the .txt extension
                path_to_txt = os.path.join(target_dir, stem + ".txt")
                # explicit UTF-8: extracted text is arbitrary Unicode and must
                # not depend on the platform's default encoding
                with open(path_to_txt, 'w', encoding='utf-8') as txt_file:
                    print("Writing contents to " + path_to_txt)
                    txt_file.write(raw_text)

# implementation: read PDFs from <cwd>/source_directory/ and write the
# extracted text to <cwd>/target_directory/
tikaextract = TikaExtract(
    target_directory_name='/target_directory/',
    source_directory=os.getcwd() + '/source_directory/',
)
# run the conversion
tikaextract.extract_text_from_pdfs_recursively()
	# imports
	from tika import parser
	import os
	import datetime

	# class for extracting text from PDF files with Apache Tika
	class TikaExtract(object):
	    """Convert every PDF below a source directory into a flat text file.

	    Each PDF is parsed with ``tika.parser.from_file``; newlines and tabs in
	    the extracted text are replaced by single spaces and the result is
	    written to ``<target directory>/<pdf base name>.txt``.

	    All outputs land in one flat target directory, so PDFs that share a
	    base name in different sub-directories overwrite each other's output.
	    """

	    # initialize the object
	    def __init__(self, source_directory, target_directory_name):
	        """Store the source and target locations.

	        Args:
	            source_directory: Directory that is walked recursively for PDFs.
	            target_directory_name: Output directory, resolved relative to
	                the current working directory. A leading slash (as in
	                ``'/target_directory/'``) is accepted and ignored.
	        """
	        self.dir = source_directory
	        self.target = str(target_directory_name)

	    def _target_directory(self):
	        """Return the output directory resolved against the working directory."""
	        # Callers pass the target with a leading slash because it used to be
	        # glued onto os.getcwd() by string concatenation. Strip it so that
	        # os.path.join keeps the result below the working directory, and so
	        # that a target without the slash no longer fuses with the cwd name.
	        return os.path.join(os.getcwd(), self.target.lstrip("/\\"))

	    @staticmethod
	    def _flatten_text(content):
	        """Return ``content`` with newlines and tabs replaced by spaces.

	        ``None`` yields an empty string instead of the literal text "None".
	        """
	        if content is None:
	            return ""
	        # project specific - one space per newline/tab; runs are not collapsed
	        return str(content).replace("\n", " ").replace("\t", " ")

	    # walk through the source directory and convert every pdf found
	    def extract_text_from_pdfs_recursively(self):
	        """Extract the text of every PDF under ``self.dir`` into ``.txt`` files.

	        The target directory is created on demand, i.e. only once the first
	        PDF has been parsed. Progress is reported on stdout.

	        Returns:
	            None
	        """
	        target_dir = self._target_directory()
	        for root, _dirs, files in os.walk(self.dir):
	            for file in files:
	                stem, ext = os.path.splitext(file)
	                # accept '.PDF' as well as '.pdf'
	                if ext.lower() != '.pdf':
	                    continue
	                path_to_pdf = os.path.join(root, file)
	                print("Processing " + path_to_pdf)
	                # use tika to parse contents from file
	                pdf_contents = parser.from_file(path_to_pdf)
	                # 'content' may be absent or None when Tika extracts no text
	                raw_text = self._flatten_text(pdf_contents.get('content'))
	                # exist_ok avoids the check-then-create race
	                os.makedirs(target_dir, exist_ok=True)
	                # names of the files stay the same, except for the .txt extension
	                path_to_txt = os.path.join(target_dir, stem + ".txt")
	                # explicit UTF-8: extracted text is arbitrary Unicode and
	                # must not depend on the platform's default encoding
	                with open(path_to_txt, 'w', encoding='utf-8') as txt_file:
	                    print("Writing contents to " + path_to_txt)
	                    txt_file.write(raw_text)

	# implementation: read PDFs from <cwd>/source_directory/ and write the
	# extracted text to <cwd>/target_directory/
	tikaextract = TikaExtract(
	    target_directory_name='/target_directory/',
	    source_directory=os.getcwd() + '/source_directory/',
	)
	# run the conversion
	tikaextract.extract_text_from_pdfs_recursively()