# afro-coder/pdftotext.py

## pdftotext.py
# This removes the manual work of copying the file names.
# You can run this in batches to process multiple files.
# Install the tika library with: pip install --user tika
# The first run will download the Tika server jar.
from tika import parser


# Placeholder path of the PDF to convert; replace it before running.
filename = "path_to_file"


def extract_lines(raw_content):
    """Return the non-empty lines of a parsed Tika result.

    Args:
        raw_content: The mapping returned by ``parser.from_file``. Its
            ``'content'`` entry holds the extracted text, or ``None`` when
            no text could be extracted.

    Returns:
        A list with one string per line of the extracted text, in order,
        with empty lines dropped. Returns an empty list when
        ``raw_content`` is empty or has no ``'content'`` text.
    """
    content = (raw_content or {}).get("content")
    if not content:
        # Nothing was extracted; avoid calling .split() on None.
        return []
    # Split on line breaks and keep only non-empty ('' is falsy) lines.
    return [line for line in content.split("\n") if line]


def main():
    """Parse ``filename`` with Tika and print each non-empty line."""
    # Parse the PDF.
    raw_content = parser.from_file(filename)

    # Possible extension: put the PDFs in a folder, loop through the
    # filenames and store the output in whatever final format is needed.
    for line in extract_lines(raw_content):
        print(line)


if __name__ == "__main__":
    main()