nickjevershed/read_pdfs.py

## read_pdfs.py
import PyPDF2
from os import listdir
from fuzzywuzzy import fuzz
import simplejson as json

# Names of every entry in docs/; each one is later opened as a PDF.
files = listdir("docs/")

# Load the form-letter template and collapse it onto a single line. The
# newline characters are removed outright (no space is put in their place).
with open('template.txt', 'r') as f:
    template_lines = f.read().split("\n")
template = "".join(template_lines)

# print(template)

# Helpers shared by the two matching passes below.

def _read_pdf(path):
    """Read the PDF at *path* and return ``(text, create_date_xmp, create_date_info)``.

    text -- the extracted text of every page, concatenated in page order.
    create_date_xmp -- ``str()`` of the XMP create date, cut at the first
        space. This is the string ``"None"`` when the XMP metadata has no
        create date, and also when the PDF has no XMP metadata at all.
    create_date_info -- taken from the ``/CreationDate`` entry of the
        document info dictionary, or ``None`` when there is no such entry
        (or no info dictionary).
    """
    with open(path, 'rb') as pdf_file:
        reader = PyPDF2.PdfFileReader(pdf_file)

        # getXmpMetadata() returns None for a PDF without XMP metadata;
        # dereferencing it unguarded would abort the whole run on that file.
        xmp = reader.getXmpMetadata()
        xmp_created = xmp.xmp_createDate if xmp is not None else None
        create_date_xmp = str(xmp_created).split(" ")[0]

        # getDocumentInfo() likewise returns None when the PDF has no info
        # dictionary, so check for that before testing membership.
        info = reader.getDocumentInfo()
        if info is not None and '/CreationDate' in info:
            date_text = info['/CreationDate']
            if "D:" in date_text:
                # Keep what follows "D:" minus its last 7 characters.
                # NOTE(review): presumably this strips a trailing UTC offset
                # such as +11'00'; a value without a 7-character suffix would
                # be cut short -- confirm against the documents being read.
                create_date_info = date_text.split('D:')[1][:-7]
            else:
                create_date_info = date_text
        else:
            create_date_info = None

        # Pages must be extracted while the file is still open.
        text = "".join(
            reader.getPage(page_number).extractText()
            for page_number in range(reader.numPages)
        )

    return text, create_date_xmp, create_date_info


def _match_record(filename, ratio, create_date_xmp, create_date_info):
    """Build one row of the JSON output for *filename*."""
    # The "createDatInfo" spelling is what this script has always written;
    # it is kept unchanged so the output format stays the same.
    return {"filename":filename,"ratio":ratio, "createDateXMP":create_date_xmp, "createDatInfo":create_date_info}


# overall matches

overall = []

# (filename, lower-cased text, XMP date, info date) for every PDF, kept so the
# single-line pass below does not have to open and extract each file again.
_documents = []

# Loop-invariant: the template is the same for every file.
_template_lower = template.lower()

for filename in files:
    print(filename)
    pdf_text, create_date_xmp, create_date_info = _read_pdf("docs/" + filename)
    pdf_lower = pdf_text.lower()
    _documents.append((filename, pdf_lower, create_date_xmp, create_date_info))

    ratio = fuzz.partial_ratio(_template_lower, pdf_lower)
    print(ratio)
    overall.append(_match_record(filename, ratio, create_date_xmp, create_date_info))

with open('overall.json', 'w') as out:
    out.write(json.dumps(overall, indent=4))

# match just one line

partial = []

for filename, pdf_lower, create_date_xmp, create_date_info in _documents:
    print(filename)
    ratio = fuzz.partial_ratio("unfairly target retirees who have worked hard and sacrificed for their retirement", pdf_lower)
    print(ratio)
    partial.append(_match_record(filename, ratio, create_date_xmp, create_date_info))

with open('partial.json', 'w') as out:
    out.write(json.dumps(partial, indent=4))

## template.txt
I want to formally register my opposition to scrap refundable franking credits and the attack on full tax refunds.

This policy will:

- Unfairly target retirees who have worked hard and sacrificed for their retirement.

- Unfairly hit many people on low incomes, including hundreds of thousands of retirees that receive full tax refunds and with 97% of people who receive these refunds having incomes below $87,000.

- Unfairly target retirees on low incomes who will now face double tax, while those on higher incomes will be able to reduce their tax bill by the full value of overpaid tax.

The impact of the retirement tax has not been thought through. It will directly harm my financial security. It should be abandoned.
	import PyPDF2
	from os import listdir
	from fuzzywuzzy import fuzz
	import simplejson as json

	# Names of every entry in docs/; each one is later opened as a PDF.
	files = listdir("docs/")

	# Load the form-letter template as a single line: newlines are removed
	# outright, with no space put in their place.
	with open('template.txt', 'r') as f:
		template=f.read().replace("\n", "")

	# print(template)

	# Helpers shared by the two matching passes below.

	def _read_pdf(path):
		"""Read the PDF at *path* and return ``(text, create_date_xmp, create_date_info)``.

		text -- the extracted text of every page, concatenated in page order.
		create_date_xmp -- ``str()`` of the XMP create date, cut at the first
			space. This is the string ``"None"`` when the XMP metadata has no
			create date, and also when the PDF has no XMP metadata at all.
		create_date_info -- taken from the ``/CreationDate`` entry of the
			document info dictionary, or ``None`` when there is no such entry
			(or no info dictionary).
		"""
		with open(path, 'rb') as pdf_file:
			reader = PyPDF2.PdfFileReader(pdf_file)

			# getXmpMetadata() returns None for a PDF without XMP metadata;
			# dereferencing it unguarded would abort the whole run on that file.
			xmp = reader.getXmpMetadata()
			xmp_created = xmp.xmp_createDate if xmp is not None else None
			create_date_xmp = str(xmp_created).split(" ")[0]

			# getDocumentInfo() likewise returns None when the PDF has no info
			# dictionary, so check for that before testing membership.
			info = reader.getDocumentInfo()
			if info is not None and '/CreationDate' in info:
				date_text = info['/CreationDate']
				if "D:" in date_text:
					# Keep what follows "D:" minus its last 7 characters.
					# NOTE(review): presumably this strips a trailing UTC offset
					# such as +11'00'; a value without a 7-character suffix would
					# be cut short -- confirm against the documents being read.
					create_date_info = date_text.split('D:')[1][:-7]
				else:
					create_date_info = date_text
			else:
				create_date_info = None

			# Pages must be extracted while the file is still open.
			text = "".join(
				reader.getPage(page_number).extractText()
				for page_number in range(reader.numPages)
			)

		return text, create_date_xmp, create_date_info


	def _match_record(filename, ratio, create_date_xmp, create_date_info):
		"""Build one row of the JSON output for *filename*."""
		# The "createDatInfo" spelling is what this script has always written;
		# it is kept unchanged so the output format stays the same.
		return {"filename":filename,"ratio":ratio, "createDateXMP":create_date_xmp, "createDatInfo":create_date_info}


	# overall matches

	overall = []

	# (filename, lower-cased text, XMP date, info date) for every PDF, kept so the
	# single-line pass below does not have to open and extract each file again.
	_documents = []

	# Loop-invariant: the template is the same for every file.
	_template_lower = template.lower()

	for filename in files:
		print(filename)
		pdf_text, create_date_xmp, create_date_info = _read_pdf("docs/" + filename)
		pdf_lower = pdf_text.lower()
		_documents.append((filename, pdf_lower, create_date_xmp, create_date_info))

		ratio = fuzz.partial_ratio(_template_lower, pdf_lower)
		print(ratio)
		overall.append(_match_record(filename, ratio, create_date_xmp, create_date_info))

	with open('overall.json', 'w') as out:
		out.write(json.dumps(overall, indent=4))

	# match just one line

	partial = []

	for filename, pdf_lower, create_date_xmp, create_date_info in _documents:
		print(filename)
		ratio = fuzz.partial_ratio("unfairly target retirees who have worked hard and sacrificed for their retirement", pdf_lower)
		print(ratio)
		partial.append(_match_record(filename, ratio, create_date_xmp, create_date_info))

	with open('partial.json', 'w') as out:
		out.write(json.dumps(partial, indent=4))
	I want to formally register my opposition to scrap refundable franking credits and the attack on full tax refunds.

	This policy will:

	- Unfairly target retirees who have worked hard and sacrificed for their retirement.

	- Unfairly hit many people on low incomes, including hundreds of thousands of retirees that receive full tax refunds and with 97% of people who receive these refunds having incomes below $87,000.

	- Unfairly target retirees on low incomes who will now face double tax, while those on higher incomes will be able to reduce their tax bill by the full value of overpaid tax.

	The impact of the retirement tax has not been thought through. It will directly harm my financial security. It should be abandoned.