Created
February 12, 2019 01:03
-
-
Save nickjevershed/fa0dfbe4e33cea2738be9bebf70f534b to your computer and use it in GitHub Desktop.
Fuzzy string matching with PDFs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import PyPDF2 | |
from os import listdir | |
from fuzzywuzzy import fuzz | |
import simplejson as json | |
# Names of the PDF documents to analyse. Only entries ending in ".pdf" are
# kept: anything else found in docs/ (a sub-directory, a .DS_Store file, ...)
# is not a PDF and would be handed to the PDF reader further down.
# listdir() returns entries in arbitrary order, so the names are sorted to make
# the order of the generated reports reproducible.
files = sorted(name for name in listdir("docs/") if name.lower().endswith(".pdf"))

# Text every document is compared against. Newlines are removed (nothing is
# inserted in their place) so the template becomes one continuous line.
with open('template.txt', 'r') as f:
    template = f.read().replace("\n", "")
# Phrase used for the single-line match. It is written in lower case because
# it is compared against lower-cased PDF text.
KEY_PHRASE = "unfairly target retirees who have worked hard and sacrificed for their retirement"


def _xmp_creation_date(reader):
    """Return the date part of the XMP creation date, or None if unavailable.

    ``reader`` is the PyPDF2 reader of an open PDF. The XMP create date is
    converted to text and everything after the first space is dropped.
    None is returned both when the PDF carries no XMP metadata at all and
    when the metadata has no create date.
    """
    xmp = reader.getXmpMetadata()
    # getXmpMetadata() yields None for a PDF without an XMP packet; guard it
    # so such a document does not abort the whole run.
    created = xmp.xmp_createDate if xmp is not None else None
    if created is None:
        return None
    return str(created).split(" ")[0]


def _info_creation_date(reader):
    """Return the '/CreationDate' of the document info, or None if missing.

    When the value contains the ``D:`` marker, only the run of digits that
    follows it is returned (at most 14 digits), so any trailing timezone
    suffix is dropped whatever its form. A value without ``D:`` is returned
    unchanged.
    """
    info = reader.getDocumentInfo()
    # getDocumentInfo() yields None when the PDF has no info dictionary.
    if info is None or '/CreationDate' not in info:
        return None

    date_text = info['/CreationDate']
    if "D:" not in date_text:
        return date_text

    stamp = date_text.split('D:')[1]
    # Keep the leading digits only instead of cutting a fixed number of
    # characters off the end: the suffix after the digits varies in length
    # (or is absent), and a fixed cut would then remove real digits.
    end = 0
    while end < min(len(stamp), 14) and stamp[end].isdigit():
        end += 1
    return stamp[:end]


def _load_document(path):
    """Read the PDF at ``path``.

    Returns a tuple ``(text, created_xmp, created_info)`` where ``text`` is
    the lower-cased concatenation of the text extracted from every page and
    the two dates come from the XMP metadata and the document info.
    """
    with open(path, 'rb') as pdf_file:
        reader = PyPDF2.PdfFileReader(pdf_file)
        created_xmp = _xmp_creation_date(reader)
        created_info = _info_creation_date(reader)
        # Page text is read while the file is still open.
        text = "".join(
            reader.getPage(page_no).extractText()
            for page_no in range(reader.numPages)
        )
    return text.lower(), created_xmp, created_info


def _score_documents(documents, needle):
    """Score every loaded document against ``needle``.

    ``documents`` is a sequence of ``(filename, text, created_xmp,
    created_info)`` tuples. The file name and the value returned by
    ``fuzz.partial_ratio(needle, text)`` are printed for each document, and
    one result dictionary per document is returned, in input order.
    """
    rows = []
    for filename, text, created_xmp, created_info in documents:
        print(filename)
        ratio = fuzz.partial_ratio(needle, text)
        print(ratio)
        rows.append({
            "filename": filename,
            "ratio": ratio,
            "createDateXMP": created_xmp,
            # The key spelling "createDatInfo" is kept as-is so anything
            # already reading the generated JSON keeps working.
            "createDatInfo": created_info,
        })
    return rows


# Parse every PDF once; both passes below reuse the extracted text and dates.
documents = [(filename,) + _load_document("docs/" + filename) for filename in files]

# overall matches
overall = _score_documents(documents, template.lower())
with open('overall.json', 'w') as out:
    out.write(json.dumps(overall, indent=4))

# match just one line
partial = _score_documents(documents, KEY_PHRASE)
with open('partial.json', 'w') as out:
    out.write(json.dumps(partial, indent=4))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
I want to formally register my opposition to scrap refundable franking credits and the attack on full tax refunds. | |
This policy will: | |
- Unfairly target retirees who have worked hard and sacrificed for their retirement. | |
- Unfairly hit many people on low incomes, including hundreds of thousands of retirees that receive full tax refunds and with 97% of people who receive these refunds having incomes below $87,000. | |
- Unfairly target retirees on low incomes who will now face double tax, while those on higher incomes will be able to reduce their tax bill by the full value of overpaid tax. | |
The impact of the retirement tax has not been thought through. It will directly harm my financial security. It should be abandoned. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment