Created November 10, 2017 14:44
-
-
Save jaklinger/a64eb5535f021da4fcf2f9f0b2c8c909 to your computer and use it in GitHub Desktop.
Matching titles to free text in a PDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Titles of the focus areas to look for in the PDF text, one per entry.
focus_areas = [
    'Education Innovation and Technology',
    'Health Information Technology and Bioinformatics',
    'Public Health, Non-Communicable Diseases and Wellness',
    'Biotechnology and Genomics',
    'Water Management and Economics',
    'Solar and Alternative Energy Technology Systems',
    'Space Sciences',
    'Cubesats and Nanosatellites',
    'Cybersecurity',
    'Semiconductor Process Development',
    'Robotics and Artificial Intelligence',
    'Smart City Applications and Solutions',
    'Architecture and Urban Design',
    'Arabic Digital Technology',
    'Financial Services Technology',
    'Petroleum Geosciences',
    'Internet of Things and Big Data',
    'Additive Manufacturing (3D Printing)',
    'Advanced Building and Construction Materials',
    'Food Security',
    'Transportation Logistics, Analytics and Security',
    'Aerospace Advanced Materials, Manufacturing, Maintenance and Testing',
    'Commercial Unmanned Aerial Vehicles',
    'Autonomous Vehicles',
]
import json

from fuzzywuzzy import fuzz
from fuzzywuzzy import process as fuzzy_process
import textract
# Load the PDF and extract its raw text (the code below treats it as bytes).
# NOTE: hard-coded local path; point this at your own copy of the document.
file_name = '/Users/jklinger/Downloads/Science Technology and Innovation Policy (1) (1) (1).pdf'
pdftext = textract.process(file_name)

# Marker phrases that bracket the region holding the focus-area descriptions
start_word = b'Education Innovation'
end_word = b'Talent'

# Locate the region. `find` returns -1 when a marker is absent, which would
# silently slice the wrong part of the document, so fail loudly instead.
first = pdftext.find(start_word)
if first == -1:
    raise ValueError("Start marker {!r} not found in {}".format(start_word, file_name))
# Search for the end marker only after the start marker, so an earlier
# occurrence of it cannot yield an empty or inverted region.
last = pdftext.find(end_word, first)
if last == -1:
    raise ValueError("End marker {!r} not found after start marker in {}".format(end_word, file_name))

# Keep only the long paragraphs (at least 300 bytes) from this region;
# shorter chunks are skipped.
focus_descriptions = [
    txt.decode()
    for txt in pdftext[first:last].split(b'\n\n')
    if len(txt) >= 300
]
def match_title_to_description(focus_areas, focus_descriptions,
                               context=None, threshold=50):
    """Greedily pair focus-area titles with their best-matching descriptions.

    On every round each still-unmatched title is scored against the
    remaining descriptions with fuzzywuzzy's ``token_set_ratio``; the single
    best (title, description) pair of the round is accepted and its
    description is withdrawn from further rounds.

    Args:
        focus_areas: Titles to match.
        focus_descriptions: Candidate description texts.
        context: Number of leading characters of each (lower-cased)
            description to compare against; ``None`` uses the whole text.
        threshold: A round's best score must be strictly greater than this
            for the pair to be accepted; otherwise matching stops.

    Returns:
        dict mapping each matched title to the lower-cased, possibly
        truncated description it was matched with. Titles that could not be
        matched are absent.
    """
    matches = {}  # The output
    # Lower-cased candidates, truncated to `context` characters
    short_descriptions = [t.lower()[0:context] for t in focus_descriptions]

    # Stop once every title is paired or no candidate text is left. The
    # second condition matters: with an empty choice list `extractOne` has no
    # (choice, score) pair to return, so the unpacking below would fail.
    while len(matches) < len(focus_areas) and short_descriptions:
        best_score = 0
        best_f = None
        best_d = None
        # Find the best (title, description) pair of this round
        for f in focus_areas:
            # Don't repeat matched focus areas
            if f in matches:
                continue
            d, score = fuzzy_process.extractOne(f.lower(), short_descriptions,
                                                scorer=fuzz.token_set_ratio)
            # Strict comparison: on ties the earlier title wins
            if score > best_score:
                best_d = d
                best_f = f
                best_score = score
        # Nothing scored above zero, or the best score is not good enough
        if best_f is None or best_score <= threshold:
            break
        matches[best_f] = best_d
        short_descriptions.remove(best_d)
    return matches
# Match in successive passes, loosening the criteria each time: first against
# only the opening 75 characters of each description, then against the full
# text, and finally accepting any non-zero score.
param_set = [dict(context=75,threshold=50),
             dict(context=None,threshold=50),
             dict(context=None,threshold=0)]
matches = {}
for params in param_set:
    # Focus areas that still have no description
    _focus_areas = [f for f in focus_areas if f not in matches]
    if not _focus_areas:
        break
    # Descriptions not yet claimed: `matches` stores lower-cased (possibly
    # truncated) fragments, so compare them against the lower-cased originals
    found = list(matches.values())
    _descriptions = [d for d in focus_descriptions
                     if not any(x in d.lower() for x in found)]
    # Nothing left to match against, so further passes cannot add anything
    if not _descriptions:
        break
    # Perform the matching
    _matches = match_title_to_description(_focus_areas,_descriptions,**params)
    matches.update(_matches)
# Map each matched (lower-cased, possibly truncated) fragment back to the
# full original description that contains it. If several originals contain
# the fragment, the last one in `focus_descriptions` is kept.
_lowered_originals = [(text.lower(), text) for text in focus_descriptions]
output = {}
for title, fragment in matches.items():
    containing = [text for lowered, text in _lowered_originals
                  if fragment in lowered]
    if containing:
        output[title] = containing[-1]
# Write the title -> description mapping as real JSON. (`str(dict)` yields a
# Python repr with single-quoted strings, which JSON parsers reject.)
with open("focus_areas.json", "w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=2)
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.