@jaklinger
Created November 10, 2017 14:44
Matching titles to free text in a PDF
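The snippet below pulls the focus-area descriptions out of a policy PDF with textract, then greedily assigns each focus-area title to its best-matching description using fuzzywuzzy, loosening the matching parameters over successive passes. The workhorse is the token_set_ratio scorer, which measures token overlap and so tolerates descriptions much longer than the titles; a quick illustration on hypothetical strings:

from fuzzywuzzy import fuzz
# A title whose tokens all appear in a longer text scores 100,
# however much extra text surrounds them:
fuzz.token_set_ratio('Cybersecurity',
                     'the cybersecurity focus area covers network defence')  # -> 100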
from fuzzywuzzy import process as fuzzy_process
from fuzzywuzzy import fuzz
import textract
import json

# Focus-area titles to match against the PDF text
focus_areas = ['Education Innovation and Technology',
               'Health Information Technology and Bioinformatics',
               'Public Health, Non-Communicable Diseases and Wellness',
               'Biotechnology and Genomics',
               'Water Management and Economics',
               'Solar and Alternative Energy Technology Systems',
               'Space Sciences',
               'Cubesats and Nanosatellites',
               'Cybersecurity',
               'Semiconductor Process Development',
               'Robotics and Artificial Intelligence',
               'Smart City Applications and Solutions',
               'Architecture and Urban Design',
               'Arabic Digital Technology',
               'Financial Services Technology',
               'Petroleum Geosciences',
               'Internet of Things and Big Data',
               'Additive Manufacturing (3D Printing)',
               'Advanced Building and Construction Materials',
               'Food Security',
               'Transportation Logistics, Analytics and Security',
               'Aerospace Advanced Materials, Manufacturing, Maintenance and Testing',
               'Commercial Unmanned Aerial Vehicles',
               'Autonomous Vehicles']

# Load the PDF; textract returns the extracted text as bytes
file_name = '/Users/jklinger/Downloads/Science Technology and Innovation Policy (1) (1) (1).pdf'
pdftext = textract.process(file_name)
# Keywords indicating the start/end of the region containing the
# focus-area descriptions
start_word = b'Education Innovation'
end_word = b'Talent'
# Find the position of the start/end keywords
first = pdftext.find(start_word)
last = pdftext.find(end_word)
# Extract long texts from this region, assuming paragraphs are separated
# by blank lines and that anything under 300 characters is not a description
focus_descriptions = []
for txt in pdftext[first:last].split(b'\n\n'):
    if len(txt) < 300:
        continue
    focus_descriptions.append(txt.decode())
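# Optional sanity check: there should be roughly one long description per
# focus-area title; if the counts diverge, the start/end keywords or the
# 300-character cutoff need tuning
# print(len(focus_descriptions), len(focus_areas))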
def match_title_to_description(focus_areas, focus_descriptions,
                               context=None, threshold=50):
    '''Greedily match each focus-area title to its best-scoring description.

    Titles are assigned best-score-first, so strong matches claim their
    description before weaker ones. `context` truncates each description
    to its first `context` characters before scoring, and `threshold` is
    the minimum acceptable fuzzy score (0-100).
    '''
    matches = {}  # The output: title --> (lowercased, truncated) description
    # Generate lowercased descriptions of length `context`
    short_descriptions = [t.lower()[0:context] for t in focus_descriptions]
    _focus_areas = []  # Matched focus areas, for book-keeping
    # Iterate until every title is matched, or no match clears the threshold
    while len(matches) < len(focus_areas):
        # Stop if the descriptions have run out
        if len(short_descriptions) == 0:
            break
        # Find the best focus-description match in this iteration
        best_score = 0
        best_f = None
        best_d = None
        for f in focus_areas:
            # Don't repeat matched focus areas
            if f in _focus_areas:
                continue
            d, score = fuzzy_process.extractOne(f.lower(), short_descriptions,
                                                scorer=fuzz.token_set_ratio)
            # If this is the best so far in this iteration, store it
            if score > best_score:
                best_d = d
                best_f = f
                best_score = score
        # Accept the match only if the score is sufficiently good
        if best_score > threshold:
            matches[best_f] = best_d
            _focus_areas.append(best_f)
            short_descriptions.remove(best_d)
        else:
            break
    return matches
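# For example, on hypothetical data a call like
#   match_title_to_description(['Cybersecurity'],
#                              ['Cybersecurity research covers network defence.'])
# returns {'Cybersecurity': 'cybersecurity research covers network defence.'},
# i.e. values are the lowercased (and possibly truncated) descriptions,
# which are mapped back to the original texts further below.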
# Match in passes, loosening the parameters each time: first compare
# titles against the first 75 characters of each description, then against
# full descriptions, and finally accept the best match regardless of score
param_set = [dict(context=75, threshold=50),
             dict(context=None, threshold=50),
             dict(context=None, threshold=0)]
matches = {}
for params in param_set:
    # Find focus areas not matched in a previous pass
    _focus_areas = []
    found = list(matches.keys())
    for f in focus_areas:
        if f in found:
            continue
        _focus_areas.append(f)
    # Find descriptions not matched in a previous pass
    _descriptions = []
    found = list(matches.values())
    for d in focus_descriptions:
        if any(x in d.lower() for x in found):
            continue
        _descriptions.append(d)
    if len(_focus_areas) == 0:
        break
    # Perform the matching on whatever remains
    _matches = match_title_to_description(_focus_areas, _descriptions, **params)
    matches = dict(**matches, **_matches)
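# Any titles still unmatched after all passes (possible if the PDF yielded
# fewer descriptions than titles) can be inspected with:
# print(set(focus_areas) - set(matches))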
# Convert the (lowercased, truncated) matched descriptions back to the
# original text descriptions
output = {}
for k, description in matches.items():
    for d in focus_descriptions:
        if description in d.lower():
            output[k] = d
# Write the dict as json
with open("focus_areas.json", "w") as f:
    f.write(json.dumps(output))
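# The output can be read back later with the standard json module:
# import json
# with open("focus_areas.json") as f:
#     matches = json.load(f)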