# jaklinger/uae_pdf_matching.py

## uae_pdf_matching.py
# Focus-area titles that are fuzzy-matched against the description
# paragraphs extracted from the PDF.
focus_areas = [
    'Education Innovation and Technology',
    'Health Information Technology and Bioinformatics',
    'Public Health, Non-Communicable Diseases and Wellness',
    'Biotechnology and Genomics',
    'Water Management and Economics',
    'Solar and Alternative Energy Technology Systems',
    'Space Sciences',
    'Cubesats and Nanosatellites',
    'Cybersecurity',
    'Semiconductor Process Development',
    'Robotics and Artificial Intelligence',
    'Smart City Applications and Solutions',
    'Architecture and Urban Design',
    'Arabic Digital Technology',
    'Financial Services Technology',
    'Petroleum Geosciences',
    'Internet of Things and Big Data',
    'Additive Manufacturing (3D Printing)',
    'Advanced Building and Construction Materials',
    'Food Security',
    'Transportation Logistics, Analytics and Security',
    'Aerospace Advanced Materials, Manufacturing, Maintenance and Testing',
    'Commercial Unmanned Aerial Vehicles',
    'Autonomous Vehicles',
]

import json

from fuzzywuzzy import process as fuzzy_process
from fuzzywuzzy import fuzz
import textract

# Load PDF file (hard-coded local path). The extracted text is handled as
# bytes below: it is searched and split with bytes patterns and only
# decoded once a paragraph has been selected.
file_name = '/Users/jklinger/Downloads/Science Technology and Innovation Policy (1) (1) (1).pdf'
pdftext = textract.process(file_name)

# Keywords to indicate start/end of region
start_word = b'Education Innovation'
end_word = b'Talent'
# Find position of start/end. find() returns -1 on a miss, which would
# silently be used as a slice bound and select the wrong region, so a
# missing marker is reported explicitly instead.
first = pdftext.find(start_word)
if first == -1:
    raise ValueError("Start marker {!r} not found in {!r}".format(start_word, file_name))
# Search for the end marker only after the start marker, so that an earlier
# occurrence of the end word cannot produce an empty region.
last = pdftext.find(end_word, first)
if last == -1:
    raise ValueError("End marker {!r} not found after start marker in {!r}".format(end_word, file_name))
# Extract long texts from this region: blank-line separated chunks of at
# least 300 bytes are kept as descriptions, shorter chunks are skipped.
focus_descriptions = [txt.decode()
                      for txt in pdftext[first:last].split(b'\n\n')
                      if len(txt) >= 300]


def match_title_to_description(focus_areas,focus_descriptions,
                               context=None,threshold=50):
    """Greedily pair focus-area titles with their best-matching descriptions.

    Each round scores every still-unmatched title against the pool of
    still-unmatched descriptions (``fuzz.token_set_ratio`` through
    ``fuzzy_process.extractOne``), commits the single highest-scoring
    title/description pair and removes that description from the pool so
    it cannot be assigned twice.

    Args:
        focus_areas: Titles to match; each is lower-cased before scoring.
        focus_descriptions: Candidate description strings.
        context: Number of leading characters of each lower-cased
            description to score against; ``None`` uses the whole text.
        threshold: A pair is accepted only if its score is strictly
            greater than this value. The first round whose best score
            fails this test ends the matching.

    Returns:
        dict mapping each matched title to the lower-cased (and, when
        ``context`` is set, truncated) description it was paired with.
        Titles that could not be matched are absent.
    """
    matches = {} # The output

    # Generate descriptions of length `context`
    short_descriptions = [t.lower()[0:context] for t in focus_descriptions]
    # Iterate until every title is matched or the description pool is
    # exhausted. Without the pool check, extractOne would be called with
    # no choices and the (description, score) unpacking below would fail.
    while len(matches) < len(focus_areas) and short_descriptions:
        # Best title/description pair found in this round
        best_score = 0
        best_f = None
        best_d = None
        for f in focus_areas:
            # Don't repeat matched focus areas
            if f in matches:
                continue
            d,score = fuzzy_process.extractOne(f.lower(),short_descriptions,
                                               scorer=fuzz.token_set_ratio)
            # Strict comparison: on ties the earliest title wins
            if score > best_score:
                best_d = d
                best_f = f
                best_score = score
        # Stop as soon as the best remaining pair is not good enough (or
        # no candidate pair was found at all)
        if best_f is None or best_score <= threshold:
            break
        matches[best_f] = best_d
        short_descriptions.remove(best_d)
    return matches


# Match in successive passes, each looser than the last: first against only
# the opening 75 characters of every description, then against the whole
# description, and finally against the whole description with threshold 0.
param_set = [
    {'context': 75, 'threshold': 50},
    {'context': None, 'threshold': 50},
    {'context': None, 'threshold': 0},
]
matches = {}
for params in param_set:
    # Titles that no earlier pass has paired up
    _focus_areas = [title for title in focus_areas if title not in matches]
    # Descriptions not yet claimed: a description counts as claimed when
    # any already-matched (lower-cased, possibly truncated) text occurs in it
    claimed = list(matches.values())
    _descriptions = [text for text in focus_descriptions
                     if not any(snippet in text.lower() for snippet in claimed)]
    # Nothing left to match, so the remaining passes are unnecessary
    if not _focus_areas:
        break
    # Run this pass and fold its results into the running total
    _matches = match_title_to_description(_focus_areas, _descriptions, **params)
    matches = dict(**matches, **_matches)

# The matcher returns lower-cased (and possibly truncated) descriptions;
# map each one back to the original text that contains it. When several
# originals contain the same snippet, the last one in the list is kept.
lowered = [(original, original.lower()) for original in focus_descriptions]
output = {}
for title, snippet in matches.items():
    hits = [original for original, low in lowered if snippet in low]
    if hits:
        output[title] = hits[-1]

# Write the dict as JSON. json.dump is used rather than str(output): a dict
# repr is single-quoted Python syntax, which JSON parsers reject.
with open("focus_areas.json","w") as f:
    json.dump(output, f)
	# Focus-area titles that are fuzzy-matched against the description
	# paragraphs extracted from the PDF.
	focus_areas = [
	    'Education Innovation and Technology',
	    'Health Information Technology and Bioinformatics',
	    'Public Health, Non-Communicable Diseases and Wellness',
	    'Biotechnology and Genomics',
	    'Water Management and Economics',
	    'Solar and Alternative Energy Technology Systems',
	    'Space Sciences',
	    'Cubesats and Nanosatellites',
	    'Cybersecurity',
	    'Semiconductor Process Development',
	    'Robotics and Artificial Intelligence',
	    'Smart City Applications and Solutions',
	    'Architecture and Urban Design',
	    'Arabic Digital Technology',
	    'Financial Services Technology',
	    'Petroleum Geosciences',
	    'Internet of Things and Big Data',
	    'Additive Manufacturing (3D Printing)',
	    'Advanced Building and Construction Materials',
	    'Food Security',
	    'Transportation Logistics, Analytics and Security',
	    'Aerospace Advanced Materials, Manufacturing, Maintenance and Testing',
	    'Commercial Unmanned Aerial Vehicles',
	    'Autonomous Vehicles',
	]

	from fuzzywuzzy import process as fuzzy_process
	from fuzzywuzzy import fuzz
	import textract

	# Load PDF file (hard-coded local path). The extracted text is handled as
	# bytes below: it is searched and split with bytes patterns and only
	# decoded once a paragraph has been selected.
	file_name = '/Users/jklinger/Downloads/Science Technology and Innovation Policy (1) (1) (1).pdf'
	pdftext = textract.process(file_name)

	# Keywords to indicate start/end of region
	start_word = b'Education Innovation'
	end_word = b'Talent'
	# Find position of start/end. find() returns -1 on a miss, which would
	# silently be used as a slice bound and select the wrong region, so a
	# missing marker is reported explicitly instead.
	first = pdftext.find(start_word)
	if first == -1:
	    raise ValueError("Start marker {!r} not found in {!r}".format(start_word, file_name))
	# Search for the end marker only after the start marker, so that an
	# earlier occurrence of the end word cannot produce an empty region.
	last = pdftext.find(end_word, first)
	if last == -1:
	    raise ValueError("End marker {!r} not found after start marker in {!r}".format(end_word, file_name))
	# Extract long texts from this region: blank-line separated chunks of at
	# least 300 bytes are kept as descriptions, shorter chunks are skipped.
	focus_descriptions = []
	for txt in pdftext[first:last].split(b'\n\n'):
	    if len(txt) < 300:
	        continue
	    focus_descriptions.append(txt.decode())


	def match_title_to_description(focus_areas,focus_descriptions,
	                               context=None,threshold=50):
	    """Greedily pair focus-area titles with their best-matching descriptions.

	    Each round scores every still-unmatched title against the pool of
	    still-unmatched descriptions (``fuzz.token_set_ratio`` through
	    ``fuzzy_process.extractOne``), commits the single highest-scoring
	    title/description pair and removes that description from the pool
	    so it cannot be assigned twice.

	    Args:
	        focus_areas: Titles to match; each is lower-cased before scoring.
	        focus_descriptions: Candidate description strings.
	        context: Number of leading characters of each lower-cased
	            description to score against; ``None`` uses the whole text.
	        threshold: A pair is accepted only if its score is strictly
	            greater than this value. The first round whose best score
	            fails this test ends the matching.

	    Returns:
	        dict mapping each matched title to the lower-cased (and, when
	        ``context`` is set, truncated) description it was paired with.
	        Titles that could not be matched are absent.
	    """
	    matches = {} # The output

	    # Generate descriptions of length `context`
	    short_descriptions = [t.lower()[0:context] for t in focus_descriptions]
	    # Iterate until every title is matched or the description pool is
	    # exhausted. Without the pool check, extractOne would be called with
	    # no choices and the (description, score) unpacking below would fail.
	    while len(matches) < len(focus_areas) and short_descriptions:
	        # Best title/description pair found in this round
	        best_score = 0
	        best_f = None
	        best_d = None
	        for f in focus_areas:
	            # Don't repeat matched focus areas
	            if f in matches:
	                continue
	            d,score = fuzzy_process.extractOne(f.lower(),short_descriptions,
	                                               scorer=fuzz.token_set_ratio)
	            # Strict comparison: on ties the earliest title wins
	            if score > best_score:
	                best_d = d
	                best_f = f
	                best_score = score
	        # Stop as soon as the best remaining pair is not good enough (or
	        # no candidate pair was found at all)
	        if best_f is None or best_score <= threshold:
	            break
	        matches[best_f] = best_d
	        short_descriptions.remove(best_d)
	    return matches


	# Match in successive passes, each looser than the last: first against
	# only the opening 75 characters of every description, then against the
	# whole description, and finally against the whole description with
	# threshold 0.
	param_set = [dict(context=75,threshold=50),
	             dict(context=None,threshold=50),
	             dict(context=None,threshold=0)]
	matches = {}
	for params in param_set:
	    # Find unmatched focus areas
	    _focus_areas = [f for f in focus_areas if f not in matches]
	    # Find unmatched descriptions: a description counts as matched when
	    # any already-matched (lower-cased, possibly truncated) text occurs
	    # in it
	    found = list(matches.values())
	    _descriptions = [d for d in focus_descriptions
	                     if not any(x in d.lower() for x in found)]
	    if not _focus_areas:
	        break
	    # Perform the matching
	    _matches = match_title_to_description(_focus_areas,_descriptions,**params)
	    # Merge with dict unpacking: dict() accepts at most one positional
	    # argument, so dict(matches, _matches) raises TypeError
	    matches = {**matches, **_matches}

	# Convert partial descriptions to the original text descriptions. When
	# several originals contain the same partial text, the last one wins.
	output = {}
	for k,description in matches.items():
	    for d in focus_descriptions:
	        if description in d.lower():
	            output[k] = d

	# Write the dict as JSON. json.dump is used rather than str(output): a
	# dict repr is single-quoted Python syntax, which JSON parsers reject.
	with open("focus_areas.json","w") as f:
	    json.dump(output, f)