mbruzek/find_similar.py

## find_similar.py
#!/usr/bin/env python3

"""
A Python3 tool that uses the NASA-Acronym data to find similar expansions.

This code uses the Levenshtein distance to measure how different the
character sequences in the 'expansion' field of the data entries are.

This file relies on the thefuzz Python package: https://github.com/seatgeek/thefuzz

Usage: python3 find_similar.py MPCV

"""

import json
import sys
import thefuzz
from thefuzz import fuzz
from thefuzz import process


# Path of the JSON acronym list; being relative, it is resolved against the
# current working directory.
acronym_path = 'lists/acronyms.json'
# The cutoff ratio to use for the fuzzy scorer algorithm.
cutoff = 88
# The fuzzy scorer algorithm to use when finding similar expansions.
scorer = fuzz.partial_ratio

# Decode explicitly as UTF-8 (the encoding JSON text is specified to use) so
# the result does not depend on the platform's default locale encoding.
with open(acronym_path, encoding='utf-8') as reader:
    json_data = reader.read()

# The parsed acronym entries; the functions below read this module-level name.
data = json.loads(json_data)

print('{} total acronyms found'.format(len(data)))


def find_all():
    """Run the similar-expansion search over every abbreviation in the data."""
    # Collect the abbreviation of each entry, in data order, and hand the
    # whole list to the search.
    every_abbreviation = []
    for entry in data:
        every_abbreviation.append(entry['abbreviation'])
    find_similar_expansions(every_abbreviation)


def find_similar_expansions(acronyms):
    """Report abbreviations whose expansions look like fuzzy duplicates.

    The module-level ``data`` list is scanned for entries whose abbreviation
    matches one of ``acronyms``. Matching entries are grouped by case-folded
    abbreviation, and every group holding more than one entry has its
    expansions passed through ``process.dedupe`` with the module-level
    ``cutoff`` and ``scorer``. When de-duplication drops anything, the
    dropped expansions are printed, followed by one detail line per entry
    in the group for context.

    Args:
        acronyms: Iterable of abbreviation strings to inspect. They are
            compared caselessly, the same way the entries are grouped.

    Returns:
        None. All results are written to standard output.
    """
    # Fold the requested abbreviations into a set once: membership tests are
    # then O(1) instead of a list scan per entry, and the filter is caseless
    # just like the grouping key below.
    wanted = {acronym.casefold() for acronym in acronyms}

    # Map each case-folded abbreviation to the indexes of its entries in data.
    # A dict preserves insertion order, so reports come out in data order
    # rather than in the arbitrary order of a set.
    indexes_by_key = {}
    for index, element in enumerate(data):
        key = element['abbreviation'].casefold()
        if key in wanted:
            indexes_by_key.setdefault(key, []).append(index)

    for indexes in indexes_by_key.values():
        # Only an abbreviation with more than one expansion is overloaded.
        if len(indexes) < 2:
            continue
        overloaded_objects = [data[index] for index in indexes]
        # Label the report with the spelling used by the group's first entry.
        abbreviation = overloaded_objects[0]['abbreviation']
        expansions = [element['expansion'] for element in overloaded_objects]
        # Use fuzzy matching to collapse expansions that are near-duplicates.
        deduped = process.dedupe(expansions, threshold=cutoff, scorer=scorer)
        if len(expansions) == len(deduped):
            continue
        # Everything that did not survive de-duplication is a duplicate. Walk
        # the expansions in order and let each survivor claim one occurrence,
        # so exact repeats are reported too (a plain set difference would
        # hide them and print an empty list).
        survivors = set(deduped)
        difference = []
        for expansion in expansions:
            if expansion in survivors:
                survivors.discard(expansion)
            else:
                difference.append(expansion)
        # Print the difference, the expansions that are duplicates.
        print('{} has possible duplicate expansions {}'.format(abbreviation,
                                                               difference))
        # Print out all the expansions for context on this abbreviation.
        for item in overloaded_objects:
            # The !s conversion keeps the columns identical for strings and
            # numbers while avoiding a TypeError should a field be None
            # (NOTE(review): whether the data contains nulls is not visible
            # from this file).
            details = '{!s:<60} {!s:<9} {!s:<5} {!s:<3}'.format(
                item['expansion'],
                item['source'],
                item['acronym_id'],
                item['source_id'])
            print(details)


if __name__ == '__main__':
    # With no command-line arguments every abbreviation is checked;
    # otherwise only the abbreviations named on the command line are.
    if not sys.argv[1:]:
        find_all()
    else:
        find_similar_expansions(sys.argv[1:])
	#!/usr/bin/env python3

	"""
	A Python3 tool that uses the NASA-Acronym data to find similar expansions.

	This code uses the Levenshtein distance to measure how different the
	character sequences in the 'expansion' field of the data entries are.

	This file relies on the thefuzz Python package: https://github.com/seatgeek/thefuzz

	Usage: python3 find_similar.py MPCV

	"""

	import json
	import sys
	import thefuzz
	from thefuzz import fuzz
	from thefuzz import process


	# Path of the JSON acronym list; being relative, it is resolved against the
	# current working directory.
	acronym_path = 'lists/acronyms.json'
	# The cutoff ratio to use for the fuzzy scorer algorithm.
	cutoff = 88
	# The fuzzy scorer algorithm to use when finding similar expansions.
	scorer = fuzz.partial_ratio

	# Decode explicitly as UTF-8 (the encoding JSON text is specified to use) so
	# the result does not depend on the platform's default locale encoding.
	with open(acronym_path, encoding='utf-8') as reader:
	    json_data = reader.read()

	# The parsed acronym entries; the functions below read this name.
	data = json.loads(json_data)

	print('{} total acronyms found'.format(len(data)))


	def find_all():
	    """Find the similar expansions for all the abbreviations."""
	    # Gather the abbreviation of every entry, in data order, and search
	    # across all of them.
	    all_acronyms = [element['abbreviation'] for element in data]
	    find_similar_expansions(all_acronyms)


	def find_similar_expansions(acronyms):
	    """Report abbreviations whose expansions look like fuzzy duplicates.

	    The ``data`` list is scanned for entries whose abbreviation matches
	    one of ``acronyms``. Matching entries are grouped by case-folded
	    abbreviation, and every group holding more than one entry has its
	    expansions passed through ``process.dedupe`` with ``cutoff`` and
	    ``scorer``. When de-duplication drops anything, the dropped
	    expansions are printed, followed by one detail line per entry in
	    the group for context.

	    Args:
	        acronyms: Iterable of abbreviation strings to inspect. They are
	            compared caselessly, the same way the entries are grouped.

	    Returns:
	        None. All results are written to standard output.
	    """
	    # Fold the requested abbreviations into a set once: membership tests
	    # are then O(1) instead of a list scan per entry, and the filter is
	    # caseless just like the grouping key below.
	    wanted = {acronym.casefold() for acronym in acronyms}

	    # Map each case-folded abbreviation to the indexes of its entries in
	    # data. A dict preserves insertion order, so reports come out in data
	    # order rather than in the arbitrary order of a set.
	    indexes_by_key = {}
	    for index, element in enumerate(data):
	        key = element['abbreviation'].casefold()
	        if key in wanted:
	            indexes_by_key.setdefault(key, []).append(index)

	    for indexes in indexes_by_key.values():
	        # Only an abbreviation with more than one expansion is overloaded.
	        if len(indexes) < 2:
	            continue
	        overloaded_objects = [data[index] for index in indexes]
	        # Label the report with the spelling used by the first entry.
	        abbreviation = overloaded_objects[0]['abbreviation']
	        expansions = [element['expansion'] for element in overloaded_objects]
	        # Use fuzzy matching to collapse near-duplicate expansions.
	        deduped = process.dedupe(expansions, threshold=cutoff, scorer=scorer)
	        if len(expansions) == len(deduped):
	            continue
	        # Everything that did not survive de-duplication is a duplicate.
	        # Walk the expansions in order and let each survivor claim one
	        # occurrence, so exact repeats are reported too (a plain set
	        # difference would hide them and print an empty list).
	        survivors = set(deduped)
	        difference = []
	        for expansion in expansions:
	            if expansion in survivors:
	                survivors.discard(expansion)
	            else:
	                difference.append(expansion)
	        # Print the difference, the expansions that are duplicates.
	        print('{} has possible duplicate expansions {}'.format(abbreviation,
	                                                               difference))
	        # Print out all the expansions for context on this abbreviation.
	        for item in overloaded_objects:
	            # The !s conversion keeps the columns identical for strings
	            # and numbers while avoiding a TypeError should a field be
	            # None (NOTE(review): whether the data contains nulls is not
	            # visible from this file).
	            details = '{!s:<60} {!s:<9} {!s:<5} {!s:<3}'.format(
	                item['expansion'],
	                item['source'],
	                item['acronym_id'],
	                item['source_id'])
	            print(details)


	if __name__ == '__main__':
	    # Abbreviations given on the command line restrict the search;
	    # with none given, every abbreviation in the data is checked.
	    if len(sys.argv) > 1:
	        find_similar_expansions(sys.argv[1:])
	    else:
	        find_all()