Last active
November 19, 2021 19:49
-
-
Save mbruzek/dccc141a498a7eca19ca2c3183595069 to your computer and use it in GitHub Desktop.
A duplicate-expansion finding tool that is not yet ready for prime time.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
A Python3 tool that uses the NASA-Acronym data to find similar expansions. | |
This code uses the Levenshtein Distance to calculate the difference between | |
sequences of characters in the expansions field of the data. | |
This file relies on thefuzz python package: https://github.com/seatgeek/thefuzz | |
Usage: python3 find_similar.py MPCV | |
""" | |
import json | |
import sys | |
import thefuzz | |
from thefuzz import fuzz | |
from thefuzz import process | |
# Path of the acronym JSON file; a relative path is resolved against the
# current working directory, so run the script from the repository root.
acronym_path = 'lists/acronyms.json'
# The cutoff ratio to use for the fuzzy scorer algorithm.
cutoff = 88
# The fuzzy scorer algorithm to use when finding similar expansions.
scorer = fuzz.partial_ratio
# Load the acronym data once at module level; the functions below read it
# through the global name ``data``.  The encoding is given explicitly because
# JSON text is UTF-8 while the default for open() depends on the platform.
with open(acronym_path, encoding='utf-8') as reader:
    data = json.load(reader)
print('{} total acronyms found'.format(len(data)))
def find_all():
    """Run the similar-expansion search for every abbreviation in the data."""
    # Hand over the abbreviation of each record in the module-level data
    # list (repeats included) so that no record is filtered out.
    find_similar_expansions([record['abbreviation'] for record in data])
def _dropped_expansions(expansions, deduped):
    """Return the expansions that are missing from deduped, in order.

    This is a multiset difference: each entry of ``deduped`` cancels one
    equal entry of ``expansions``.  Unlike a plain set difference it still
    reports an expansion that appears several times verbatim, and the result
    order is stable from run to run.
    """
    # Count how many times each surviving expansion may be "used up".
    remaining = {}
    for kept in deduped:
        remaining[kept] = remaining.get(kept, 0) + 1
    dropped = []
    for expansion in expansions:
        if remaining.get(expansion, 0) > 0:
            remaining[expansion] -= 1
        else:
            dropped.append(expansion)
    return dropped


def find_similar_expansions(acronyms):
    """Find the similar expansions for the given abbreviations.

    Records in the module-level ``data`` list are grouped caselessly by
    their abbreviation.  For every requested abbreviation that has more than
    one record, the expansions are run through ``process.dedupe`` using the
    module-level ``cutoff`` and ``scorer``.  When dedupe returns fewer
    expansions than it was given, the dropped expansions are printed,
    followed by every record of that abbreviation for context.

    Args:
        acronyms: Iterable of abbreviation strings to check.  They are
            matched caselessly, the same way the records are grouped.

    Returns:
        None.  The findings are printed to standard output.
    """
    # Casefold the requested abbreviations once into a set: the lookup below
    # is then constant time and agrees with the caseless grouping key.
    wanted = {acronym.casefold() for acronym in acronyms}

    # Map each casefolded abbreviation to the indexes of its records in data.
    # A dict keeps insertion order, so the report follows the data order.
    indexes_by_key = {}
    for index, element in enumerate(data):
        # Use casefold on the key to utilize caseless matching.
        key = element['abbreviation'].casefold()
        if key in wanted:
            indexes_by_key.setdefault(key, []).append(index)

    for indexes in indexes_by_key.values():
        # Only an abbreviation with more than one expansion is overloaded.
        if len(indexes) < 2:
            continue
        overloaded_objects = [data[index] for index in indexes]
        # Report under the spelling of the first record found for this key.
        abbreviation = overloaded_objects[0]['abbreviation']
        expansions = [element['expansion'] for element in overloaded_objects]
        # Compare the expansions using fuzzy matching to remove duplicates.
        deduped = process.dedupe(expansions, threshold=cutoff, scorer=scorer)
        if len(expansions) == len(deduped):
            # Nothing was removed, so no expansion looks like a duplicate.
            continue
        # The expansions that dedupe removed are the possible duplicates.
        difference = _dropped_expansions(expansions, deduped)
        print('{} has possible duplicate expansions {}'.format(
            abbreviation,
            difference))
        # Print out all the expansions for context on this abbreviation.
        for item in overloaded_objects:
            details = '{:<60} {:<9} {:<5} {:<3}'.format(item['expansion'],
                                                        item['source'],
                                                        item['acronym_id'],
                                                        item['source_id'])
            print(details)
if __name__ == '__main__':
    # Everything after the script name is treated as an abbreviation to
    # check; with no arguments the whole data set is searched instead.
    requested = sys.argv[1:]
    if requested:
        find_similar_expansions(requested)
    else:
        find_all()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment