Skip to content

Instantly share code, notes, and snippets.

@mbruzek
Last active November 19, 2021 19:49
Show Gist options
  • Save mbruzek/dccc141a498a7eca19ca2c3183595069 to your computer and use it in GitHub Desktop.
Save mbruzek/dccc141a498a7eca19ca2c3183595069 to your computer and use it in GitHub Desktop.
A duplicate expansion finding tool not yet ready for primetime.
#!/usr/bin/env python3
"""
A Python3 tool that uses the NASA-Acronym data to find similar expansions.
This code uses the Levenshtein Distance to calculate the difference between
sequences of characters in the expansions field of the data.
This file relies on the thefuzz Python package: https://github.com/seatgeek/thefuzz
Usage: python3 find_similar.py MPCV
"""
import json
import sys
import thefuzz
from thefuzz import fuzz
from thefuzz import process
# Location of the acronym JSON file; being a relative path, it is resolved
# against the current working directory.
acronym_path = 'lists/acronyms.json'
# The cutoff ratio to use for the fuzzy scorer algorithm.
cutoff = 88
# The fuzzy scorer algorithm to use when finding similar expansions.
scorer = fuzz.partial_ratio
# Load the acronym records once, at import time.  The encoding is stated
# explicitly so the file is decoded as UTF-8 regardless of the platform's
# default locale encoding.
with open(acronym_path, encoding='utf-8') as reader:
    data = json.load(reader)
print('{} total acronyms found'.format(len(data)))
def find_all():
    """Report similar expansions across every abbreviation in the data."""
    # Gather the abbreviation of each loaded record, keeping file order.
    every_abbreviation = []
    for record in data:
        every_abbreviation.append(record['abbreviation'])
    find_similar_expansions(every_abbreviation)
def find_similar_expansions(acronyms):
    """Print the abbreviations whose expansions look like fuzzy duplicates.

    Records in the module-level ``data`` list are selected when their
    abbreviation matches one of ``acronyms`` and are grouped caselessly.
    For every group holding more than one record, the expansions are passed
    to ``process.dedupe`` with the module-level ``cutoff`` and ``scorer``.
    When that removes anything, the removed expansions are printed, followed
    by one line per record in the group for context.

    Args:
        acronyms: Iterable of abbreviation strings to inspect.  Matching is
            caseless, so ``mpcv`` selects the same records as ``MPCV``.

    Returns:
        None.  All results are written to standard output.
    """
    # Casefold the requested abbreviations once into a set: the selection
    # test then agrees with the caseless grouping below, and each lookup is
    # constant time instead of a scan over the whole list.
    wanted = {acronym.casefold() for acronym in acronyms}
    # Map each casefolded abbreviation to the indexes of its records.
    indexes_by_key = {}
    for index, element in enumerate(data):
        key = element['abbreviation'].casefold()
        if key in wanted:
            indexes_by_key.setdefault(key, []).append(index)
    # Walking the dict (rather than a separate set of keys) reports groups
    # in the order they first appear in the data, so output is repeatable.
    for indexes in indexes_by_key.values():
        # Only an abbreviation with more than one expansion is overloaded.
        if len(indexes) < 2:
            continue
        overloaded_objects = [data[index] for index in indexes]
        abbreviation = overloaded_objects[0]['abbreviation']
        expansions = [element['expansion'] for element in overloaded_objects]
        # Use fuzzy matching over the expansions to remove duplicates.
        deduped = process.dedupe(expansions, threshold=cutoff, scorer=scorer)
        if len(expansions) != len(deduped):
            # The expansions that dedupe dropped are the likely duplicates.
            difference = _dropped_expansions(expansions, deduped)
            print('{} has possible duplicate expansions {}'.format(
                abbreviation,
                difference))
            # Print out all the expansions for context on this abbreviation.
            for item in overloaded_objects:
                details = '{:<60} {:<9} {:<5} {:<3}'.format(
                    item['expansion'],
                    item['source'],
                    item['acronym_id'],
                    item['source_id'])
                print(details)


def _dropped_expansions(expansions, deduped):
    """Return the expansions that dedupe removed, in their original order."""
    # Treat the survivors as a multiset so that an exact repeat of a kept
    # expansion is still reported, rather than vanishing as it would in a
    # plain set difference.
    survivors = list(deduped)
    dropped = []
    for expansion in expansions:
        if expansion in survivors:
            survivors.remove(expansion)
        elif expansion not in dropped:
            dropped.append(expansion)
    return dropped
if __name__ == '__main__':
    # With no abbreviations on the command line, check the whole data set;
    # otherwise check only the abbreviations that were named.
    if not sys.argv[1:]:
        find_all()
    else:
        find_similar_expansions(sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment