Skip to content

Instantly share code, notes, and snippets.

@pszemraj
Created March 14, 2024 02:49
Show Gist options
  • Save pszemraj/29753b5f2ea3b46afb3abb869e5f582f to your computer and use it in GitHub Desktop.
Save pszemraj/29753b5f2ea3b46afb3abb869e5f582f to your computer and use it in GitHub Desktop.
fuzzy string alignment of two lists
from rapidfuzz import process, fuzz
def fuzzy_align(masterlist, list2, cutoff=70):
    """Greedily align each item of ``list2`` to a fuzzy match in ``masterlist``.

    Items of ``list2`` are processed in order. For each one, every entry of
    ``masterlist`` is scored with ``fuzz.WRatio`` and the highest-scoring
    entry that has not already been claimed by an earlier item is selected.
    A ``masterlist`` entry is therefore matched at most once.

    Args:
        masterlist: Reference strings to match against.
        list2: Strings to align to ``masterlist``.
        cutoff: A candidate is accepted only if its score is strictly
            greater than this value.

    Returns:
        A dict mapping each item of ``list2`` to its matched ``masterlist``
        entry, or to ``None`` when no unused entry scores above ``cutoff``.
        Because the result is keyed by item, a value repeated in ``list2``
        keeps only the outcome of its last occurrence.
    """
    # Dictionary to hold matches
    matches = {}
    # Track used indices to avoid duplicate matches in the masterlist
    used_indices = set()

    for item in list2:
        # limit=None scores *every* masterlist entry. With the default limit
        # only the top few candidates are returned, so once those are all in
        # used_indices a perfectly good unused entry would never be seen.
        candidates = process.extract(
            item, masterlist, scorer=fuzz.WRatio, limit=None
        )
        best_match, score, best_match_index = max(
            (
                candidate
                for candidate in candidates
                if candidate[2] not in used_indices
            ),
            default=(None, 0, None),
            key=lambda candidate: candidate[1],
        )

        # Accept only a real candidate (index is None for the "nothing left"
        # sentinel) whose score clears the cutoff.
        if best_match_index is not None and score > cutoff:
            matches[item] = best_match
            used_indices.add(best_match_index)
        else:
            matches[item] = None  # or any indicator that it wasn't matched

    return matches
# Demo: align a list of misspelled fruit names against the reference list
# and print one "query: match" line per item.
masterlist = [
    "apple",
    "banana",
    "cherry",
    "date",
]
list2 = [
    "appel",
    "bannana",
    "dates",
]
matches = fuzzy_align(masterlist, list2)
for k, v in matches.items():
    print(k, v, sep=": ")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment