Created
March 14, 2024 02:49
-
-
Save pszemraj/29753b5f2ea3b46afb3abb869e5f582f to your computer and use it in GitHub Desktop.
Fuzzy string alignment of two lists: each item of the second list is matched to its closest not-yet-used entry in a master list.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from rapidfuzz import process, fuzz | |
def fuzzy_align(masterlist, list2, cutoff=70):
    """Greedily align each item of ``list2`` with an entry of ``masterlist``.

    Items of ``list2`` are processed in order. Each one is scored against
    ``masterlist`` with ``fuzz.WRatio`` and paired with the highest-scoring
    entry whose index has not already been claimed by an earlier item.

    Args:
        masterlist: Choices to match against, passed as the ``choices``
            argument of ``process.extract``.
        list2: Iterable of query strings to align. Items are used as
            dictionary keys, so a repeated item overwrites its earlier
            result (the entry claimed by the earlier occurrence stays
            claimed).
        cutoff: A candidate is accepted only if its score is strictly
            greater than this value.

    Returns:
        A dict mapping each item of ``list2`` to the matched ``masterlist``
        entry, or to ``None`` when no unused entry scores above ``cutoff``.
    """
    # Dictionary to hold matches
    matches = {}
    # Track used indices to avoid duplicate matches in the masterlist
    used_indices = set()

    for item in list2:
        # limit=None scores the whole masterlist. With the library default
        # (limit=5) only the five best-scoring choices are returned, so once
        # those are all claimed an item would be reported as unmatched even
        # though a lower-ranked, still-unused entry clears the cutoff.
        candidates = process.extract(
            item, masterlist, scorer=fuzz.WRatio, limit=None
        )

        # Best-scoring candidate that has not been claimed yet; the default
        # triple stands for "nothing left to choose from".
        best_match, score, best_match_index = max(
            (
                candidate
                for candidate in candidates
                if candidate[2] not in used_indices
            ),
            default=(None, 0, None),
            key=lambda candidate: candidate[1],
        )

        # Compare against None rather than relying on truthiness so that a
        # falsy choice (e.g. an empty string) is not mistaken for "no match".
        if best_match is not None and score > cutoff:
            matches[item] = best_match
            used_indices.add(best_match_index)
        else:
            matches[item] = None  # or any indicator that it wasn't matched

    return matches
# Example usage: align a list of misspelled names against the reference list
# and print one "query: match" line per query.
masterlist = ["apple", "banana", "cherry", "date"]
list2 = ["appel", "bannana", "dates"]

matches = fuzzy_align(masterlist, list2)
for query, aligned in matches.items():
    print(f"{query}: {aligned}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment