Created
January 13, 2024 19:57
-
-
Save imvladikon/565033e562fe01a8a065b8c68dfbf0b3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
from typing import Sequence, Dict, Callable | |
import numpy as np | |
from rapidfuzz.distance import Levenshtein | |
from scipy.optimize import linear_sum_assignment | |
def aligned_edit_distance( | |
set_1: Sequence[str], | |
set_2: Sequence[str], | |
threshold: float = 0.5, | |
sim_fn: Callable = Levenshtein.normalized_similarity, | |
sim_fn_kwargs: Dict = {}, | |
token_ops_cost: Dict = {} | |
): | |
if len(set_1) < len(set_2): | |
set_2, set_1 = set_1, set_2 | |
big_length, small_length = len(set_1), len(set_2) | |
sim_matrix = np.full((big_length, small_length), -np.inf) | |
for i, t1 in enumerate(set_1): | |
for j, t2 in enumerate(set_2): | |
score = sim_fn(t1, t2, **sim_fn_kwargs) | |
if score < threshold: | |
score = 0.0 | |
sim_matrix[j, i] = score | |
left_matches = linear_sum_assignment(sim_matrix, maximize=True) | |
exact_matches, fuzzy_matches, deletions, transpositions = 0, 0, 0, 0 | |
for i, j in zip(*left_matches): | |
t1, t2 = set_1[j], set_2[i] | |
if t1 == t2: | |
exact_matches += 1 | |
if i != j: | |
transpositions += token_ops_cost.get("transpose", 0.2) | |
elif sim_matrix[i, j] > threshold: | |
fuzzy_matches += 1 | |
else: | |
deletions += token_ops_cost.get("deletion", 1.0) | |
fuzzy_dist = ( | |
big_length - (exact_matches + fuzzy_matches) + deletions + transpositions | |
) | |
levenstein_similarity = 1 - fuzzy_dist / big_length | |
return levenstein_similarity | |
if __name__ == "__main__": | |
set_1 = "hello world".split() | |
set_2 = "world hello".split() | |
print(aligned_edit_distance(set_1, set_2)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment