Skip to content

Instantly share code, notes, and snippets.

@imvladikon
Created January 13, 2024 19:57
Show Gist options
  • Save imvladikon/565033e562fe01a8a065b8c68dfbf0b3 to your computer and use it in GitHub Desktop.
Save imvladikon/565033e562fe01a8a065b8c68dfbf0b3 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from typing import Sequence, Dict, Callable
import numpy as np
from rapidfuzz.distance import Levenshtein
from scipy.optimize import linear_sum_assignment
def aligned_edit_distance(
set_1: Sequence[str],
set_2: Sequence[str],
threshold: float = 0.5,
sim_fn: Callable = Levenshtein.normalized_similarity,
sim_fn_kwargs: Dict = {},
token_ops_cost: Dict = {}
):
if len(set_1) < len(set_2):
set_2, set_1 = set_1, set_2
big_length, small_length = len(set_1), len(set_2)
sim_matrix = np.full((big_length, small_length), -np.inf)
for i, t1 in enumerate(set_1):
for j, t2 in enumerate(set_2):
score = sim_fn(t1, t2, **sim_fn_kwargs)
if score < threshold:
score = 0.0
sim_matrix[j, i] = score
left_matches = linear_sum_assignment(sim_matrix, maximize=True)
exact_matches, fuzzy_matches, deletions, transpositions = 0, 0, 0, 0
for i, j in zip(*left_matches):
t1, t2 = set_1[j], set_2[i]
if t1 == t2:
exact_matches += 1
if i != j:
transpositions += token_ops_cost.get("transpose", 0.2)
elif sim_matrix[i, j] > threshold:
fuzzy_matches += 1
else:
deletions += token_ops_cost.get("deletion", 1.0)
fuzzy_dist = (
big_length - (exact_matches + fuzzy_matches) + deletions + transpositions
)
levenstein_similarity = 1 - fuzzy_dist / big_length
return levenstein_similarity
if __name__ == "__main__":
set_1 = "hello world".split()
set_2 = "world hello".split()
print(aligned_edit_distance(set_1, set_2))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment