imvladikon/aligned_edit_distance.py

## aligned_edit_distance.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from typing import Sequence, Dict, Callable

import numpy as np
from rapidfuzz.distance import Levenshtein
from scipy.optimize import linear_sum_assignment


def aligned_edit_distance(
        set_1: Sequence[str],
        set_2: Sequence[str],
        threshold: float = 0.5,
        sim_fn: Callable = Levenshtein.normalized_similarity,
        sim_fn_kwargs: Dict = {},
        token_ops_cost: Dict = {}
):
    if len(set_1) < len(set_2):
        set_2, set_1 = set_1, set_2

    big_length, small_length = len(set_1), len(set_2)

    sim_matrix = np.full((big_length, small_length), -np.inf)
    for i, t1 in enumerate(set_1):
        for j, t2 in enumerate(set_2):
            score = sim_fn(t1, t2, **sim_fn_kwargs)
            if score < threshold:
                score = 0.0
            sim_matrix[j, i] = score

    left_matches = linear_sum_assignment(sim_matrix, maximize=True)

    exact_matches, fuzzy_matches, deletions, transpositions = 0, 0, 0, 0
    for i, j in zip(*left_matches):
        t1, t2 = set_1[j], set_2[i]
        if t1 == t2:
            exact_matches += 1
            if i != j:
                transpositions += token_ops_cost.get("transpose", 0.2)
        elif sim_matrix[i, j] > threshold:
            fuzzy_matches += 1
        else:
            deletions += token_ops_cost.get("deletion", 1.0)

    fuzzy_dist = (
            big_length - (exact_matches + fuzzy_matches) + deletions + transpositions
    )
    levenstein_similarity = 1 - fuzzy_dist / big_length
    return levenstein_similarity


if __name__ == "__main__":
    set_1 = "hello world".split()
    set_2 = "world hello".split()
    print(aligned_edit_distance(set_1, set_2))
	#!/usr/bin/env python3
	# -- coding: utf-8 --
	from typing import Sequence, Dict, Callable

	import numpy as np
	from rapidfuzz.distance import Levenshtein
	from scipy.optimize import linear_sum_assignment


	def aligned_edit_distance(
	set_1: Sequence[str],
	set_2: Sequence[str],
	threshold: float = 0.5,
	sim_fn: Callable = Levenshtein.normalized_similarity,
	sim_fn_kwargs: Dict = {},
	token_ops_cost: Dict = {}
	):
	if len(set_1) < len(set_2):
	set_2, set_1 = set_1, set_2

	big_length, small_length = len(set_1), len(set_2)

	sim_matrix = np.full((big_length, small_length), -np.inf)
	for i, t1 in enumerate(set_1):
	for j, t2 in enumerate(set_2):
	score = sim_fn(t1, t2, **sim_fn_kwargs)
	if score < threshold:
	score = 0.0
	sim_matrix[j, i] = score

	left_matches = linear_sum_assignment(sim_matrix, maximize=True)

	exact_matches, fuzzy_matches, deletions, transpositions = 0, 0, 0, 0
	for i, j in zip(*left_matches):
	t1, t2 = set_1[j], set_2[i]
	if t1 == t2:
	exact_matches += 1
	if i != j:
	transpositions += token_ops_cost.get("transpose", 0.2)
	elif sim_matrix[i, j] > threshold:
	fuzzy_matches += 1
	else:
	deletions += token_ops_cost.get("deletion", 1.0)

	fuzzy_dist = (
	big_length - (exact_matches + fuzzy_matches) + deletions + transpositions
	)
	levenstein_similarity = 1 - fuzzy_dist / big_length
	return levenstein_similarity


	if __name__ == "__main__":
	set_1 = "hello world".split()
	set_2 = "world hello".split()
	print(aligned_edit_distance(set_1, set_2))