Skip to content

Instantly share code, notes, and snippets.

@alvations
Last active March 12, 2020 10:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alvations/989dacca76c820c48ef18c4637515b39 to your computer and use it in GitHub Desktop.
Save alvations/989dacca76c820c48ef18c4637515b39 to your computer and use it in GitHub Desktop.
from functools import lru_cache
from itertools import product
@lru_cache(maxsize=4095)
def ld(s, t):
"""
Levenshtein distance memoized implementation from Rosetta code:
https://rosettacode.org/wiki/Levenshtein_distance#Python
"""
if not s: return len(t)
if not t: return len(s)
if s[0] == t[0]: return ld(s[1:], t[1:])
l1 = ld(s, t[1:]) # Deletion.
l2 = ld(s[1:], t) # Insertion.
l3 = ld(s[1:], t[1:]) # Substitution.
return 1 + min(l1, l2, l3)
def find_shifts(hyp, ref):
"""Find possible shifts in hypothesis."""
hyp_len, ref_len = len(hyp), len(ref)
for i, j in product(range(hyp_len), range(ref_len)):
if i == j: # Skip words in the same position.
continue
# When word matches.
if hyp[i] == ref[j]:
# Find the longest matching phrase from this position
l = 0
for l, (h, r) in enumerate(zip(hyp[i:], ref[j:])):
if h != r:
break
l += 1
# Compute the shifted hypothesis.
shifted_hyp = hyp[:i] + hyp[i+l:]
shifted_hyp[j:j] = hyp[i:i+l]
yield shifted_hyp
def shift(hyp, ref):
original = ld(tuple(hyp), tuple(ref))
# Find the lowest possible shift and it distance.
scores = []
for shifted_hyp in find_shifts(hyp, ref):
shifted_dist = ld(tuple(shifted_hyp), tuple(ref))
scores.append((original - shifted_dist, shifted_hyp))
# Return original hypothesis if shift is not better.
return sorted(scores)[-1] if scores else (0, hyp)
def ter(hyp, ref):
# Initialize no. of edits, e.
e = 0
while True:
# Find shift, s, that most reduces min-edit-distance(h', r)
delta, s = shift(hyp, ref)
# until no shifts that reduce edit distance remain
if delta <= 0:
break
# if shift reduces edit distance, then
# h' <- apply s to h'
hyp = s
# e <- e + 1
e += 1
# e <- e + min-edit-distance(h', r)
e += ld(tuple(hyp), tuple(ref))
return e / len(ref)
"""
>>> hyp = u'THIS WEEK THE SAUDIS denied information published in the new york times'.split()
>>> ref = u'SAUDI ARABIA denied THIS WEEK information published in the AMERICAN new york times'.split()
>>> ter(hyp, ref)
0.3076923076923077
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment