Skip to content

Instantly share code, notes, and snippets.

@ettorerizza
Last active October 31, 2017 11:16
Show Gist options
  • Save ettorerizza/41b65704d58bf334d9e323c09a7f6d46 to your computer and use it in GitHub Desktop.
Save ettorerizza/41b65704d58bf334d9e323c09a7f6d46 to your computer and use it in GitHub Desktop.
A function for calculating the Levenshtein edit distance between columns with Jython in OpenRefine
def call_counter(func):
    """Wrap ``func`` so that the number of calls made to it is recorded.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper that forwards every positional and keyword argument to
        ``func`` and returns its result. The wrapper carries a ``calls``
        attribute that starts at 0 and is incremented on each invocation,
        before ``func`` runs (so calls that raise are counted as well).
    """
    # Imported locally so this block does not rely on a file-level import.
    from functools import wraps

    # wraps() copies __name__, __doc__ and the other standard metadata,
    # where the hand-written version only preserved __name__.
    @wraps(func)
    def helper(*args, **kwargs):
        helper.calls += 1
        return func(*args, **kwargs)

    # Set after wrapping so a ``calls`` attribute copied over from ``func``
    # (e.g. when decorating twice) cannot leak into the fresh counter.
    helper.calls = 0
    return helper
# Cache of finished results, keyed by the (s, t) argument pair.
memo = {}


@call_counter
def levenshtein(s, t):
    """Return the Levenshtein edit distance between ``s`` and ``t``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``s`` into ``t``.

    The table is filled iteratively, one row at a time, instead of by
    recursion on string prefixes: the recursive form needs a call depth
    that grows with ``len(s) + len(t)`` and therefore fails on long inputs
    once the interpreter's recursion limit is reached. As a consequence the
    ``calls`` counter added by ``call_counter`` now counts one call per
    invocation rather than one per visited prefix pair.

    Args:
        s: First string.
        t: Second string.

    Returns:
        The edit distance as a non-negative integer; ``len(t)`` when ``s``
        is empty and ``len(s)`` when ``t`` is empty.
    """
    key = (s, t)
    if key in memo:
        return memo[key]

    # The distance is symmetric, so keep the shorter string on the inner
    # axis: only two rows of that length are alive at any time.
    if len(s) < len(t):
        s, t = t, s

    # previous[j] is the distance between the prefix of ``s`` handled so far
    # and t[:j]; for the empty prefix of ``s`` that is j insertions.
    previous = list(range(len(t) + 1))
    for i, s_char in enumerate(s, 1):
        current = [i]  # turning s[:i] into "" takes i deletions
        for j, t_char in enumerate(t, 1):
            cost = 0 if s_char == t_char else 1
            current.append(min(
                previous[j] + 1,         # delete s_char
                current[j - 1] + 1,      # insert t_char
                previous[j - 1] + cost,  # substitute, or keep when equal
            ))
        previous = current

    distance = previous[-1]
    memo[key] = distance
    return distance
def call_counter(func):
    """Wrap ``func`` so that the number of calls made to it is recorded.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper that forwards every positional and keyword argument to
        ``func`` and returns its result. The wrapper carries a ``calls``
        attribute that starts at 0 and is incremented on each invocation,
        before ``func`` runs (so calls that raise are counted as well).
    """
    # Imported locally so this block does not rely on a file-level import.
    from functools import wraps

    # wraps() copies __name__, __doc__ and the other standard metadata,
    # where the hand-written version only preserved __name__.
    @wraps(func)
    def helper(*args, **kwargs):
        helper.calls += 1
        return func(*args, **kwargs)

    # Set after wrapping so a ``calls`` attribute copied over from ``func``
    # (e.g. when decorating twice) cannot leak into the fresh counter.
    helper.calls = 0
    return helper
# Cache of finished results, keyed by the (s, t) argument pair.
memo = {}


@call_counter
def levenshtein(s, t):
    """Return the Levenshtein edit distance between ``s`` and ``t``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``s`` into ``t``.

    The table is filled iteratively, one row at a time, instead of by
    recursion on string prefixes: the recursive form needs a call depth
    that grows with ``len(s) + len(t)`` and therefore fails on long inputs
    once the interpreter's recursion limit is reached. As a consequence the
    ``calls`` counter added by ``call_counter`` now counts one call per
    invocation rather than one per visited prefix pair.

    Args:
        s: First string.
        t: Second string.

    Returns:
        The edit distance as a non-negative integer; ``len(t)`` when ``s``
        is empty and ``len(s)`` when ``t`` is empty.
    """
    key = (s, t)
    if key in memo:
        return memo[key]

    # The distance is symmetric, so keep the shorter string on the inner
    # axis: only two rows of that length are alive at any time.
    if len(s) < len(t):
        s, t = t, s

    # previous[j] is the distance between the prefix of ``s`` handled so far
    # and t[:j]; for the empty prefix of ``s`` that is j insertions.
    previous = list(range(len(t) + 1))
    for i, s_char in enumerate(s, 1):
        current = [i]  # turning s[:i] into "" takes i deletions
        for j, t_char in enumerate(t, 1):
            cost = 0 if s_char == t_char else 1
            current.append(min(
                previous[j] + 1,         # delete s_char
                current[j - 1] + 1,      # insert t_char
                previous[j - 1] + cost,  # substitute, or keep when equal
            ))
        previous = current

    distance = previous[-1]
    memo[key] = distance
    return distance
# Result of the script: the edit distance between `value` and the 'value'
# entry of cells['other column'].
# NOTE(review): `value` and `cells` are not defined anywhere in this snippet;
# presumably they are supplied by OpenRefine's Jython expression environment,
# which would also explain the bare top-level `return`. Confirm before reusing
# this outside OpenRefine, and replace 'other column' with the real column name.
return levenshtein(value, cells['other column']['value'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment