Skip to content

Instantly share code, notes, and snippets.

@kylebgorman
Created July 14, 2011 04:41
Show Gist options
  • Save kylebgorman/1081951 to your computer and use it in GitHub Desktop.
Save kylebgorman/1081951 to your computer and use it in GitHub Desktop.
Python implementation of the Wagner & Fischer dynamic programming approach to computing Levenshtein distance, with support for thresholding, arbitrary weights, and traceback to get individual insertion/deletion/substitution counts.
#!/usr/bin/env python
# wagnerfischer.py: Dynamic programming Levensthein distance function
# Kyle Gorman <gormanky@ohsu.edu>
#
# Based on:
#
# Robert A. Wagner and Michael J. Fischer (1974). The string-to-string
# correction problem. Journal of the ACM 21(1):168-173.
#
# The thresholding function was inspired by BSD-licensed code from
# Babushka, a Ruby tool by Ben Hoskings and others.
#
# Unlike many other Levenshtein distance functions out there, this works
# on arbitrary comparable Python objects, not just strings.
try: # use numpy arrays if possible...
from numpy import zeros
def _zeros(*shape):
""" like this syntax better...a la MATLAB """
return zeros(shape)
except ImportError: # otherwise do this cute solution
def _zeros(*shape):
if len(shape) == 0:
return 0
car = shape[0]
cdr = shape[1:]
return [my_zeros(*cdr) for i in range(car)]
def _dist(A, B, insertion, deletion, substitution):
D = _zeros(len(A) + 1, len(B) + 1)
for i in xrange(len(A)):
D[i + 1][0] = D[i][0] + deletion
for j in xrange(len(B)):
D[0][j + 1] = D[0][j] + insertion
for i in xrange(len(A)): # fill out middle of matrix
for j in xrange(len(B)):
if A[i] == B[j]:
D[i + 1][j + 1] = D[i][j] # aka, it's free.
else:
D[i + 1][j + 1] = min(D[i + 1][j] + insertion,
D[i][j + 1] + deletion,
D[i][j] + substitution)
return D
def _dist_thresh(A, B, thresh, insertion, deletion, substitution):
D = _zeros(len(A) + 1, len(B) + 1)
for i in xrange(len(A)):
D[i + 1][0] = D[i][0] + deletion
for j in xrange(len(B)):
D[0][j + 1] = D[0][j] + insertion
for i in xrange(len(A)): # fill out middle of matrix
for j in xrange(len(B)):
if A[i] == B[j]:
D[i + 1][j + 1] = D[i][j] # aka, it's free.
else:
D[i + 1][j + 1] = min(D[i + 1][j] + insertion,
D[i][j + 1] + deletion,
D[i][j] + substitution)
if min(D[i + 1]) >= thresh:
return
return D
def _levenshtein(A, B, insertion, deletion, substitution):
return _dist(A, B, insertion, deletion, substitution)[len(A)][len(B)]
def _levenshtein_ids(A, B, insertion, deletion, substitution):
"""
Perform a backtrace to determine the optimal path. This was hard.
"""
D = _dist(A, B, insertion, deletion, substitution)
i = len(A)
j = len(B)
ins_c = 0
del_c = 0
sub_c = 0
while True:
if i > 0:
if j > 0:
if D[i - 1][j] <= D[i][j - 1]: # if ins < del
if D[i - 1][j] < D[i - 1][j - 1]: # if ins < m/s
ins_c += 1
else:
if D[i][j] != D[i - 1][j - 1]: # if not m
sub_c += 1
j -= 1
i -= 1
else:
if D[i][j - 1] <= D[i - 1][j - 1]: # if del < m/s
del_c += 1
else:
if D[i][j] != D[i - 1][j - 1]: # if not m
sub_c += 1
i -= 1
j -= 1
else: # only insert
ins_c += 1
i -= 1
elif j > 0: # only delete
del_c += 1
j -= 1
else:
return (ins_c, del_c, sub_c)
def _levenshtein_thresh(A, B, thresh, insertion, deletion, substitution):
D = _dist_thresh(A, B, thresh, insertion, deletion, substitution)
if D != None:
return D[len(A)][len(B)]
def levenshtein(A, B, thresh=None, insertion=1, deletion=1, substitution=1):
"""
Compute levenshtein distance between iterables A and B
These are standard cases:
>>> print int(levenshtein('god', 'gawd'))
2
>>> print int(levenshtein('sitting', 'kitten'))
3
>>> print int(levenshtein('bana', 'banananana'))
6
>>> print int(levenshtein('bana', 'bana'))
0
>>> print int(levenshtein('banana', 'angioplastical'))
11
>>> print int(levenshtein('angioplastical', 'banana'))
11
>>> print int(levenshtein('Saturday', 'Sunday'))
3
These should return nothing:
>>> levenshtein('banana', 'angioplasty', 5)
>>> levenshtein('banana', 'angioplastical', 5)
>>> levenshtein('angioplastical', 'banana', 5)
"""
# basic checks
if len(A) == len(B) and A == B:
return 0
if len(B) > len(A):
(A, B) = (B, A)
if len(A) == 0:
return len(B)
if thresh:
if len(A) - len(B) > thresh:
return
return _levenshtein_thresh(A, B, thresh, insertion, deletion,
substitution)
else:
return _levenshtein(A, B, insertion, deletion, substitution)
def levenshtein_ids(A, B, insertion=1, deletion=1, substitution=1):
"""
Compute number of insertions deletions, and substitutions for an
optimal alignment.
There may be more than one, in which case we disfavor substitution.
>>> print levenshtein_ids('sitting', 'kitten')
(1, 0, 2)
>>> print levenshtein_ids('banat', 'banap')
(0, 0, 1)
>>> print levenshtein_ids('Saturday', 'Sunday')
(2, 0, 1)
"""
# basic checks
if len(A) == len(B) and A == B:
return (0, 0, 0)
if len(B) > len(A):
(A, B) = (B, A)
if len(A) == 0:
return len(B)
else:
return _levenshtein_ids(A, B, insertion, deletion, substitution)
if __name__ == '__main__':
import doctest
doctest.testmod()
@christianbuck
Copy link

Nice! What does my_zeros look like? Just [0.0]*n ?

@Lazik
Copy link

Lazik commented Mar 8, 2014

No my_zeros is meant to be _zeros.
Found the code here: http://www.ling.upenn.edu/~kgorman/py/align.py

See my fork or kylebgorman might update the code.

@kylebgorman
Copy link
Author

see https://gist.github.com/kylebgorman/8034009 for a better version of the same

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment