Skip to content

Instantly share code, notes, and snippets.

@bahostetterlewis
Last active December 12, 2015 08:38
Show Gist options
  • Save bahostetterlewis/4744997 to your computer and use it in GitHub Desktop.
Save bahostetterlewis/4744997 to your computer and use it in GitHub Desktop.
Levenshtein-Damerau, found originally in my sublime text 2 gist account (giststhebearbear) This is a memory/time optimized solution of the Levenshtein-Damerou distance algorithm. There are some non idiomatic things in the code, but it is pure standard python and can be used freely by anyone.
def leven(s1, s2, maxDistance):
# get smallest string so our rows are minimized
s1, s2 = (s1, s2) if len(s1) <= len(s2) else (s2, s1)
# set lengths
l1, l2 = len(s1), len(s2)
# We are simulatng an NM matrix where n is the longer string
# and m is the shorter string. By doing this we can minimize
# memory usage to O(M).
# Since we are simulating the matrix we only maintain two rows
# at a time the current row and the previous rows.
# A move from the current cell looking at the cell before it indicates
# consideration of an insert operation.
# A move from the current cell looking at the cell above it indicates
# consideration of a deletion
# Both operations are cost 1
# A move from the current cell to the cell up and to the left indicates
# an edit operation of 0 cost for a matching character and a 1 cost for
# a non matching characters
# no row has been previously computed yet, set empty row
# Since this is also a Damerou-Levenshtein calculation transposition
# costs will be taken into account. These look back 2 characters to
# determine optimal cost based on a possible transposition
# example: aei -> aie with levensthein has a cost of 2
# match a, change e->i change i->e => aie
# Damarau-Levenshtein has a cost of 1
# match a, transpose ei to ie => aie
transpositionRow = None
prevRow = None
# build first leven matrix row
# The first row represents transformation from an empty string
# to the shorter string making it static [0-n]
# since this row is static we can set it as
# curRow and start computation at the second row or index 1
curRow = range(0, l1 + 1)]
# use second length to loop through all the rows being built
# we start at row one
for rowNum in range(1, l2 + 1):
# set transposition, previous, and current
# because the rowNum always increments by one
# we can use rowNum to set the value representing
# the first column which is indicitive of transforming TO
# the empty string from our longer string
# transposition row maintains an extra row so that it is possible
# for us to apply Damarou's formula
transpositionRow, prevRow, curRow = prevRow, curRow, [rowNum] + [0] * l1
# consider if we have passed the max distance if all paths through
# the transposition row are larger than the max we can stop calculating
# distance and return the last element in that row and return the max
if transpositionRow:
if not any(cellValue < maxDistance for cellValue in transpositionRow):
return maxDistance
for colNum in range(1, l1 + 1):
insertionCost = curRow[colNum - 1] + 1
deletionCost = prevRow[colNum] + 1
changeCost = prevRow[colNum - 1] + (0 if s1[colNum - 1] == s2[rowNum - 1] else 1)
# set the cell value - min distance to reach this
# position
curRow[colNum] = min(insertionCost, deletionCost, changeCost)
# test for a possible transposition optimization
# check to see if we have at least 2 characters
if 1 < rowNum <= colNum:
# test for possible transposition
if s1[colNum - 1] == s2[colNum - 2] and s2[colNum - 1] == s1[colNum - 2]:
curRow[colNum] = min(curRow[colNum], transpositionRow[colNum - 2] + 1)
# the last cell of the matrix is ALWAYS the shortest distance between the two strings
return curRow[-1]
def leven(s1, s2, maxDistance):
# get smallest string so our rows are minimized
s1, s2 = (s1, s2) if len(s1) <= len(s2) else (s2, s1)
# set lengths
l1, l2 = len(s1), len(s2)
# We are simulatng an NM matrix where n is the longer string
# and m is the shorter string. By doing this we can minimize
# memory usage to O(M).
# Since we are simulating the matrix we only maintain two rows
# at a time the current row and the previous rows.
# A move from the current cell looking at the cell before it indicates
# consideration of an insert operation.
# A move from the current cell looking at the cell above it indicates
# consideration of a deletion
# Both operations are cost 1
# A move from the current cell to the cell up and to the left indicates
# an edit operation of 0 cost for a matching character and a 1 cost for
# a non matching characters
# no row has been previously computed yet, set empty row
# Since this is also a Damerou-Levenshtein calculation transposition
# costs will be taken into account. These look back 2 characters to
# determine optimal cost based on a possible transposition
# example: aei -> aie with levensthein has a cost of 2
# match a, change e->i change i->e => aie
# Damarau-Levenshtein has a cost of 1
# match a, transpose ei to ie => aie
transpositionRow = None
prevRow = None
# build first leven matrix row
# The first row represents transformation from an empty string
# to the shorter string making it static [0-n]
# since this row is static we can set it as
# curRow and start computation at the second row or index 1
curRow = range(0, l1 + 1)]
# use second length to loop through all the rows being built
# we start at row one
for rowNum in range(1, l2 + 1):
# set transposition, previous, and current
# because the rowNum always increments by one
# we can use rowNum to set the value representing
# the first column which is indicitive of transforming TO
# the empty string from our longer string
# transposition row maintains an extra row so that it is possible
# for us to apply Damarou's formula
transpositionRow, prevRow, curRow = prevRow, curRow, [rowNum] + [0] * l1
# consider if we have passed the max distance if all paths through
# the transposition row are larger than the max we can stop calculating
# distance and return the last element in that row and return the max
if transpositionRow:
if not any(cellValue < maxDistance for cellValue in transpositionRow):
return maxDistance
for colNum in range(1, l1 + 1):
insertionCost = curRow[colNum - 1] + 1
deletionCost = prevRow[colNum] + 1
changeCost = prevRow[colNum - 1] + (0 if s1[colNum - 1] == s2[rowNum - 1] else 1)
# set the cell value - min distance to reach this
# position
curRow[colNum] = min(insertionCost, deletionCost, changeCost)
# test for a possible transposition optimization
# check to see if we have at least 2 characters
if 1 < rowNum <= colNum:
# test for possible transposition
if s1[colNum - 1] == s2[colNum - 2] and s2[colNum - 1] == s1[colNum - 2]:
curRow[colNum] = min(curRow[colNum], transpositionRow[colNum - 2] + 1)
# the last cell of the matrix is ALWAYS the shortest distance between the two strings
return curRow[-1]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment