Last active
December 12, 2015 08:38
-
-
Save bahostetterlewis/4744997 to your computer and use it in GitHub Desktop.
Levenshtein-Damerau, found originally in my sublime text 2 gist account (giststhebearbear) This is a memory/time optimized solution of the Levenshtein-Damerou distance algorithm. There are some non idiomatic things in the code, but it is pure standard python and can be used freely by anyone.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def leven(s1, s2, maxDistance): | |
# get smallest string so our rows are minimized | |
s1, s2 = (s1, s2) if len(s1) <= len(s2) else (s2, s1) | |
# set lengths | |
l1, l2 = len(s1), len(s2) | |
# We are simulatng an NM matrix where n is the longer string | |
# and m is the shorter string. By doing this we can minimize | |
# memory usage to O(M). | |
# Since we are simulating the matrix we only maintain two rows | |
# at a time the current row and the previous rows. | |
# A move from the current cell looking at the cell before it indicates | |
# consideration of an insert operation. | |
# A move from the current cell looking at the cell above it indicates | |
# consideration of a deletion | |
# Both operations are cost 1 | |
# A move from the current cell to the cell up and to the left indicates | |
# an edit operation of 0 cost for a matching character and a 1 cost for | |
# a non matching characters | |
# no row has been previously computed yet, set empty row | |
# Since this is also a Damerou-Levenshtein calculation transposition | |
# costs will be taken into account. These look back 2 characters to | |
# determine optimal cost based on a possible transposition | |
# example: aei -> aie with levensthein has a cost of 2 | |
# match a, change e->i change i->e => aie | |
# Damarau-Levenshtein has a cost of 1 | |
# match a, transpose ei to ie => aie | |
transpositionRow = None | |
prevRow = None | |
# build first leven matrix row | |
# The first row represents transformation from an empty string | |
# to the shorter string making it static [0-n] | |
# since this row is static we can set it as | |
# curRow and start computation at the second row or index 1 | |
curRow = range(0, l1 + 1)] | |
# use second length to loop through all the rows being built | |
# we start at row one | |
for rowNum in range(1, l2 + 1): | |
# set transposition, previous, and current | |
# because the rowNum always increments by one | |
# we can use rowNum to set the value representing | |
# the first column which is indicitive of transforming TO | |
# the empty string from our longer string | |
# transposition row maintains an extra row so that it is possible | |
# for us to apply Damarou's formula | |
transpositionRow, prevRow, curRow = prevRow, curRow, [rowNum] + [0] * l1 | |
# consider if we have passed the max distance if all paths through | |
# the transposition row are larger than the max we can stop calculating | |
# distance and return the last element in that row and return the max | |
if transpositionRow: | |
if not any(cellValue < maxDistance for cellValue in transpositionRow): | |
return maxDistance | |
for colNum in range(1, l1 + 1): | |
insertionCost = curRow[colNum - 1] + 1 | |
deletionCost = prevRow[colNum] + 1 | |
changeCost = prevRow[colNum - 1] + (0 if s1[colNum - 1] == s2[rowNum - 1] else 1) | |
# set the cell value - min distance to reach this | |
# position | |
curRow[colNum] = min(insertionCost, deletionCost, changeCost) | |
# test for a possible transposition optimization | |
# check to see if we have at least 2 characters | |
if 1 < rowNum <= colNum: | |
# test for possible transposition | |
if s1[colNum - 1] == s2[colNum - 2] and s2[colNum - 1] == s1[colNum - 2]: | |
curRow[colNum] = min(curRow[colNum], transpositionRow[colNum - 2] + 1) | |
# the last cell of the matrix is ALWAYS the shortest distance between the two strings | |
return curRow[-1] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def leven(s1, s2, maxDistance): | |
# get smallest string so our rows are minimized | |
s1, s2 = (s1, s2) if len(s1) <= len(s2) else (s2, s1) | |
# set lengths | |
l1, l2 = len(s1), len(s2) | |
# We are simulatng an NM matrix where n is the longer string | |
# and m is the shorter string. By doing this we can minimize | |
# memory usage to O(M). | |
# Since we are simulating the matrix we only maintain two rows | |
# at a time the current row and the previous rows. | |
# A move from the current cell looking at the cell before it indicates | |
# consideration of an insert operation. | |
# A move from the current cell looking at the cell above it indicates | |
# consideration of a deletion | |
# Both operations are cost 1 | |
# A move from the current cell to the cell up and to the left indicates | |
# an edit operation of 0 cost for a matching character and a 1 cost for | |
# a non matching characters | |
# no row has been previously computed yet, set empty row | |
# Since this is also a Damerou-Levenshtein calculation transposition | |
# costs will be taken into account. These look back 2 characters to | |
# determine optimal cost based on a possible transposition | |
# example: aei -> aie with levensthein has a cost of 2 | |
# match a, change e->i change i->e => aie | |
# Damarau-Levenshtein has a cost of 1 | |
# match a, transpose ei to ie => aie | |
transpositionRow = None | |
prevRow = None | |
# build first leven matrix row | |
# The first row represents transformation from an empty string | |
# to the shorter string making it static [0-n] | |
# since this row is static we can set it as | |
# curRow and start computation at the second row or index 1 | |
curRow = range(0, l1 + 1)] | |
# use second length to loop through all the rows being built | |
# we start at row one | |
for rowNum in range(1, l2 + 1): | |
# set transposition, previous, and current | |
# because the rowNum always increments by one | |
# we can use rowNum to set the value representing | |
# the first column which is indicitive of transforming TO | |
# the empty string from our longer string | |
# transposition row maintains an extra row so that it is possible | |
# for us to apply Damarou's formula | |
transpositionRow, prevRow, curRow = prevRow, curRow, [rowNum] + [0] * l1 | |
# consider if we have passed the max distance if all paths through | |
# the transposition row are larger than the max we can stop calculating | |
# distance and return the last element in that row and return the max | |
if transpositionRow: | |
if not any(cellValue < maxDistance for cellValue in transpositionRow): | |
return maxDistance | |
for colNum in range(1, l1 + 1): | |
insertionCost = curRow[colNum - 1] + 1 | |
deletionCost = prevRow[colNum] + 1 | |
changeCost = prevRow[colNum - 1] + (0 if s1[colNum - 1] == s2[rowNum - 1] else 1) | |
# set the cell value - min distance to reach this | |
# position | |
curRow[colNum] = min(insertionCost, deletionCost, changeCost) | |
# test for a possible transposition optimization | |
# check to see if we have at least 2 characters | |
if 1 < rowNum <= colNum: | |
# test for possible transposition | |
if s1[colNum - 1] == s2[colNum - 2] and s2[colNum - 1] == s1[colNum - 2]: | |
curRow[colNum] = min(curRow[colNum], transpositionRow[colNum - 2] + 1) | |
# the last cell of the matrix is ALWAYS the shortest distance between the two strings | |
return curRow[-1] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment