Skip to content

Instantly share code, notes, and snippets.

@jlherren
Created February 19, 2020 16:10
Show Gist options
  • Save jlherren/d97839b1276b9bd7faa930f74711a4b6 to your computer and use it in GitHub Desktop.
Save jlherren/d97839b1276b9bd7faa930f74711a4b6 to your computer and use it in GitHub Desktop.
Find Levenshtein distance between two strings and construct edit instructions to go from one string to the other
import numpy
def wagner_fisher(s: str, t: str):
"""
Computes the Levenshtein distance between the two strings. Returns a tuple containing
the distance itself and also the entire matrix for further processing.
See: https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm
"""
m, n = len(s), len(t)
d = numpy.zeros(shape=(m + 1, n + 1), dtype='int32')
for i in range(1, m + 1):
d[i, 0] = i
for j in range(1, n + 1):
d[0, j] = j
for j in range(1, n + 1):
for i in range(1, m + 1):
if s[i - 1] == t[j - 1]:
substitutionCost = 0
else:
substitutionCost = 1
d[i, j] = min(d[i - 1, j] + 1, d[i, j - 1] + 1, d[i - 1, j - 1] + substitutionCost)
return d[m, n], d
def edit_instructions(s: str, t: str):
"""
Compute the edit operations required to get from string s to string t
"""
distance, d = wagner_fisher(s, t)
m, n = len(s), len(t)
instructions = []
while m > 0 or n > 0:
deletion_score = d[m - 1, n] if m >= 1 else float('inf')
insertion_score = d[m, n - 1] if n >= 1 else float('inf')
substitution_or_noop_score = d[m - 1, n - 1] if m >= 1 and n >= 1 else float('inf')
smallest = min(deletion_score, insertion_score, substitution_or_noop_score)
if smallest == substitution_or_noop_score:
if d[m - 1, n - 1] < d[m, n]:
instructions.append('substitute "%s" with "%s" at position %d' % (s[m - 1], t[n - 1], n - 1))
m -= 1
n -= 1
elif smallest == deletion_score:
instructions.append('delete "%s" at position %d' % (s[m - 1], n))
m -= 1
elif smallest == insertion_score:
instructions.append('insert "%s" at position %d' % (t[n - 1], n - 1))
n -= 1
if distance != len(instructions):
raise Exception('Internal error')
return instructions[::-1]
# A few tests
def test(s, t, instructions):
i = edit_instructions(s, t)
i = ', '.join(i)
if i != instructions:
print('Test failed for "%s" and "%s"' % (s, t))
print('Expected:', instructions)
print('Obtained:', i)
for s in ['', 'a', 'aa', 'aaa', 'ab', 'abc', 'abcd']:
test(s, s, '')
test('aa', 'a', 'delete "a" at position 1')
test('a', 'aa', 'insert "a" at position 1')
test('a', 'b', 'substitute "a" with "b" at position 0')
test('a', 'ab', 'insert "b" at position 1')
test('a', 'ba', 'insert "b" at position 0')
test('bc', 'abc', 'insert "a" at position 0')
test('ac', 'abc', 'insert "b" at position 1')
test('ab', 'abc', 'insert "c" at position 2')
test('abc', 'bc', 'delete "a" at position 0')
test('abc', 'ac', 'delete "b" at position 1')
test('abc', 'ab', 'delete "c" at position 2')
test('abc', 'cba', 'substitute "a" with "c" at position 0, substitute "c" with "a" at position 2')
test('abc', '123', 'substitute "a" with "1" at position 0, substitute "b" with "2" at position 1, ' +
'substitute "c" with "3" at position 2')
test('abc', 'ab3', 'substitute "c" with "3" at position 2')
test('abc', 'a2c', 'substitute "b" with "2" at position 1')
test('abc', '1bc', 'substitute "a" with "1" at position 0')
test('abc--abc', 'bc--bc', 'delete "a" at position 0, delete "a" at position 4')
test('abc--abc', 'bc--0abc', 'delete "a" at position 0, insert "0" at position 4')
test('abc--abc', 'bc--1bc', 'delete "a" at position 0, substitute "a" with "1" at position 4')
test('abc--abc', '0abc--bc', 'insert "0" at position 0, delete "a" at position 6')
test('abc--abc', '0abc--0abc', 'insert "0" at position 0, insert "0" at position 6')
test('abc--abc', '0abc--1abc', 'insert "0" at position 0, insert "1" at position 6')
test('abc--abc', '1bc--bc', 'substitute "a" with "1" at position 0, delete "a" at position 5')
test('abc--abc', '1bc--0abc', 'substitute "a" with "1" at position 0, insert "0" at position 5')
test('abc--abc', '1bc--1bc', 'substitute "a" with "1" at position 0, substitute "a" with "1" at position 5')
test('abcdefghijklmnopqrstuvwxyz', 'a0bcefGhi0jkmnOpq0rsuvWxyz', 'insert "0" at position 1, delete "d" at position 4, ' +
'substitute "g" with "G" at position 6, insert "0" at position 9, delete "l" at position 12, ' +
'substitute "o" with "O" at position 14, insert "0" at position 17, delete "t" at position 20, ' +
'substitute "w" with "W" at position 22')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment