Skip to content

Instantly share code, notes, and snippets.

@tonyg
Last active Jul 18, 2021
Embed
What would you like to do?
Myers-Ukkonen diff in 39 lines of python
# Copyright (c) 2015 Tony Garnock-Jones <tonyg@leastfixedpoint.com>
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Text diff algorithm after Myers 1986 and Ukkonen 1985, following
# Levente Uzonyi's Squeak Smalltalk implementation at
# http://squeaksource.com/DiffMerge.html
#
# E. W. Myers, "An O(ND) difference algorithm and its variations,"
# Algorithmica, vol. 1, no. 1-4, pp. 251-266, Nov. 1986.
#
# E. Ukkonen, "Algorithms for approximate string matching," Inf.
# Control, vol. 64, no. 1-3, pp. 100-118, Jan. 1985.
def longest_common_subsequence(xs, ys):
totallen = len(xs) + len(ys)
frontier = [0] * (2 * totallen + 1)
candidates = [None] * (2 * totallen + 1)
for d in range(totallen + 1):
for k in range(-d, d+1, 2):
if k == -d or (k != d and frontier[totallen + k - 1] < frontier[totallen + k + 1]):
index = totallen + k + 1
x = frontier[index]
else:
index = totallen + k - 1
x = frontier[index] + 1
y = x - k
chain = candidates[index]
while x < len(xs) and y < len(ys) and xs[x] == ys[y]:
chain = ((x, y), chain)
x = x + 1
y = y + 1
if x >= len(xs) and y >= len(ys):
result = []
while chain:
result.append(chain[0])
chain = chain[1]
result.reverse()
return result
frontier[totallen + k] = x
candidates[totallen + k] = chain
def diff(xs, ys):
i = -1
j = -1
matches = longest_common_subsequence(xs, ys)
matches.append((len(xs), len(ys)))
result = []
for (mi, mj) in matches:
if mi - i > 1 or mj - j > 1:
result.append((i + 1, mi - i - 1, j + 1, mj - j - 1))
i = mi
j = mj
return result
if __name__ == '__main__':
def check(actual, expected):
if actual != expected:
print "Expected:", repr(expected)
print "Actual:", repr(actual)
print
check(diff("The red brown fox jumped over the rolling log",
"The brown spotted fox leaped over the rolling log"),
[(4,4,4,0), (14,0,10,8), (18,3,22,3)])
for (xs, ys, lcs) in [("acbcaca", "bcbcacb", [(1,1),(2,2),(3,3),(4,4),(5,5)]),
("bcbcacb", "acbcaca", [(1,1),(2,2),(3,3),(4,4),(5,5)]),
("acba", "bcbb", [(1,1),(2,2)]),
("abcabba", "cbabac", [(2,0),(3,2),(4,3),(6,4)]),
("cbabac", "abcabba", [(1,1),(2,3),(3,4),(4,6)]),
([[1,1,1],[1,1,1],[1,1,1],[1,1,1]],
[[1,1,1],[2,2,2],[1,1,1],[4,4,4]],
[(0,0),(1,2)])]:
check(longest_common_subsequence(xs, ys), lcs)
check(diff([[1,1,1],[1,1,1],[1,1,1],[1,1,1]],
[[1,1,1],[2,2,2],[1,1,1],[4,4,4]]),
[(1,0,1,1), (2,2,3,1)])
check(longest_common_subsequence("abc", "def"), [])
check(diff("abc", "def"), [(0,3,0,3)])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment