Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
difflib.SequenceMatcher.get_matching_blocks() doesn't return all results
# get_matching_blocks returns non-overlapping matches: https://bugs.python.org/issue35079
# the incomplete result is considered a feature
def get_matching_blocks(s1, s2, overlap=True):
if type(s1) is not str:
throw TypeError("get_matching_blocks() expects a string as its first positional argument")
if type(s2) is not str:
throw TypeError("get_matching_blocks() expects a string as its second positional argument")
if type(overlap) is not bool:
throw TypeError("get_matching_blocks() expects a bool as the named argument 'overlap'")
la, lb = len(s1), len(s2)
queue = [(0, la, 0, lb)]
matching_blocks = []
while queue:
alo, ahi, blo, bhi = queue.pop()
i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
if k: # if k is 0, there was no matching block
matching_blocks.append(x)
if alo < i and blo < j:
queue.append((alo, i, blo, j))
if i+k < ahi and j+k < bhi:
if overlap:
queue.append((i+1, ahi, j+1, bhi))
else:
queue.append((i+k, ahi, j+k, bhi))
matching_blocks.sort()
# It's possible that we have adjacent equal blocks in the
# matching_blocks list now. Starting with 2.5, this code was added
# to collapse them.
i1 = j1 = k1 = 0
non_adjacent = []
for i2, j2, k2 in matching_blocks:
# Is this block adjacent to i1, j1, k1?
if i1 + k1 == i2 and j1 + k1 == j2:
# Yes, so collapse them -- this just increases the length of
# the first block by the length of the second, and the first
# block so lengthened remains the block to compare against.
k1 += k2
else:
# Not adjacent. Remember the first block (k1==0 means it's
# the dummy we started with), and make the second block the
# new block to compare against.
if k1:
non_adjacent.append((i1, j1, k1))
i1, j1, k1 = i2, j2, k2
if k1:
non_adjacent.append((i1, j1, k1))
non_adjacent.append( (la, lb, 0) )
return list(map(Match._make, non_adjacent))
# Tested on Python 2.7.15, 3.7.0
import difflib
string1 = "TAGACCA"
string2 = "ATACA"
s = difflib.SequenceMatcher(None, string1, string2)
blox = s.get_matching_blocks()
print(blox)
print([string1[x.a:x.a+x.size] for x in blox if x.size > 1])
[Match(a=0, b=1, size=2), Match(a=5, b=3, size=2), Match(a=7, b=5, size=0)] # Missing Match(a=3, b=2, size=2)
['TA', 'CA'] # Missing the substring 'AC'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.