Last active
November 4, 2018 02:14
-
-
Save MatthewRalston/b0ab6ac1dbe322cb12063310ccdbb786 to your computer and use it in GitHub Desktop.
difflib.SequenceMatcher.get_matching_blocks() doesn't return all results
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# get_matching_blocks returns non-overlapping matches: https://bugs.python.org/issue35079 | |
# the incomplete result is considered a feature | |
def get_matching_blocks(s1, s2, overlap=True): | |
if type(s1) is not str: | |
throw TypeError("get_matching_blocks() expects a string as its first positional argument") | |
if type(s2) is not str: | |
throw TypeError("get_matching_blocks() expects a string as its second positional argument") | |
if type(overlap) is not bool: | |
throw TypeError("get_matching_blocks() expects a bool as the named argument 'overlap'") | |
la, lb = len(s1), len(s2) | |
queue = [(0, la, 0, lb)] | |
matching_blocks = [] | |
while queue: | |
alo, ahi, blo, bhi = queue.pop() | |
i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi) | |
if k: # if k is 0, there was no matching block | |
matching_blocks.append(x) | |
if alo < i and blo < j: | |
queue.append((alo, i, blo, j)) | |
if i+k < ahi and j+k < bhi: | |
if overlap: | |
queue.append((i+1, ahi, j+1, bhi)) | |
else: | |
queue.append((i+k, ahi, j+k, bhi)) | |
matching_blocks.sort() | |
# It's possible that we have adjacent equal blocks in the | |
# matching_blocks list now. Starting with 2.5, this code was added | |
# to collapse them. | |
i1 = j1 = k1 = 0 | |
non_adjacent = [] | |
for i2, j2, k2 in matching_blocks: | |
# Is this block adjacent to i1, j1, k1? | |
if i1 + k1 == i2 and j1 + k1 == j2: | |
# Yes, so collapse them -- this just increases the length of | |
# the first block by the length of the second, and the first | |
# block so lengthened remains the block to compare against. | |
k1 += k2 | |
else: | |
# Not adjacent. Remember the first block (k1==0 means it's | |
# the dummy we started with), and make the second block the | |
# new block to compare against. | |
if k1: | |
non_adjacent.append((i1, j1, k1)) | |
i1, j1, k1 = i2, j2, k2 | |
if k1: | |
non_adjacent.append((i1, j1, k1)) | |
non_adjacent.append( (la, lb, 0) ) | |
return list(map(Match._make, non_adjacent)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tested on Python 2.7.15, 3.7.0 | |
import difflib | |
string1 = "TAGACCA" | |
string2 = "ATACA" | |
s = difflib.SequenceMatcher(None, string1, string2) | |
blox = s.get_matching_blocks() | |
print(blox) | |
print([string1[x.a:x.a+x.size] for x in blox if x.size > 1]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[Match(a=0, b=1, size=2), Match(a=5, b=3, size=2), Match(a=7, b=5, size=0)] # Missing Match(a=3, b=2, size=2) | |
['TA', 'CA'] # Missing the substring 'AC' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment