MatthewRalston/difflib.patch.py

## difflib.patch.py
# get_matching_blocks returns non-overlapping matches: https://bugs.python.org/issue35079
# the incomplete result is considered a feature

def get_matching_blocks(s1, s2, overlap=True):
  if type(s1) is not str:
    throw TypeError("get_matching_blocks() expects a string as its first positional argument")
  if type(s2) is not str:
    throw TypeError("get_matching_blocks() expects a string as its second positional argument")
  if type(overlap) is not bool:
    throw TypeError("get_matching_blocks() expects a bool as the named argument 'overlap'")

  la, lb = len(s1), len(s2)

  queue = [(0, la, 0, lb)]
  matching_blocks = []
  while queue:
    alo, ahi, blo, bhi = queue.pop()
    i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
    if k:   # if k is 0, there was no matching block
      matching_blocks.append(x)
      if alo < i and blo < j:
        queue.append((alo, i, blo, j))
      if i+k < ahi and j+k < bhi:
        if overlap:
          queue.append((i+1, ahi, j+1, bhi))
        else:
          queue.append((i+k, ahi, j+k, bhi))
  matching_blocks.sort()

  # It's possible that we have adjacent equal blocks in the
  # matching_blocks list now.  Starting with 2.5, this code was added
  # to collapse them.
  i1 = j1 = k1 = 0
  non_adjacent = []
  for i2, j2, k2 in matching_blocks:
    # Is this block adjacent to i1, j1, k1?
    if i1 + k1 == i2 and j1 + k1 == j2:
      # Yes, so collapse them -- this just increases the length of
      # the first block by the length of the second, and the first
      # block so lengthened remains the block to compare against.
      k1 += k2
    else:
      # Not adjacent.  Remember the first block (k1==0 means it's
      # the dummy we started with), and make the second block the
      # new block to compare against.
      if k1:
         non_adjacent.append((i1, j1, k1))
      i1, j1, k1 = i2, j2, k2
  if k1:
    non_adjacent.append((i1, j1, k1))

  non_adjacent.append( (la, lb, 0) )
  return list(map(Match._make, non_adjacent))

## incomplete_matching_blocks.py
# Tested on Python 2.7.15, 3.7.0


import difflib

string1 = "TAGACCA"
string2 = "ATACA"

s = difflib.SequenceMatcher(None, string1, string2)
blox = s.get_matching_blocks()
print(blox)
print([string1[x.a:x.a+x.size] for x in blox if x.size > 1])

## output.txt
[Match(a=0, b=1, size=2), Match(a=5, b=3, size=2), Match(a=7, b=5, size=0)] # Missing Match(a=3, b=2, size=2)
['TA', 'CA'] # Missing the substring 'AC'
	# get_matching_blocks returns non-overlapping matches: https://bugs.python.org/issue35079
	# the incomplete result is considered a feature

	def get_matching_blocks(s1, s2, overlap=True):
	if type(s1) is not str:
	throw TypeError("get_matching_blocks() expects a string as its first positional argument")
	if type(s2) is not str:
	throw TypeError("get_matching_blocks() expects a string as its second positional argument")
	if type(overlap) is not bool:
	throw TypeError("get_matching_blocks() expects a bool as the named argument 'overlap'")

	la, lb = len(s1), len(s2)

	queue = [(0, la, 0, lb)]
	matching_blocks = []
	while queue:
	alo, ahi, blo, bhi = queue.pop()
	i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
	if k: # if k is 0, there was no matching block
	matching_blocks.append(x)
	if alo < i and blo < j:
	queue.append((alo, i, blo, j))
	if i+k < ahi and j+k < bhi:
	if overlap:
	queue.append((i+1, ahi, j+1, bhi))
	else:
	queue.append((i+k, ahi, j+k, bhi))
	matching_blocks.sort()

	# It's possible that we have adjacent equal blocks in the
	# matching_blocks list now. Starting with 2.5, this code was added
	# to collapse them.
	i1 = j1 = k1 = 0
	non_adjacent = []
	for i2, j2, k2 in matching_blocks:
	# Is this block adjacent to i1, j1, k1?
	if i1 + k1 == i2 and j1 + k1 == j2:
	# Yes, so collapse them -- this just increases the length of
	# the first block by the length of the second, and the first
	# block so lengthened remains the block to compare against.
	k1 += k2
	else:
	# Not adjacent. Remember the first block (k1==0 means it's
	# the dummy we started with), and make the second block the
	# new block to compare against.
	if k1:
	non_adjacent.append((i1, j1, k1))
	i1, j1, k1 = i2, j2, k2
	if k1:
	non_adjacent.append((i1, j1, k1))

	non_adjacent.append( (la, lb, 0) )
	return list(map(Match._make, non_adjacent))
	# Tested on Python 2.7.15, 3.7.0


	import difflib

	string1 = "TAGACCA"
	string2 = "ATACA"

	s = difflib.SequenceMatcher(None, string1, string2)
	blox = s.get_matching_blocks()
	print(blox)
	print([string1[x.a:x.a+x.size] for x in blox if x.size > 1])
	[Match(a=0, b=1, size=2), Match(a=5, b=3, size=2), Match(a=7, b=5, size=0)] # Missing Match(a=3, b=2, size=2)
	['TA', 'CA'] # Missing the substring 'AC'