Skip to content

Instantly share code, notes, and snippets.

@hynekcer
Created January 12, 2018 02:30
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save hynekcer/fa340f3b63826168ffc0c4b33310ae9c to your computer and use it in GitHub Desktop.
Save hynekcer/fa340f3b63826168ffc0c4b33310ae9c to your computer and use it in GitHub Desktop.
fast longest common substring - by suffix array
#!/usr/bin/env python
"""Find the longest repeated substring.
"Efficient way to find longest duplicate string for Python (From Programming Pearls)"
http://stackoverflow.com/questions/13560037/
The algorithm is based on "Prefix doubling".
The worst time complexity is O(n (log n)^2). Memory requirements are linear.
"""
import time
from random import randint
import itertools
import sys
import unittest
from itertools import groupby
from operator import itemgetter
import logging
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
try:
log.addHandler(logging.NullHandler())
except AttributeError:
pass
def run():
if sys.argv[1:] == ['-']:
text = sys.stdin.read()
elif sys.argv[1:]:
print('Reading data...')
text = open(sys.argv[1]).read()
else:
text = 'banana'
print('Sorting...')
result = longest_common_substring(text)
print('Longest common substrings in "{0}..." are:\n{1}'.format(
text[:20], result))
def longest_common_substring(text):
"""Get the longest common substrings and their positions.
>>> longest_common_substring('banana')
{'ana': [1, 3]}
>>> text = "not so Agamemnon, who spoke fiercely to "
>>> sorted(longest_common_substring(text).items())
[(' s', [3, 21]), ('no', [0, 13]), ('o ', [5, 20, 38])]
This function can be easy modified for any criteria, e.g. for searching ten
longest non overlapping repeated substrings.
"""
sa, rsa, lcp = suffix_array(text)
maxlen = max(lcp)
result = {}
for i in range(1, len(text)):
if lcp[i] == maxlen:
j1, j2, h = sa[i - 1], sa[i], lcp[i]
assert text[j1:j1 + h] == text[j2:j2 + h]
substring = text[j1:j1 + h]
if substring not in result:
result[substring] = [j1]
result[substring].append(j2)
return dict((k, sorted(v)) for k, v in result.items())
def suffix_array(text, _step=16):
"""Analyze all common strings in the text.
Short substrings of the length _step a are first pre-sorted. The are the
results repeatedly merged so that the garanteed number of compared
characters bytes is doubled in every iteration until all substrings are
sorted exactly.
Arguments:
text: The text to be analyzed.
_step: Is only for optimization and testing. It is the optimal length
of substrings used for initial pre-sorting. The bigger value is
faster if there is enough memory. Memory requirements are
approximately (estimate for 32 bit Python 3.3):
len(text) * (29 + (_size + 20 if _size > 2 else 0)) + 1MB
Return value: (tuple)
(sa, rsa, lcp)
sa: Suffix array for i in range(1, size):
assert text[sa[i-1]:] < text[sa[i]:]
rsa: Reverse suffix array for i in range(size):
assert rsa[sa[i]] == i
lcp: Longest common prefix for i in range(1, size):
assert text[sa[i-1]:sa[i-1]+lcp[i]] == text[sa[i]:sa[i]+lcp[i]]
if sa[i-1] + lcp[i] < len(text):
assert text[sa[i-1] + lcp[i]] < text[sa[i] + lcp[i]]
>>> suffix_array(text='banana')
([5, 3, 1, 0, 4, 2], [3, 2, 5, 1, 4, 0], [0, 1, 3, 0, 0, 2])
Explanation: 'a' < 'ana' < 'anana' < 'banana' < 'na' < 'nana'
The Longest Common String is 'ana': lcp[2] == 3 == len('ana')
It is between tx[sa[1]:] == 'ana' < 'anana' == tx[sa[2]:]
"""
tx = text
t0 = time.time()
size = len(tx)
step = min(max(_step, 1), len(tx))
sa = list(range(len(tx)))
log.debug('%6.3f pre sort', time.time() - t0)
sa.sort(key=lambda i: tx[i:i + step])
log.debug('%6.3f after sort', time.time() - t0)
grpstart = size * [False] + [True] # a boolean map for iteration speedup.
# It helps to skip yet resolved values. The last value True is a sentinel.
rsa = size * [None]
stgrp, igrp = '', 0
for i, pos in enumerate(sa):
st = tx[pos:pos + step]
if st != stgrp:
grpstart[igrp] = (igrp < i - 1)
stgrp = st
igrp = i
rsa[pos] = igrp
sa[i] = pos
grpstart[igrp] = (igrp < size - 1 or size == 0)
log.debug('%6.3f after group', time.time() - t0)
while grpstart.index(True) < size:
# assert step <= size
nmerge = 0
nextgr = grpstart.index(True)
while nextgr < size:
igrp = nextgr
nextgr = grpstart.index(True, igrp + 1)
glist = []
for ig in range(igrp, nextgr):
pos = sa[ig]
if rsa[pos] != igrp:
break
newgr = rsa[pos + step] if pos + step < size else -1
glist.append((newgr, pos))
glist.sort()
for ig, g in groupby(glist, key=itemgetter(0)):
g = [x[1] for x in g]
sa[igrp:igrp + len(g)] = g
grpstart[igrp] = (len(g) > 1)
for pos in g:
rsa[pos] = igrp
igrp += len(g)
nmerge += len(glist)
log.debug('%6.3f for step=%d nmerge=%d', time.time() - t0, step, nmerge)
step *= 2
del grpstart
# create LCP array
lcp = size * [None]
h = 0
for i in range(size):
if rsa[i] > 0:
j = sa[rsa[i] - 1]
while i != size - h and j != size - h and tx[i + h] == tx[j + h]:
h += 1
lcp[rsa[i]] = h
if h > 0:
h -= 1
if size > 0:
lcp[0] = 0
log.debug('%6.3f end', time.time() - t0)
return sa, rsa, lcp
# ---
class TestMixin(object):
def suffix_verify(self, text, step=16):
tx = text
sa, rsa, lcp = suffix_array(text=tx, _step=step)
self.assertEqual(set(sa), set(range(len(tx))))
ok = True
for i0, i1, h in zip(sa[:-1], sa[1:], lcp[1:]):
self.assertEqual(tx[i1:i1 + h], tx[i0:i0 + h], "Verify LCP characters equal on text '%s...'" % text[:20])
self.assertGreater(tx[i1 + h:i1 + h + 1], tx[i0 + h:i0 + h + 1],
"Verify LCP+1 char is different '%s...'" % text[:20])
self.assertLessEqual(max(i0, i1), len(tx) - h,
"Verify LCP is not more than length of string '%s...'" % text[:20])
self.assertTrue(ok)
class SuffixArrayTest(unittest.TestCase, TestMixin):
def test_16(self):
# 'a' < 'ana' < 'anana' < 'banana' < 'na' < 'nana'
expect = ([5, 3, 1, 0, 4, 2], [3, 2, 5, 1, 4, 0], [0, 1, 3, 0, 0, 2])
self.assertEqual(suffix_array(text='banana', _step=16), expect)
def test_1(self):
expect = ([5, 3, 1, 0, 4, 2], [3, 2, 5, 1, 4, 0], [0, 1, 3, 0, 0, 2])
self.assertEqual(suffix_array(text='banana', _step=1), expect)
def test_mini(self):
self.assertEqual(suffix_array(text='', _step=1), ([], [], []))
self.assertEqual(suffix_array(text='a', _step=1), ([0], [0], [0]))
self.assertEqual(suffix_array(text='aa', _step=1), ([1, 0], [1, 0], [0, 1]))
self.assertEqual(suffix_array(text='aaa', _step=1), ([2, 1, 0], [2, 1, 0], [0, 1, 2]))
def test_example(self):
self.suffix_verify('abracadabra')
def test_cartesian(self):
"""Test all combinations of alphabet "ABC" up to length 4 characters"""
for size in range(7):
for cartesian in itertools.product(*(size * ['ABC'])):
text = ''.join(cartesian)
log.debug('Testing "%s"', text)
self.suffix_verify(text, 1)
def test_lcp(self):
expect = {'ana': [1, 3]}
self.assertDictEqual(longest_common_substring('banana'), expect)
expect = {' s': [3, 21], 'no': [0, 13], 'o ': [5, 20, 38]}
self.assertDictEqual(longest_common_substring(
"not so Agamemnon, who spoke fiercely to "), expect)
class SlowTests(unittest.TestCase, TestMixin):
"""Slow development tests running many minutes.
It can be run only by an EXPLICIT command!
e.g.: python -m unittest maxsubstring.SlowTests._test_random
"""
def _test_random(self):
for power in range(2, 21, 2):
size = randint(2 ** (power - 1), 2 ** power)
for alphabet in (2, 4, 16, 256):
text = ''.join(chr(65 + randint(0, alphabet - 1)) for _ in range(size))
log.debug('%s %s %s', size, alphabet, 1)
self.suffix_verify(text, 1)
log.debug('%s %s %s', size, alphabet, 16)
self.suffix_verify(text, 16)
if __name__ == '__main__':
run()
@hdf
Copy link

hdf commented Feb 3, 2021

@likengmx I have written an ugly little function that does what you probably want:

def LRS(text, m=3, n=10): # Longest repeated non-overlapping substrings
    """
    Returns longest (maximum n number of) non-overlapping (minimum m times)
    repeated strings and how many times they are found in the text in
    descending order of count*len(substring)^2.
    """
    sa, rsa, lcp = suffix_array(text)
    result = {}
    for i in range(0, len(sa)):
        l = lcp[i]
        if l < 2: # Don't bother with individual characters.
            continue
        substring = text[sa[i]:sa[i] + l]
        c = text.count(substring) # This is the bottleneck.
        if c < m: # Not found enough times.
            continue
        halt = False
        for s in list(result.keys()): # Filter out overlaps.
            if len(s) >= l:
                if substring in s:
                    halt = True
                    break
            elif s in substring:
                del result[s]
        if not halt:
            result[substring] = c
    return dict(sorted(result.items(),
                key=lambda x: x[1]*(len(x[0])**2), reverse=True)[:n])

With this, we can for example look around in relatively large text files or code, and look for duplicates, prioritising long and common ones. Might be a nice feature to add to the search window in Notepad++... 😃

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment