Skip to content

Instantly share code, notes, and snippets.

@hdf
Forked from hynekcer/maxsubstring.py
Last active February 5, 2021 23:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hdf/5f0deb03e1b4fc52425ce3d9cd67cf1c to your computer and use it in GitHub Desktop.
Save hdf/5f0deb03e1b4fc52425ce3d9cd67cf1c to your computer and use it in GitHub Desktop.
fast longest common substring - by suffix array
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Redundancy checker</title>
<meta name="viewport" content="width=device-width, initial-scale=1">
<!--<script src="lrs.min.js"></script>-->
<style>
html {
font-family: 'Verdana', sans-serif;
margin: 0;
padding: 0;
}
pre {
margin: 0px;
}
#main {
background-color: #fff;
border: 1px solid #999;
border-spacing: 5px;
width: 100%;
}
#main>div {
display: flex;
}
#main>div>div {
background-color: #eee;
padding: 3px 6px 3px 6px;
margin: 1px;
flex: 0 0 110px;
}
#main>div>div:nth-child(2n) {
flex: 1 0 auto;
}
#main>div>div:nth-child(3n) {
flex: 0 1 145px;
}
.header {
font-size: 1.2em;
font-weight: bold;
}
center {
display: flex;
align-items: top;
justify-content: center;
}
#file {
display: none;
}
#btn {
margin-bottom: 16px;
padding: 0px 5px 0px 5px;
height: 72px;
flex: 1 1 0%;
max-width: 256px;
text-align: center;
font-size: 36px;
line-height: 72px;
background: rgb(42, 128, 225) none repeat scroll 0% 0%;
color: rgb(255, 255, 255);
border-radius: 12px;
box-shadow: inset 0 5px 5px rgba(0,0,0,0.05),0 0 5px rgba(0,0,0,0.8);
}
#btn:hover {
background: rgb(82, 64, 255) none repeat scroll 0% 0%;
}
#chk {
display: flex;
margin: 0px 0px 16px 5px;
align-items: center;
width: 400px;
padding: 0px 6px 0px 6px;
background-color: #efefef;
border-radius: 9px;
}
</style>
</head>
<body>
<center>
<input type="file" name="file" id="file">
<label id="btn" for="file" tabindex="0">Browse</label>
&nbsp;
<div id="chk">
<input type="checkbox" id="str" checked="checked">
<label for="str" tabindex="1" style="text-align: left; padding-left: 5px;">
Read file as string, rather than binary. This may result in incorrect "Occurs at:" numbers, but will increase search speed significantly.
</label>
</div>
</center>
<div id="main">
</div>
<script>
if(function(){var r=Math.floor,t=function(r){return r};function n(r,n){var e,o,f=r.length,s=[],i=f,u=-1,c=0;for(n=n||t;i--;)u=Math.max(n(r[i]),u);for(o=(u>>24?32:u>>16&&24)||u>>8&&16||8;c<o;c+=4){for(i=16;i--;)s[i]=[];for(i=f;i--;)s[n(r[i])>>c&15].push(r[i]);for(e=0;e<16;e++)for(u=s[e].length;u--;)r[++i]=s[e][u]}return r}function e(r){return"function"==typeof r?r:function(r){return"[object String]"==Object.prototype.toString.call(r)}(r)?function(t){return r.charCodeAt(t)}:function(t){return r[t]}}this.suffixArray=function(t,o){var f;return o="number"==typeof(f=o)||f instanceof Number?o:t.length,function t(e,o,f){var s,i,u,c=[],a=[],l=r(2*o/3),g=o-l,d=l+1>>1,b=l,h=0,p=[],y=[];if(1==o)return[[0],[0]];u=function(r){return r>=o?0:e(r)};for(;b--;)c[b]=1+(3*b>>1);for(b=3;b--;)n(c,function(r){return u(b+r)});h=a[r(c[0]/3)+(c[0]%3==1?0:d)]=1;for(b=1;b<l;b++)u(c[b])==u(c[b-1])&&u(c[b]+1)==u(c[b-1]+1)&&u(c[b]+2)==u(c[b-1]+2)||h++,a[r(c[b]/3)+(c[b]%3==1?0:d)]=h;if(h<l)for(a=t(function(r){return a[r]},l,!0),b=l;b--;)c[b]=a[b]<d?3*a[b]+1:3*(a[b]-d)+2;for(b=l;b--;)p[c[b]]=b;p[o]=-1;p[o+1]=-2;i=function(r,t){return u(r)-u(t)||(r%3==2?u(r+1)-u(t+1)||p[r+2]-p[t+2]:p[r+1]-p[t+1])};a=o%3==1?[o-1]:[];for(b=0;b<l;b++)c[b]%3==1&&a.push(c[b]-1);n(a,function(r){return u(r)});if(f){for(b=0,h=0,s=0;b<l&&h<g;)y[s++]=i(c[b],a[h])<0?c[b++]:a[h++];for(;b<l;)y[s++]=c[b++];for(;h<g;)y[s++]=a[h++];return y}var v,S,x=[],j=[0],m=0;for(b=0,h=0,s=0;b<l&&h<g;)if(v=s,y[s++]=i(c[b],a[h])<0?c[b++]:a[h++],x[y[v]]=v,x[v]>0){for(S=y[x[v]-1];v!=o-m&&S!=o-m&&u(v+m)==u(S+m);)m++;j[x[v]]=m,m>0&&m--}for(;b<l;)if(v=s,y[s++]=c[b++],x[y[v]]=v,x[v]>0){for(S=y[x[v]-1];v!=o-m&&S!=o-m&&u(v+m)==u(S+m);)m++;j[x[v]]=m,m>0&&m--}for(;h<g;)if(v=s,y[s++]=a[h++],x[y[v]]=v,x[v]>0){for(S=y[x[v]-1];v!=o-m&&S!=o-m&&u(v+m)==u(S+m);)m++;j[x[v]]=m,m>0&&m--}return[y,j]}(e(t),o)},this.LRS=function(r,t,n){t=t||3,n=n||10,nots=r.constructor===Uint8Array,decoder=new TextDecoder;for(var e,o,f,[i,u]=suffixArray(r),c={},a=nots?decoder.decode(r):r,l=0;l<i.length;l++)if(!((e=u[l])<2||(substring=nots?r.subarray(i[l],i[l]+e):r.substring(i[l],i[l]+e),substring=nots?decoder.decode(substring):substring,(o=a.split(substring).length-1)<t))){for(s in f=!1,c)if(s.length>=e){if(s.includes(substring)){f=!0;break}}else substring.includes(s)&&delete c[s];f||(c[substring]=[o,i[l]])}return Object.entries(c).sort((r,t)=>t[1][0]*t[0].length**2-r[1][0]*r[0].length**2).slice(0,n).reduce((r,t)=>(r[t[0]]=c[t[0]],r),{})},this.suffixArray.bsort=n}.call(),"object"==typeof process){var fs=require("fs"),file=fs.readFileSync(process.argv.length>2?process.argv[2]:"suffixarray.js","utf8");console.log(LRS(file))}else"function"==typeof importScripts&&addEventListener("message",r=>postMessage(LRS(r.data)));
</script>
<script>
(function() {
var msgs = ['Browse', 'Browse again'], b;
function makeWorker(f) {
//var blob = new Blob(['(', f.toString(), ')();'], { type: 'text/javascript'});
var blob = new Blob([f.toString()], {type: 'text/javascript'});
var blobUrl = URL.createObjectURL(blob);
var worker = new Worker(blobUrl);
URL.revokeObjectURL(blobUrl);
return worker;
}
//var w = makeWorker(e => postMessage(LRS(e.data)));
var w = makeWorker(document.getElementsByTagName('script')[0].innerHTML.trim());
w.addEventListener('message', function(e) {
//var res = LRS(e.target.result);
var res = e.data;
var m = document.getElementById('main');
m.innerHTML = ' <div class="header"><div>Occurs at:</div><div>Substring:</div><div>Found count:</div></div>\n';
for (r in res)
m.innerHTML += ' <div><div>' + res[r][1] + '</div><div><pre>' + r + '</pre></div><div>' + res[r][0] + '</div></div>\n';
//document.getElementById('btn').textContent = msgs[Math.floor(Math.random() * msgs.length)];
document.getElementById('btn').textContent = msgs[b=b?0:1];
});
function selectHandler(e) {
var reader = new FileReader();
var d = new TextDecoder();
reader.onload = function(e) {
document.getElementById('btn').textContent = 'Processing...';
w.postMessage((typeof e.target.result==='string')?e.target.result:(new Uint8Array(e.target.result)));
};
if (document.getElementById('str').checked)
reader.readAsText(e.target.files[0]);
else
reader.readAsArrayBuffer(e.target.files[0]);
}
document.getElementById('file').addEventListener('change', selectHandler, false);
})();
</script>
</body>
</html>
#!/usr/bin/env python
# From: https://gist.github.com/hdf/5f0deb03e1b4fc52425ce3d9cd67cf1c
# Original from: https://gist.github.com/hynekcer/fa340f3b63826168ffc0c4b33310ae9c
"""Find the longest repeated substring.
"Efficient way to find longest duplicate string for Python (From Programming Pearls)"
http://stackoverflow.com/questions/13560037/
The algorithm is based on "Prefix doubling".
The worst time complexity is O(n (log n)^2). Memory requirements are linear.
"""
import time
from random import randint
import itertools
import sys
import unittest
from itertools import groupby
from operator import itemgetter
import logging
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
try:
log.addHandler(logging.NullHandler())
except AttributeError:
pass
cache = False
def run():
if sys.argv[1:] == ['-']:
print("(Return/Enter and CTRL+D or CTRL+Z" +
" and Return/Enter again to finish.)")
text = sys.stdin.read()
elif sys.argv[1:]:
print('Reading data...')
text = open(sys.argv[1]).read()
else:
text = 'banana\nabababab'
print('Sorting...')
global cache
cache = False
result = longest_common_substring(text)
print('Longest common substrings in "{0}..." are:\n{1}'.format(
text[:20], result))
result = LRS(text)
print('Top (at most) 10, atleast 3 times repeated longest' +
' non-overlapping substrings are:\n{0}'.format(result))
def LRS(text, m=3, n=10): # Longest repeated non-overlapping substrings
"""
Returns longest (maximum n number of) non-overlapping (minimum m times)
repeated strings and how many times they are found in the text in
descending order of count*len(substring)^2.
"""
sa, rsa, lcp = suffix_array(text)
result = {}
for i in range(0, len(sa)):
l = lcp[i]
if l < 2: # Don't bother with individual characters.
continue
substring = text[sa[i]:sa[i] + l]
c = text.count(substring) # This is the bottleneck.
if c < m: # Not found enough times.
continue
halt = False
for s in list(result.keys()): # Filter out overlaps.
if len(s) >= l:
if substring in s:
halt = True
break
elif s in substring:
del result[s]
if not halt:
result[substring] = c
return dict(sorted(result.items(),
key=lambda x: x[1]*(len(x[0])**2), reverse=True)[:n])
def longest_common_substring(text):
"""Get the longest common substrings and their positions.
>>> longest_common_substring('banana')
{'ana': [1, 3]}
>>> text = "not so Agamemnon, who spoke fiercely to "
>>> sorted(longest_common_substring(text).items())
[(' s', [3, 21]), ('no', [0, 13]), ('o ', [5, 20, 38])]
This function can be easy modified for any criteria, e.g. for searching ten
longest non overlapping repeated substrings.
"""
sa, rsa, lcp = suffix_array(text)
maxlen = max(lcp)
result = {}
for i in range(1, len(text)):
if lcp[i] == maxlen:
j1, j2, h = sa[i - 1], sa[i], lcp[i]
assert text[j1:j1 + h] == text[j2:j2 + h]
substring = text[j1:j1 + h]
if substring not in result:
result[substring] = [j1]
result[substring].append(j2)
return dict((k, sorted(v)) for k, v in result.items())
def suffix_array(text, _step=16):
global cache
if cache and "unittest" not in sys.modules:
return cache
"""Analyze all common strings in the text.
Short substrings of the length _step a are first pre-sorted. The are the
results repeatedly merged so that the garanteed number of compared
characters bytes is doubled in every iteration until all substrings are
sorted exactly.
Arguments:
text: The text to be analyzed.
_step: Is only for optimization and testing. It is the optimal length
of substrings used for initial pre-sorting. The bigger value is
faster if there is enough memory. Memory requirements are
approximately (estimate for 32 bit Python 3.3):
len(text) * (29 + (_size + 20 if _size > 2 else 0)) + 1MB
Return value: (tuple)
(sa, rsa, lcp)
sa: Suffix array for i in range(1, size):
assert text[sa[i-1]:] < text[sa[i]:]
rsa: Reverse suffix array for i in range(size):
assert rsa[sa[i]] == i
lcp: Longest common prefix for i in range(1, size):
assert text[sa[i-1]:sa[i-1]+lcp[i]] == text[sa[i]:sa[i]+lcp[i]]
if sa[i-1] + lcp[i] < len(text):
assert text[sa[i-1] + lcp[i]] < text[sa[i] + lcp[i]]
>>> suffix_array(text='banana')
([5, 3, 1, 0, 4, 2], [3, 2, 5, 1, 4, 0], [0, 1, 3, 0, 0, 2])
Explanation: 'a' < 'ana' < 'anana' < 'banana' < 'na' < 'nana'
The Longest Common String is 'ana': lcp[2] == 3 == len('ana')
It is between tx[sa[1]:] == 'ana' < 'anana' == tx[sa[2]:]
"""
tx = text
t0 = time.time()
size = len(tx)
step = min(max(_step, 1), len(tx))
sa = list(range(len(tx)))
log.debug('%6.3f pre sort', time.time() - t0)
sa.sort(key=lambda i: tx[i:i + step])
log.debug('%6.3f after sort', time.time() - t0)
grpstart = size * [False] + [True] # a boolean map for iteration speedup.
# It helps to skip yet resolved values. The last value True is a sentinel.
rsa = size * [None]
stgrp, igrp = '', 0
for i, pos in enumerate(sa):
st = tx[pos:pos + step]
if st != stgrp:
grpstart[igrp] = (igrp < i - 1)
stgrp = st
igrp = i
rsa[pos] = igrp
sa[i] = pos
grpstart[igrp] = (igrp < size - 1 or size == 0)
log.debug('%6.3f after group', time.time() - t0)
while grpstart.index(True) < size:
# assert step <= size
nmerge = 0
nextgr = grpstart.index(True)
while nextgr < size:
igrp = nextgr
nextgr = grpstart.index(True, igrp + 1)
glist = []
for ig in range(igrp, nextgr):
pos = sa[ig]
if rsa[pos] != igrp:
break
newgr = rsa[pos + step] if pos + step < size else -1
glist.append((newgr, pos))
glist.sort()
for ig, g in groupby(glist, key=itemgetter(0)):
g = [x[1] for x in g]
sa[igrp:igrp + len(g)] = g
grpstart[igrp] = (len(g) > 1)
for pos in g:
rsa[pos] = igrp
igrp += len(g)
nmerge += len(glist)
log.debug('%6.3f for step=%d nmerge=%d', time.time() - t0, step, nmerge)
step *= 2
del grpstart
# create LCP array
lcp = size * [0]
h = 0
for i in range(size):
if rsa[i] > 0:
j = sa[rsa[i] - 1]
while i != size - h and j != size - h and tx[i + h] == tx[j + h]:
h += 1
lcp[rsa[i]] = h
if h > 0:
h -= 1
log.debug('%6.3f end', time.time() - t0)
cache = (sa, rsa, lcp)
return cache
# ---
class TestMixin(object):
def suffix_verify(self, text, step=16):
tx = text
sa, rsa, lcp = suffix_array(text=tx, _step=step)
self.assertEqual(set(sa), set(range(len(tx))))
ok = True
for i0, i1, h in zip(sa[:-1], sa[1:], lcp[1:]):
self.assertEqual(tx[i1:i1 + h], tx[i0:i0 + h], "Verify LCP characters equal on text '%s...'" % text[:20])
self.assertGreater(tx[i1 + h:i1 + h + 1], tx[i0 + h:i0 + h + 1],
"Verify LCP+1 char is different '%s...'" % text[:20])
self.assertLessEqual(max(i0, i1), len(tx) - h,
"Verify LCP is not more than length of string '%s...'" % text[:20])
self.assertTrue(ok)
class SuffixArrayTest(unittest.TestCase, TestMixin):
def test_16(self):
# 'a' < 'ana' < 'anana' < 'banana' < 'na' < 'nana'
expect = ([5, 3, 1, 0, 4, 2], [3, 2, 5, 1, 4, 0], [0, 1, 3, 0, 0, 2])
self.assertEqual(suffix_array(text='banana', _step=16), expect)
def test_1(self):
expect = ([5, 3, 1, 0, 4, 2], [3, 2, 5, 1, 4, 0], [0, 1, 3, 0, 0, 2])
self.assertEqual(suffix_array(text='banana', _step=1), expect)
def test_mini(self):
self.assertEqual(suffix_array(text='', _step=1), ([], [], []))
self.assertEqual(suffix_array(text='a', _step=1), ([0], [0], [0]))
self.assertEqual(suffix_array(text='aa', _step=1), ([1, 0], [1, 0], [0, 1]))
self.assertEqual(suffix_array(text='aaa', _step=1), ([2, 1, 0], [2, 1, 0], [0, 1, 2]))
def test_example(self):
self.suffix_verify('abracadabra')
def test_cartesian(self):
"""Test all combinations of alphabet "ABC" up to length 4 characters"""
for size in range(7):
for cartesian in itertools.product(*(size * ['ABC'])):
text = ''.join(cartesian)
log.debug('Testing "%s"', text)
self.suffix_verify(text, 1)
def test_lcp(self):
expect = {'ana': [1, 3]}
self.assertDictEqual(longest_common_substring('banana'), expect)
expect = {' s': [3, 21], 'no': [0, 13], 'o ': [5, 20, 38]}
self.assertDictEqual(longest_common_substring(
"not so Agamemnon, who spoke fiercely to "), expect)
class SlowTests(unittest.TestCase, TestMixin):
"""Slow development tests running many minutes.
It can be run only by an EXPLICIT command!
e.g.: python -m unittest maxsubstring.SlowTests._test_random
"""
def _test_random(self):
for power in range(2, 21, 2):
size = randint(2 ** (power - 1), 2 ** power)
for alphabet in (2, 4, 16, 256):
text = ''.join(chr(65 + randint(0, alphabet - 1)) for _ in range(size))
log.debug('%s %s %s', size, alphabet, 1)
self.suffix_verify(text, 1)
log.debug('%s %s %s', size, alphabet, 16)
self.suffix_verify(text, 16)
if __name__ == '__main__':
run()
// From: https://gist.github.com/hdf/5f0deb03e1b4fc52425ce3d9cd67cf1c
// (Minify with: https://javascript-minifier.com/)
// Original from: https://github.com/tixxit/suffixarray/blob/master/suffixarray.js
/**
* An implementation of the linear time suffix array construction of
* Karkkainen & Sanders:
*
* "Simple Linear Work Suffix Array Construction", Karkainen and Sanders.
*
* Creating a suffix array is very simple; just call suffixArray(...) with
* either a string or a function that returns integers and its length. For
* example,
*
* var s = "Sort this!";
* suffixArray(s); // Returns [4, 9, 0, 6, 7, 1, 2, 8, 3, 5].
*
* function reverse(i) { return s.charCodeAt(s.length - 1 - i) }
* suffixArray(reverse, s.length); // Returns [5, 0, 9, 3, 2, 8, 7, 1, 4, 6].
*
* @author Thomas Switzer
*/
(function() {
var global = this,
floor = Math.floor,
identity = function(x) { return x },
undefined;
/**
* Sorts an array of (unsigned) integers in linear time. The values of the
* array (a) act as keys, which are passed to the key function which returns an
* integer value.
*
* @param a An array of keys to sort.
* @param key A function that maps keys to integer values.
* @return The array a.
*/
function bsort(a, key) {
var len = a.length,
buckets = [],
i = len, j = -1, b, d = 0,
keys = 0,
bits;
key = key || identity;
while (i--)
j = Math.max(key(a[i]), j);
bits = j >> 24 && 32 || j >> 16 && 24 || j >> 8 && 16 || 8;
for (; d < bits; d += 4) {
for (i = 16; i--;)
buckets[i] = [];
for (i = len; i--;)
buckets[(key(a[i]) >> d) & 15].push(a[i]);
for (b = 0; b < 16; b++)
for (j = buckets[b].length; j--;)
a[++i] = buckets[b][j];
}
return a;
}
function isInt(n) {
return typeof n == "number" || n instanceof Number;
}
function isStr(s) {
return Object.prototype.toString.call(s) == "[object String]";
}
function wrap(s) {
return typeof s == "function" ? s : (isStr(s)
? function(i) { return s.charCodeAt(i) }
: function(i) { return s[i] });
}
/**
* Returns the suffix array of the string s. The suffix array is constructed
* in linear time. Also returns Longest common prefix array.
*
* The string s can either be an Unicode string (ie. JavaScript String object)
* or a function that takes an index (integer >= 0) and returns another
* integer (a "symbol"). If a function is provided, then another argument
* specifying its length (integer >= 0) must be provided.
*
* This also takes a 3rd optional parameter that dictates how to treat the end
* of the string. This can be either "min" or "wrap". If it is "min", then
* characters after the end of the string are treated as 0's (the minimum).
* If "wrap" is given, then the end of the string wraps back around to the
* beginning. If this parameter is omitted, then "wrap" is assumed.
*
* In the case of strings, you can omit the 2nd paramter (length) and still
* provide the 3rd paramter. For instance, suffixArray(str, "min").
*
* The returned array contains the indexes of the string in the lexicographical
* order of the suffixes that start at those indexes.
*
* @param s A string or function that maps ints between [0, len) to integers.
* @param len The length of s (optional if s is a string, required otherwise).
* @param end Either "min", "wrap" or leave out (defaults to "wrap").
* @return An array of indexes into s.
*/
global.suffixArray = function(s, len, end) {
end = end || len;
len = isInt(len) ? len : s.length;
if (end == "wrap")
return wrappedSuffixArray(s, len);
else
return _suffixArray(wrap(s), len);
}
/**
* Longest repeated non-overlapping substrings:
* Returns longest (maximum n number of) non-overlapping (minimum m times)
* repeated strings and how many times they are found in the text in
* descending order of count*len(substring)^2.
*/
global.LRS = function(text, m, n) {
m = m || 3;
n = n || 10;
nots = (text.constructor===Uint8Array);
decoder = new TextDecoder();
var [sa, lcp] = suffixArray(text);
var result = {}, l, c, halt,
t = nots?decoder.decode(text):text;
for (var i = 0; i < sa.length; i++) {
l = lcp[i];
if (l < 2) continue; // Don't bother with individual characters.
substring = nots?text.subarray(sa[i], sa[i] + l):text.substring(sa[i], sa[i] + l); // Subarray is very slow. :(
substring = nots?decoder.decode(substring):substring;
c = t.split(substring).length - 1; // This is slow too.
if (c < m) continue; // Not found enough times.
halt = false;
for (s in result) // Filter out overlaps.
if (s.length >= l) {
if (s.includes(substring)) {
halt = true;
break;
}
} else if (substring.includes(s))
delete result[s];
if (!halt) result[substring] = [c, sa[i]];
}
return Object.entries(result).sort((a, b) =>
(b[1][0]*(b[0].length**2))-(a[1][0]*(a[0].length**2)))
.slice(0, n).reduce((o, k) => {
o[k[0]] = result[k[0]];
return o;
}, {});
}
// Export the Bucket Sort.
global.suffixArray.bsort = bsort;
/**
* Constructs the suffix array of s. It takes either a string, an array, or a
* function that takes an integer and returns a unsigned integer. It also takes
* an optional 2nd paramter, the length. This is required if the first
* parameter is a function.
*
* This uses the nice idea from Karkkainen & Sander's paper of replacing each
* letter with the equivalent k-letter version (3 in their paper, 2 in this
* algorithm). This is repeated recursively until all the letters are
* different. This doesn't have the nice 1/3 pruning / merge step of their
* algorithm, but still performs relatively fast, running in O(n log n).
*
* @param s A string, array, or function.
* @param len The length of s.
* @return The order of the suffixes.
*/
function wrappedSuffixArray(s, len) {
len = isInt(len) ? len : s.length;
s = wrap(s);
var array = [],
swap = [],
order = [],
span,
sym,
i = len;
while (i--)
array[i] = s(order[i] = i);
for (span = 1; sym != len && span < len; span *= 2) {
bsort(order, function(i) { return array[(i + span) % len] });
bsort(order, function(i) { return array[i] });
sym = swap[order[0]] = 1;
for (i = 1; i < len; i++) {
if (array[order[i]] != array[order[i - 1]] || array[(order[i] + span) % len] != array[(order[i - 1] + span) % len])
sym++;
swap[order[i]] = sym;
}
tmp = array;
array = swap;
swap = tmp;
}
return order;
}
/* Constructs the suffix array of s. In this case, s must be a function that
* maps integers between 0 and len - 1 to "symbols" (unsigned integers). It
* returns the suffixes in lexicographical order as an array of indexes where
* those suffixes start. Also returns Longest common prefix array.
*
* I have tried to keep the code reasonably well commented. Both for my sake,
* and yours. That said, my code was not written with pedagogy in mind, but
* to be relatively fast and have a small minified size.
*
* The description of the algorithm in the paper is very concise and is well
* worth a read.
*
* The C code accompanying the paper is very terse and, IMHO, creates more
* confusion than clarity. While the algorithm itself is fairly simple (simple
* and fast, who wants more?), it does deal with quite a bit of abstraction.
* That is, you are dealing with a lot of placeholders, rather than concrete
* objects; indexes into the string to represent suffixes, lexical names
* representing triplets of symbols, indexes of these lexical names, etc.
*/
function _suffixArray(_s, len, sub) {
var a = [],
b = [],
alen = floor(2 * len / 3), // Number of indexes s.t. i % 3 != 0.
blen = len - alen, // Number of indexes s.t. i % 3 = 0.
r = (alen + 1) >> 1, // Number of indexes s.t. i % 3 = 1.
i = alen,
j = 0,
k,
lookup = [],
result = [],
tmp, cmp,
s;
if (len == 1)
return [[0], [0]];
s = function(i) { return i >= len ? 0 : _s(i) };
// Sort suffixes w/ indices % 3 != 0 by their first 3 symbols (triplets).
while (i--)
a[i] = ((i * 3) >> 1) + 1; // a = [1, 2, 4, 5, 7, 8, 10, 11, 13, ...]
for (i = 3; i--;)
bsort(a, function(j) { return s(i + j) });
// Assign lexicographical names (j) to the triplets of consecutive symbols,
// s.t. the order of the lex. names match the lex. order of the triplets.
// Array b contains lex. names in the order they appear in s for i % 3 != 0
j = b[floor(a[0] / 3) + (a[0] % 3 == 1 ? 0 : r)] = 1;
for (i = 1; i < alen; i++) {
if (s(a[i]) != s(a[i-1]) || s(a[i] + 1) != s(a[i-1] + 1) || s(a[i] + 2) != s(a[i-1] + 2))
j++;
b[floor(a[i] / 3) + (a[i] % 3 == 1 ? 0 : r)] = j;
}
// If all lex. names are unique, then a is already completely sorted.
if (j < alen) {
// Otherwise, recursively sort lex. names in b, then reconstruct the
// indexes of the sorted array b so they are relative to a.
b = _suffixArray(function(i) { return b[i] }, alen, true);
for (i = alen; i--;)
a[i] = b[i] < r ? b[i] * 3 + 1 : ((b[i] - r) * 3 + 2);
}
// Create a reverse lookup table for the indexes i, s.t. i % 3 != 0.
// This table can be used to simply determine the sorted order of 2
// suffixes whose indexes are both not divisible by 3.
for (i = alen; i--;)
lookup[a[i]] = i;
lookup[len] = -1;
lookup[len + 1] = -2;
/**
* This is a comparison function for the suffixes at indices m & n that
* uses the lookup table to shorten the searches. It assumes that
* n % 3 == 0 and m % 3 != 0.
*/
cmp = function(m, n) {
return (s(m) - s(n)) || (m % 3 == 2
? (s(m + 1) - s(n + 1)) || (lookup[m + 2] - lookup[n + 2])
: (lookup[m + 1] - lookup[n + 1]))
};
// Sort remaining suffixes (i % 3 == 0) using prev result (i % 3 != 0).
b = len % 3 == 1 ? [ len - 1 ] : [];
for (i = 0; i < alen; i++)
if (a[i] % 3 == 1)
b.push(a[i] - 1);
bsort(b, function(j) { return s(j) });
// Merge a (i % 3 != 0) and b (i % 3 == 0) together. We only need to
// compare, at most, 2 symbols before we end up comparing 2 suffixes whose
// indices are both not divisible by 3. At this point, we can use the
// reverse lookup array to order them.
if (sub) { // No need for extra rsa and lcp processing in sub calls, only in the top level one.
for (i = 0, j = 0, k = 0; i < alen && j < blen;)
result[k++] = cmp(a[i], b[j]) < 0 ? a[i++] : b[j++];
while (i < alen)
result[k++] = a[i++];
while (j < blen)
result[k++] = b[j++];
return result;
}
var rsa = [], lcp = [0], h = 0, i2, j2;
for (i = 0, j = 0, k = 0; i < alen && j < blen;) {
i2 = k;
result[k++] = cmp(a[i], b[j]) < 0 ? a[i++] : b[j++];
rsa[result[i2]] = i2;
if (rsa[i2] > 0) {
j2 = result[rsa[i2] - 1];
while (i2 != len - h && j2 != len - h && s(i2 + h) == s(j2 + h)) h++;
lcp[rsa[i2]] = h;
if (h > 0) h--;
}
}
while (i < alen) {
i2 = k;
result[k++] = a[i++];
rsa[result[i2]] = i2;
if (rsa[i2] > 0) {
j2 = result[rsa[i2] - 1];
while (i2 != len - h && j2 != len - h && s(i2 + h) == s(j2 + h)) h++;
lcp[rsa[i2]] = h;
if (h > 0) h--;
}
}
while (j < blen) {
i2 = k;
result[k++] = b[j++];
rsa[result[i2]] = i2;
if (rsa[i2] > 0) {
j2 = result[rsa[i2] - 1];
while (i2 != len - h && j2 != len - h && s(i2 + h) == s(j2 + h)) h++;
lcp[rsa[i2]] = h;
if (h > 0) h--;
}
}
return [result, lcp];
}
}).call();
if (typeof process === 'object') {
var fs = require('fs');
var file = fs.readFileSync((process.argv.length > 2) ? process.argv[2] : 'suffixarray.js', 'utf8');
console.log(LRS(file));
} else {
if (typeof importScripts === 'function') addEventListener('message', e => postMessage(LRS(e.data)));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment