#!/usr/bin/env python3 | |
import time | |
import sys | |
# Python by Steve Hanov | |
# Suffix array construction from: | |
# From Ge Nong, Sen Zhang and Wai Hong Chan, Two Efficient Algorithms for Linear Suffix Array Construction, 2008 | |
# longestMatches from: http://stevehanov.ca/blog/index.php?id=146 | |
# Released to the public domain. | |
class SuffixArray: | |
def __init__(self, A, B): | |
self.length1 = len(A)+1 | |
self.length2 = len(B)+1 | |
self.seq = A + b"\001" + B + b"\000" | |
# find the start or end of each bucket | |
def getBuckets(s, bkt, n, K, end, os): | |
sum = 0 | |
# clear all buckets | |
for i in range(K+1): | |
bkt[i] = 0 | |
# compute the size of each bucket | |
for i in range(n): | |
bkt[s[i+os]] += 1 | |
for i in range(K+1): | |
sum += bkt[i] | |
if end: | |
bkt[i] = sum | |
else: | |
bkt[i] = sum - bkt[i] | |
# compute SAl | |
def induceSAl(t, SA, s, bkt, n, K, end, os, oa): | |
# find starts of buckets | |
getBuckets(s, bkt, n, K, end, os) | |
for i in range(n): | |
j = SA[i+oa] - 1 | |
if j >= 0 and not t[j]: | |
SA[bkt[s[j+os]]+oa] = j | |
bkt[s[j+os]] += 1 | |
# compute SAs | |
def induceSAs(t, SA, s, bkt, n, K, end, os, oa): | |
# find ends of buckets | |
getBuckets(s, bkt, n, K, end, os); | |
for i in range(n - 1, -1, -1): | |
j = SA[i+oa] - 1 | |
if j >= 0 and t[j]: | |
bkt[s[j+os]] -= 1 | |
SA[bkt[s[j+os]]+oa] = j | |
# find the suffix array SA of s[0..n-1] in {1..K}^n | |
# require s[n-1]=0 (the sentinel!), n>=2 | |
# use a working space (excluding s and SA) of | |
# at most 2.25n+O(1) for a constant alphabet | |
def SA_IS(s, SA, n, K, os = 0, oa = 0): | |
def isLms(i): | |
return i > 0 and t[i] and not t[i-1] | |
#print("Stage 1"); | |
t = bytearray(n) | |
t[n - 2] = 0 | |
t[n - 1] = 1 | |
for i in range(n-3, -1, -1): | |
if s[i+os] < s[i+1+os] or s[i+os] == s[i+1+os] and t[i+1] == 1: | |
t[i] = 1 | |
else: | |
t[i] = 0 | |
#print("Stage 2"); | |
bkt = [0] * (K+1) | |
getBuckets(s, bkt, n, K, True, os) | |
for i in range(n): | |
SA[i+oa] = -1 | |
for i in range(1, n): | |
if isLms(i): | |
bkt[s[i+os]] -= 1 | |
SA[bkt[s[i+os]]+oa] = i | |
#print("Stage 3"); | |
induceSAl(t, SA, s, bkt, n, K, False, os, oa) | |
induceSAs(t, SA, s, bkt, n, K, True, os, oa) | |
bkt = None | |
n1 = 0 | |
for i in range(n): | |
if isLms(SA[i+oa]): | |
SA[n1+oa] = SA[i+oa] | |
n1 += 1 | |
for i in range(n1, n): | |
SA[i+oa] = -1 | |
#print("Stage 4"); | |
name = 0 | |
prev = -1 | |
for i in range(n1): | |
pos = SA[i+oa] | |
diff = False | |
for d in range(n): | |
if prev == -1 or s[pos+d+os] != s[prev+d+os] or \ | |
t[pos+d] != t[prev+d]: | |
diff = True | |
break | |
elif d > 0 and (isLms(pos+d) or isLms(prev+d)): | |
break | |
if diff: | |
name += 1 | |
prev = pos | |
pos = int(pos/2) | |
SA[n1 + pos + oa] = name - 1 | |
j = n - 1 | |
for i in range(n-1, n1-1, -1): | |
if SA[i+oa] >= 0: | |
SA[j+oa] = SA[i+oa] | |
j -= 1 | |
s1 = SA1 = SA | |
s1off = n - n1 | |
if name < n1: | |
SA_IS(s1, SA1, n1, name-1, s1off+oa, oa) | |
else: | |
for i in range(n1): | |
SA1[s1[i+s1off+oa]+oa] = i | |
bkt = [0] * (K+1) | |
#print("Stage 5"); | |
getBuckets(s, bkt, n, K, True, os) | |
j = 0 | |
for i in range(1, n): | |
if isLms(i): | |
s1[s1off+oa+j] = i | |
j += 1 | |
for i in range(n1): | |
SA1[i+oa] = s1[oa+s1off+SA1[oa+i]] | |
for i in range(n1, n): | |
SA[i+oa] = -1 | |
for i in range(n1-1, -1, -1): | |
j = SA[oa+i] | |
SA[oa+i] = -1 | |
bkt[s[os+j]] -= 1 | |
SA[bkt[s[os+j]]+oa] = j | |
#print("Stage 6"); | |
induceSAl(t, SA, s, bkt, n, K, False, os, oa) | |
induceSAs(t, SA, s, bkt, n, K, True, os, oa) | |
# Given string s and suffix array sa, compute the longest common prefix | |
# information in O(N) time. | |
def calculateLcp(s, sa): | |
n = len(sa) | |
k = 0 | |
lcp = [0] * n | |
rank = [0] * n | |
for i in range(n): | |
rank[sa[i]] = i | |
i = 0 | |
while True: | |
if i >= n: break | |
if rank[i] == n-1: | |
k=0 | |
i += 1 | |
continue | |
j = sa[rank[i]+1] | |
while i+k<n and j+k<n and s[i+k] == s[j+k]: | |
k += 1 | |
lcp[rank[i]]=k | |
i += 1 | |
if k: k -= 1 | |
return lcp | |
def remap(a, b): | |
# Fast SA construction algorithms assume the sequence is | |
# numeric, and has an upper value due to the bucket sort. | |
# reserve 0 and 1 for the separator and end character. | |
nextName = 2 | |
# Get the set of all characters used. | |
items = set(a) | |
items.update(b) | |
# map each character to its index in the sorted list | |
mapping = {} | |
for item in sorted(items): | |
mapping[item] = nextName | |
nextName += 1 | |
# create a new string that is numeric. | |
newString = [] | |
newString.extend([mapping[item] for item in a]) | |
newString.append(1) | |
newString.extend([mapping[item] for item in b]) | |
newString.append(0) | |
return newString, nextName - 1 | |
# Here is the code to initialize the suffix array object | |
remapped, K = remap(A, B) | |
self.sa = [0] * len(remapped) | |
if self.length1 + self.length2 > 2: | |
SA_IS(remapped, self.sa, len(self.sa), K) | |
#self.sa = sais_native(remapped, len(remapped), K) | |
self.lcp = calculateLcp(self.seq, self.sa) | |
def show(self): | |
for i in range(len(self.sa)): | |
self.showPosition(i) | |
def showPosition(self, saIndex): | |
i = saIndex | |
p = self.sa[i] | |
if self.sa[i] < self.length1: | |
s = "A " | |
else: | |
s = " B" | |
p -= self.length1 | |
print("{3} {0:8} {1:5} |{2}".format(p, self.lcp[i], json.dumps(self.seq[self.sa[i]:self.sa[i]+20]), s)) | |
def longestMatches(self): | |
# returns, for every position in B, a tuple with the longest matching | |
# position in A and the length of that match. | |
result = [None] * self.length2 | |
# forward pass | |
lcp = 0 | |
aIndex = 0 | |
for i in range(len(self.sa)): | |
if self.sa[i] < self.length1: | |
# string in A | |
lcp = self.lcp[i] | |
aIndex = self.sa[i] | |
else: | |
# string in B. | |
result[self.sa[i] - self.length1] = (aIndex, lcp) | |
lcp = min(lcp, self.lcp[i]) | |
# reverse pass | |
lcp = 0 | |
aIndex = 0 | |
for i in range(len(self.sa)-1, -1, -1): | |
if self.sa[i] < self.length1: | |
# string in A | |
aIndex = self.sa[i] | |
if i > 0: | |
lcp = self.lcp[i-1] | |
else: | |
# string in B. | |
lcp = min(lcp, self.lcp[i]) | |
bIndex = self.sa[i] - self.length1 | |
oldAIndex, oldLcp = result[bIndex] | |
if lcp > oldLcp: | |
result[bIndex] = (aIndex, lcp) | |
return result | |
class SAFindLongest: | |
def __init__(self, A, B): | |
self.matches = SuffixArray(A, B).longestMatches() | |
def findLongest(self, positionInB): | |
return self.matches[positionInB] | |
class BruteFindLongest: | |
def __init__(self, A, B): | |
self.A = A | |
self.B = B | |
def findLongest(self, positionInB): | |
a = self.A | |
def prefixLength(position, text): | |
l = 0 | |
while(l < len(text) and position+l < len(a) and text[l] == a[position+l]): | |
l += 1 | |
return l | |
text = self.B[positionInB:] | |
longest = 0 | |
longestPos = 0 | |
for p in range(len(self.A)): | |
l = prefixLength(p, text) | |
if l > longest: | |
longest = l | |
longestPos = p | |
return longestPos, longest | |
class TableFindLongest: | |
def __init__(self, A, B): | |
self.A = A | |
self.B = B | |
minlen = 4 | |
self.table = {} | |
for i in range(len(A)-minlen+1): | |
code = A[i:i+minlen] | |
if True or code not in self.table: | |
self.table[code] = [i] | |
else: | |
self.table[code].append(i) | |
def findLongest(self, positionInB): | |
minlen = 4 | |
code = self.B[positionInB:positionInB+minlen] | |
if code not in self.table: | |
return 0, 0 | |
a = self.A | |
lcp = 0 | |
pos = 0 | |
def prefixLength(position, text): | |
l = 0 | |
while(l < len(text) and position+l < len(a) and text[l] == a[position+l]): | |
l += 1 | |
return l | |
text = self.B[positionInB:] | |
for index in self.table[code]: | |
l = prefixLength(index, text) | |
if l > lcp: | |
pos = index | |
lcp = l | |
return pos, lcp | |
def longestPrefix(A, B): | |
l = 0 | |
m = min(len(A), len(B)) | |
while l < m and A[l] == B[l]: | |
l += 1 | |
return l | |
def longestSuffix(A, B): | |
l = 0 | |
la = len(A) | |
lb = len(B) | |
m = min(la, lb) | |
while l < m and A[la-l-1] == B[lb-l-1]: | |
l += 1 | |
return l | |
def DiffEncode(a, b, Algorithm): | |
start = time.time() | |
cmds = [] | |
j = 0 | |
insertFrom = -1 | |
check = bytearray() | |
totalCost = 0 | |
minimumMatch = 6 | |
def copy(start, length): | |
nonlocal totalCost, check | |
totalCost += 9 | |
cmds.append("COPY {0}, {1}".format(start, length)) | |
check += a[start:start+length] | |
def insert(start, length): | |
nonlocal totalCost, check | |
totalCost += 1+5+length | |
cmds.append("INSERT {0}, {1}".format(start, length)) | |
check += b[start:start+length] | |
j = prefix = longestPrefix(a, b) | |
if prefix > 0: | |
copy(0, prefix) | |
if prefix != len(b): | |
suffix = min(longestSuffix(a, b), len(b) - prefix) | |
limit = len(b) - suffix | |
if suffix + prefix != len(b): | |
if len(b) - suffix > prefix: | |
algorithm = Algorithm(a[prefix:len(a)-suffix], b[prefix:limit]) | |
while j < limit: | |
position, length = algorithm.findLongest(j-prefix) | |
position += prefix | |
if length < minimumMatch: | |
if insertFrom == -1: insertFrom = j | |
j += 1 | |
else: | |
if insertFrom >= 0: | |
insert(insertFrom, j - insertFrom) | |
insertFrom = -1 | |
copy(position, length) | |
j += length | |
if insertFrom >= 0: | |
insert(insertFrom, j-insertFrom) | |
if limit < len(b): | |
copy(len(a)-suffix, suffix) | |
if check != b: | |
print("\n".join(cmds)) | |
print("ERROR") | |
raise(Exception("Error")) | |
totalTime = time.time() - start | |
return "\n".join(cmds), totalTime, totalCost | |
import os | |
def performance(): | |
# Output a CSV file with the N and the time to construct a suffix | |
# array of a random string of length N. | |
for i in range(1000, 1000000, 1000): | |
s = os.urandom(i//2) | |
t = os.urandom(i//2) | |
start = time.time() | |
s = SuffixArray(s, t) | |
print("{0},{1}".format(i, time.time()-start)) | |
if "--performance" in sys.argv: | |
performance() | |
sys.exit(0) | |
Algorithms = { | |
"suffixarray": SAFindLongest, | |
"bruteforce": BruteFindLongest, | |
"table": TableFindLongest | |
} | |
import argparse | |
parser = argparse.ArgumentParser() | |
parser.add_argument("file1", type=str, help="First file to compare") | |
parser.add_argument("file2", type=str, help="Second file to compare") | |
parser.add_argument("--performance", help="Performance test SA construction") | |
parser.add_argument("-a", "--algorithm", type=str, help="Algorithm to use", | |
choices=Algorithms.keys(), default="suffixarray") | |
args = parser.parse_args() | |
f1 = open(args.file1, "rb").read() | |
f2 = open(args.file2, "rb").read() | |
cmds, time, cost = DiffEncode(f1, f2, Algorithms[args.algorithm]) | |
print("Cmds:" + cmds) | |
print("Total time: {0}".format(time)) | |
print("Instruction cost: {0}".format(cost)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment