Skip to content

Instantly share code, notes, and snippets.

@nojima
Created July 11, 2011 15:49
Show Gist options
  • Save nojima/1076138 to your computer and use it in GitHub Desktop.
Save nojima/1076138 to your computer and use it in GitHub Desktop.
Suffix Array Construction and String Search
#include <algorithm>
#include <climits>
#include <cstdlib>
#include <cstring>
#include <vector>
using namespace std;
struct Comp1 {
const char* str;
Comp1(const char* str): str(str) {}
bool operator()(int s1, int s2) const {
return strcmp(str+s1, str+s2) < 0;
}
};
void build_suffix_array1(const char* A, /*OUT*/ vector<int>& S) {
int n = strlen(A);
S.resize(n+1);
for (int i = 0; i < n+1; ++i) { S[i] = i; }
sort(S.begin(), S.end(), Comp1(A));
}
struct Comp2 {
int h;
const vector<int>& group;
Comp2(int h, const vector<int>& group): h(h), group(group) {}
bool operator()(int s1, int s2) const {
return group[s1+h] < group[s2+h];
}
};
void build_suffix_array2(const char* A, /*OUT*/ vector<int>& S) {
typedef unsigned char uchar;
int n = strlen(A);
vector<int> group(n+1), next(n+2), bucket(256);
S.resize(n+1);
for (int i = 0; i < n+1; ++i) { ++bucket[(uchar)A[i]]; }
for (int i = 1; i < 256; ++i) { bucket[i] += bucket[i-1]; }
for (int i = 0; i < n+1; ++i) { S[--bucket[(uchar)A[i]]] = i; }
for (int i = 0; i < n+1; ++i) { group[i] = bucket[(uchar)A[i]]; }
for (int i = 0; i < 255; ++i) { next[bucket[i]] = bucket[i+1]; }
next[bucket[255]] = n+1;
for (int h = 1; next[0] != -n-1; h *= 2) {
int prev = -1;
for (int i = 0; i < n+1; i = abs(next[i])) {
if (next[i]-i <= 1) {
if (prev == -1) { prev = i; }
else { next[prev] = -i; }
} else {
prev = -1;
}
}
if (prev != -1) { next[prev] = -n-1; }
Comp2 comp(h, group);
for (int i = 0; i < n+1; ) {
if (next[i] < 0) {
i = -next[i];
} else {
sort(S.begin()+i, S.begin()+next[i], comp);
int prev = i, nxt = next[i];
for (int j = i+1; j < nxt; ++j) {
if (comp(S[j-1], S[j])) { next[prev] = j; prev = j; }
}
i = next[prev] = nxt;
}
}
for (int i = 0; i < n+1; i = abs(next[i])) {
for (int j = i; j < next[i]; ++j) { group[S[j]] = i; }
}
}
}
void build_lcp1(const char* A, const vector<int>& S,
/*OUT*/ vector<int>& height) {
int n = S.size() - 1;
height = vector<int>(n+1);
for (int i = 1; i < n+1; ++i) {
while (A[S[i-1]+height[i]] == A[S[i]+height[i]]) { ++height[i]; }
}
}
void build_lcp2(const char* A, const vector<int>& S,
/*OUT*/ vector<int>& height) {
int n = S.size() - 1, h = 0;
vector<int> rank(n+1);
height = vector<int>(n+1);
for (int i = 0; i < n+1; ++i) { rank[S[i]] = i; }
for (int i = 0; i < n+1; ++i) {
if (rank[i] > 0) {
int j = S[rank[i]-1];
while (A[i+h] == A[j+h]) { ++h; }
height[rank[i]] = h;
}
if (h > 0) { --h; }
}
}
void build_rmq(const vector<int>& array,
/*OUT*/ vector<int>& rmq) {
int n = array.size(), N = 1;
while (N <= n) { N *= 2; }
rmq = vector<int>(2*N-1, INT_MAX);
copy(array.begin(), array.end(), rmq.begin()+N-1);
for (int v = N-2; v >= 0; --v) {
rmq[v] = min(rmq[2*v+1], rmq[2*v+2]);
}
}
int rmq_min(const vector<int>& rmq, int v,
int Lv, int Rv, int Lq, int Rq) {
int mid = (Lv+Rv)/2, ret = INT_MAX;
if (Rq < mid) { return rmq_min(rmq, 2*v+1, Lv, mid, Lq, Rq); }
if (Lq > mid) { return rmq_min(rmq, 2*v+2, mid, Rv, Lq, Rq); }
if (Lq < mid) {
if (Lq == Lv) { ret = min(ret, rmq[2*v+1]); }
else { ret = min(ret, rmq_min(rmq, 2*v+1, Lv, mid, Lq, mid)); }
}
if (Rq > mid) {
if (Rq == Rv) { ret = min(ret, rmq[2*v+2]); }
else { ret = min(ret, rmq_min(rmq, 2*v+2, mid, Rv, mid, Rq)); }
}
return ret;
}
int rmq_min(const vector<int>& rmq, int L, int R) {
int n = (rmq.size()+1)/2;
if (L >= R) { return INT_MAX; }
if (L == 0 && R == n) { return rmq[0]; }
return rmq_min(rmq, 0, 0, n, L, R);
}
struct Comp {
const char* A;
Comp(const char* A): A(A) {}
bool operator()(int s, const char* w) const {
return strcmp(A+s, w) < 0;
}
};
bool search0(const char* W, const char* A) {
for (int i = 0; A[i]; ++i) {
int j;
for (j = 0; W[j]; ++j) {
if (W[j] != A[i+j]) { break; }
}
if (W[j] == 0) { return true; }
}
return false;
}
bool search1(const char* W, const char* A, const vector<int>& S) {
return strncmp(A+*lower_bound(S.begin(), S.end(), W, Comp(A)),
W, strlen(W)) == 0;
}
inline int suffix_suffix_lcp(int i, int j, const vector<int>& rmq) {
return rmq_min(rmq, i+1, j+1);
}
inline int suffix_string_lcp(int s_i, int p, const char* W,
int n, const char* A) {
int j = 0;
while (j < p && A[s_i + j] == W[j]) { ++j; }
return j;
}
bool search2(const char* W, const char* A,
const vector<int>& S, const vector<int>& rmq) {
int p = strlen(W), n = S.size()-1;
int L = 0, R = n;
int l = suffix_string_lcp(S[L], p, W, n, A);
int r = suffix_string_lcp(S[R], p, W, n, A);
while (R - L > 1) {
int M = (L+R)/2;
if (l >= r) {
int m = suffix_suffix_lcp(L, M, rmq);
if (m < l) { R = M; r = m; }
else if (m > l) { L = M; }
else {
int k = l + suffix_string_lcp(S[M]+l, p-l, W+l, n, A);
if (k == p || A[S[M]+k] < W[k]) { L = M; l = k; }
else { R = M; r = k; }
}
} else {
int m = suffix_suffix_lcp(M, R, rmq);
if (m < r) { L = M; l = m; }
else if (m > r) { R = M; }
else {
int k = r + suffix_string_lcp(S[M]+r, p-r, W+r, n, A);
if (k == p || A[S[M]+k] < W[k]) { L = M; l = k; }
else { R = M; r = k; }
}
}
}
return l == p || r == p;
}
//// TEST /////////////////////////////////////////////////////////////////////
#include <cstdio>
void test_build_suffix_array() {
printf("testing build_suffix_array()...");
fflush(stdout);
bool ok = true;
char str[1024];
for (int t = 0; t < 10000; ++t) {
int n = rand() % 100;
int p = rand() % 254 + 1;
for (int i = 0; i < n; ++i) {
str[i] = rand() % p + 1;
}
str[n] = 0;
vector<int> expected;
build_suffix_array1(str, expected);
vector<int> answer;
build_suffix_array2(str, answer);
if (expected != answer) {
printf("FAILED!! t=%d, n=%d, p=%d\n", t, n, p);
ok = false;
}
}
for (int t = 0; t < 1000; ++t) {
int n = t, c = rand() % 254 + 1;
for (int i = 0; i < n; ++i) { str[i] = c; }
str[n] = 0;
vector<int> expected;
build_suffix_array1(str, expected);
vector<int> answer;
build_suffix_array2(str, answer);
if (expected != answer) {
printf("FAILED!! t=%d, n=%d\n", t, n);
ok = false;
}
}
if (ok) { puts("OK!!"); }
}
void test_build_lcp() {
printf("testing build_lcp()...");
fflush(stdout);
bool ok = true;
char str[1024];
for (int t = 0; t < 10000; ++t) {
int n = rand() % 100;
int p = rand() % 254 + 1;
for (int i = 0; i < n; ++i) {
str[i] = rand() % p + 1;
}
str[n] = 0;
vector<int> sa;
build_suffix_array2(str, sa);
vector<int> expected;
build_lcp1(str, sa, expected);
vector<int> answer;
build_lcp2(str, sa, answer);
if (expected != answer) {
printf("FAILED!! t=%d, n=%d, p=%d\n", t, n, p);
ok = false;
}
}
for (int t = 0; t < 1000; ++t) {
int n = rand() % 100, c = rand() % 254 + 1;
for (int i = 0; i < n; ++i) { str[i] = c; }
str[n] = 0;
vector<int> sa;
build_suffix_array2(str, sa);
vector<int> expected;
build_lcp1(str, sa, expected);
vector<int> answer;
build_lcp2(str, sa, answer);
if (expected != answer) {
printf("FAILED!! t=%d, n=%d\n", t, n);
ok = false;
}
}
if (ok) { puts("OK!!"); }
}
const int N = 1000;
char str[N+1];
void test_search() {
printf("testing search()...");
fflush(stdout);
bool ok = true;
for (int T = 0; T < 300; ++T) {
for (int i = 0; i < N; ++i) { str[i] = rand() % 8 + 'a'; }
str[N] = 0;
vector<int> sa, height, rmq;
build_suffix_array2(str, sa);
build_lcp2(str, sa, height);
build_rmq(height, rmq);
for (int t = 0; t < 10000; ++t) {
char w[256];
int n = rand() % 10 + 1;
for (int i = 0; i < n; ++i) { w[i] = rand() % 8 + 'a'; }
w[n] = 0;
bool expected = search1(w, str, sa);
bool answer = search2(w, str, sa, rmq);
if (expected != answer) {
printf("FAILED!! T=%d, t=%d, n=%d, expected=%d, answer=%d\n", T, t, n, (int)expected, (int)answer);
ok = false;
}
}
}
for (int T = 0; T < 8; ++T) {
int c = 'a'+T;
for (int i = 0; i < N; ++i) { str[i] = c; }
str[N] = 0;
vector<int> sa, height, rmq;
build_suffix_array2(str, sa);
build_lcp2(str, sa, height);
build_rmq(height, rmq);
for (int t = 0; t < 10000; ++t) {
char w[256];
int n = rand() % 10 + 1;
for (int i = 0; i < n; ++i) { w[i] = rand() % 8 + 'a'; }
w[n] = 0;
bool expected = search1(w, str, sa);
bool answer = search2(w, str, sa, rmq);
if (expected != answer) {
printf("FAILED!! T=%d, t=%d, n=%d, expected=%d, answer=%d\n", T, t, n, (int)expected, (int)answer);
ok = false;
}
}
}
if (ok) { puts("OK!!"); }
}
int main() {
srand(12345);
test_build_suffix_array();
test_build_lcp();
test_search();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment