Created
December 25, 2013 00:13
-
-
Save calmhandtitan/8119030 to your computer and use it in GitHub Desktop.
Suffix Array Construction in O(nlogn) using Manber and Myers algo
and linear time LCP array construction
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "stdio.h" | |
#include "stdlib.h" | |
#include "string.h" | |
#include "algorithm" | |
using namespace std; | |
// Begins Suffix Arrays implementation | |
// O(n log n) - Manber and Myers algorithm | |
// Refer to "Suffix arrays: A new method for on-line txting searches", | |
// by Udi Manber and Gene Myers | |
//Usage: | |
// Fill txt with the characters of the txting. | |
// Call SuffixSort(n), where n is the length of the txting stored in txt. | |
// That's it! | |
//Output: | |
// SA = The suffix array. Contains the n suffixes of txt sorted in lexicographical order. | |
// Each suffix is represented as a single integer (the SAition of txt where it starts). | |
// iSA = The inverse of the suffix array. iSA[i] = the index of the suffix txt[i..n) | |
// in the SA array. (In other words, SA[i] = k <==> iSA[k] = i) | |
// With this array, you can compare two suffixes in O(1): Suffix txt[i..n) is smaller | |
// than txt[j..n) if and only if iSA[i] < iSA[j] | |
const int MAX = 100010; | |
char txt[MAX]; //input | |
int iSA[MAX], SA[MAX]; //output | |
int cnt[MAX], next[MAX]; //internal | |
bool bh[MAX], b2h[MAX]; | |
// Compares two suffixes according to their first characters | |
bool smaller_first_char(int a, int b){ | |
return txt[a] < txt[b]; | |
} | |
void suffixSort(int n){ | |
//sort suffixes according to their first characters | |
for (int i=0; i<n; ++i){ | |
SA[i] = i; | |
} | |
sort(SA, SA + n, smaller_first_char); | |
//{SA contains the list of suffixes sorted by their first character} | |
for (int i=0; i<n; ++i){ | |
bh[i] = i == 0 || txt[SA[i]] != txt[SA[i-1]]; | |
b2h[i] = false; | |
} | |
for (int h = 1; h < n; h <<= 1){ | |
//{bh[i] == false if the first h characters of SA[i-1] == the first h characters of SA[i]} | |
int buckets = 0; | |
for (int i=0, j; i < n; i = j){ | |
j = i + 1; | |
while (j < n && !bh[j]) j++; | |
next[i] = j; | |
buckets++; | |
} | |
if (buckets == n) break; // We are done! Lucky bastards! | |
//{suffixes are separted in buckets containing txtings starting with the same h characters} | |
for (int i = 0; i < n; i = next[i]){ | |
cnt[i] = 0; | |
for (int j = i; j < next[i]; ++j){ | |
iSA[SA[j]] = i; | |
} | |
} | |
cnt[iSA[n - h]]++; | |
b2h[iSA[n - h]] = true; | |
for (int i = 0; i < n; i = next[i]){ | |
for (int j = i; j < next[i]; ++j){ | |
int s = SA[j] - h; | |
if (s >= 0){ | |
int head = iSA[s]; | |
iSA[s] = head + cnt[head]++; | |
b2h[iSA[s]] = true; | |
} | |
} | |
for (int j = i; j < next[i]; ++j){ | |
int s = SA[j] - h; | |
if (s >= 0 && b2h[iSA[s]]){ | |
for (int k = iSA[s]+1; !bh[k] && b2h[k]; k++) b2h[k] = false; | |
} | |
} | |
} | |
for (int i=0; i<n; ++i){ | |
SA[iSA[i]] = i; | |
bh[i] |= b2h[i]; | |
} | |
} | |
for (int i=0; i<n; ++i){ | |
iSA[SA[i]] = i; | |
} | |
} | |
// End of suffix array algorithm | |
// Begin of the O(n) longest common prefix algorithm | |
// Refer to "Linear-Time Longest-Common-Prefix Computation in Suffix | |
// Arrays and Its Applications" by Toru Kasai, Gunho Lee, Hiroki | |
// Arimura, Setsuo Arikawa, and Kunsoo Park. | |
int lcp[MAX]; | |
// lcp[i] = length of the longest common prefix of suffix SA[i] and suffix SA[i-1] | |
// lcp[0] = 0 | |
void getlcp(int n) | |
{ | |
for (int i=0; i<n; ++i) | |
iSA[SA[i]] = i; | |
lcp[0] = 0; | |
for (int i=0, h=0; i<n; ++i) | |
{ | |
if (iSA[i] > 0) | |
{ | |
int j = SA[iSA[i]-1]; | |
while (i + h < n && j + h < n && txt[i+h] == txt[j+h]) | |
h++; | |
lcp[iSA[i]] = h; | |
if (h > 0) | |
h--; | |
} | |
} | |
} | |
// End of longest common prefixes algorithm | |
int main() | |
{ | |
int len; | |
gets(txt); | |
len = strlen(txt); | |
suffixSort(len); | |
for (int i = 0; i < len; ++i) | |
{ | |
printf("%d\n",SA[i] ); | |
} | |
return 0; | |
} |
FYI: On line 81 k
may reach n
, meaning bh[k]
and b2h[k]
are out-of-bounds accesses.
Something that can be easily detected if you use a proper C++ container, instead of writing unsafe C-style code and saving it with a .cpp extension.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Please explain the SA part, specially the linear time sorting, because of which the code is nlgn.