Skip to content

Instantly share code, notes, and snippets.

@calmhandtitan
Created December 25, 2013 00:13
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save calmhandtitan/8119030 to your computer and use it in GitHub Desktop.
Save calmhandtitan/8119030 to your computer and use it in GitHub Desktop.
Suffix Array Construction in O(nlogn) using Manber and Myers algo and linear time LCP array construction
#include "stdio.h"
#include "stdlib.h"
#include "string.h"
#include "algorithm"
using namespace std;
// Begins Suffix Arrays implementation
// O(n log n) - Manber and Myers algorithm
// Refer to "Suffix arrays: A new method for on-line txting searches",
// by Udi Manber and Gene Myers
//Usage:
// Fill txt with the characters of the txting.
// Call SuffixSort(n), where n is the length of the txting stored in txt.
// That's it!
//Output:
// SA = The suffix array. Contains the n suffixes of txt sorted in lexicographical order.
// Each suffix is represented as a single integer (the SAition of txt where it starts).
// iSA = The inverse of the suffix array. iSA[i] = the index of the suffix txt[i..n)
// in the SA array. (In other words, SA[i] = k <==> iSA[k] = i)
// With this array, you can compare two suffixes in O(1): Suffix txt[i..n) is smaller
// than txt[j..n) if and only if iSA[i] < iSA[j]
const int MAX = 100010;
char txt[MAX]; //input
int iSA[MAX], SA[MAX]; //output
int cnt[MAX], next[MAX]; //internal
bool bh[MAX], b2h[MAX];
// Compares two suffixes according to their first characters
bool smaller_first_char(int a, int b){
return txt[a] < txt[b];
}
void suffixSort(int n){
//sort suffixes according to their first characters
for (int i=0; i<n; ++i){
SA[i] = i;
}
sort(SA, SA + n, smaller_first_char);
//{SA contains the list of suffixes sorted by their first character}
for (int i=0; i<n; ++i){
bh[i] = i == 0 || txt[SA[i]] != txt[SA[i-1]];
b2h[i] = false;
}
for (int h = 1; h < n; h <<= 1){
//{bh[i] == false if the first h characters of SA[i-1] == the first h characters of SA[i]}
int buckets = 0;
for (int i=0, j; i < n; i = j){
j = i + 1;
while (j < n && !bh[j]) j++;
next[i] = j;
buckets++;
}
if (buckets == n) break; // We are done! Lucky bastards!
//{suffixes are separted in buckets containing txtings starting with the same h characters}
for (int i = 0; i < n; i = next[i]){
cnt[i] = 0;
for (int j = i; j < next[i]; ++j){
iSA[SA[j]] = i;
}
}
cnt[iSA[n - h]]++;
b2h[iSA[n - h]] = true;
for (int i = 0; i < n; i = next[i]){
for (int j = i; j < next[i]; ++j){
int s = SA[j] - h;
if (s >= 0){
int head = iSA[s];
iSA[s] = head + cnt[head]++;
b2h[iSA[s]] = true;
}
}
for (int j = i; j < next[i]; ++j){
int s = SA[j] - h;
if (s >= 0 && b2h[iSA[s]]){
for (int k = iSA[s]+1; !bh[k] && b2h[k]; k++) b2h[k] = false;
}
}
}
for (int i=0; i<n; ++i){
SA[iSA[i]] = i;
bh[i] |= b2h[i];
}
}
for (int i=0; i<n; ++i){
iSA[SA[i]] = i;
}
}
// End of suffix array algorithm
// Begin of the O(n) longest common prefix algorithm
// Refer to "Linear-Time Longest-Common-Prefix Computation in Suffix
// Arrays and Its Applications" by Toru Kasai, Gunho Lee, Hiroki
// Arimura, Setsuo Arikawa, and Kunsoo Park.
int lcp[MAX];
// lcp[i] = length of the longest common prefix of suffix SA[i] and suffix SA[i-1]
// lcp[0] = 0
void getlcp(int n)
{
for (int i=0; i<n; ++i)
iSA[SA[i]] = i;
lcp[0] = 0;
for (int i=0, h=0; i<n; ++i)
{
if (iSA[i] > 0)
{
int j = SA[iSA[i]-1];
while (i + h < n && j + h < n && txt[i+h] == txt[j+h])
h++;
lcp[iSA[i]] = h;
if (h > 0)
h--;
}
}
}
// End of longest common prefixes algorithm
int main()
{
int len;
gets(txt);
len = strlen(txt);
suffixSort(len);
for (int i = 0; i < len; ++i)
{
printf("%d\n",SA[i] );
}
return 0;
}
@onewiththeonly1
Copy link

Please explain the SA part, specially the linear time sorting, because of which the code is nlgn.

@Andreshk
Copy link

Andreshk commented Jan 12, 2019

FYI: On line 81 k may reach n, meaning bh[k] and b2h[k] are out-of-bounds accesses.

Something that can be easily detected if you use a proper C++ container, instead of writing unsafe C-style code and saving it with a .cpp extension.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment