Skip to content

Instantly share code, notes, and snippets.

@nishidy
Last active January 9, 2017 14:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nishidy/6a846f2d174b2b3f8c31833513c67918 to your computer and use it in GitHub Desktop.
Save nishidy/6a846f2d174b2b3f8c31833513c67918 to your computer and use it in GitHub Desktop.
Working implementation with main function of SA-IS algorithm for building Suffix Array in C from the paper.
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
unsigned char mask[]={0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
#define tget(i) ( (t[(i)/8]&mask[(i)%8]) ? 1 : 0 )
#define tset(i, b) t[(i)/8]=(b)\
?(mask[(i)%8]|t[(i)/8])\
:((~mask[(i)%8])&t[(i)/8])
#define chr(i) (cs==sizeof(int)\
?((int*)s)[i]\
:((unsigned char *)s)[i])
#define isLMS(i) (i>0 && tget(i) && !tget(i-1))
// find the start or end of each bucket
// - cs is used in definition chr(i)
// - cs means the size(type) of each character in s
// - 'bool end' means you want to find end rather than start
void getBuckets(unsigned char *s, int *bkt, int n, int K, int cs, bool end) {
int i, sum=0;
// clear all buckets
for(i=0; i<=K; i++) bkt[i]=0;
// compute the size of each bucket
for(i=0; i<n; i++) bkt[chr(i)]++;
for(i=0; i<=K; i++)
{ sum+=bkt[i]; bkt[i]=end ? sum : sum-bkt[i]; }
}
// compute SAl
// - end is always false for SAl
void induceSAl(unsigned char *t, int *SA,
unsigned char *s, int *bkt,
int n, int K, int cs, bool end) {
int i, j;
// find starts of buckets
getBuckets(s, bkt, n, K, cs, end);
// - Here, forget the index of LMS in bkt
for(i=0; i<n; i++) {
j=SA[i]-1; // * The next to its left in s
if(j>=0 && !tget(j)) SA[bkt[chr(j)]++]=j;
// - !tget(j) means that this is for S-Type
}
}
// compute SAs
// - end is always true for SAs
void induceSAs(unsigned char *t, int *SA,
unsigned char *s, int *bkt,
int n, int K, int cs, bool end) {
int i, j;
// find ends of buckets
getBuckets(s, bkt, n, K, cs, end);
// find ends of buckets
for(i=n-1; i>=0; i--) {
j=SA[i]-1;
if(j>=0 && tget(j)) SA[--bkt[chr(j)]]=j;
// - tget(j) means that this is for L-Type
// - Be sure that LMS is overwritten here in SA!
}
}
// find the suffix array SA of s[0..n-1] in {1..K}ˆn
// require s[n-1]=0 (the sentinel!), n>=2
// use a working space (excluding s and SA) of
// at most 2.25n+O(1) for a constant alphabet
// - K is # of alphabets in s
// - cs is character size (int or unsignec char)
// This works to use SA(int *) as string s(unsigned char*)
// in recursive call
// - s is represented by integers in ascii
// ex) XXYYZ -> 112230 (K=3) (0 is sentinel)
// - The interger is used as index of bkt
// - This is durable for recursive call
void SA_IS(unsigned char *s, int *SA, int n, int K, int cs) {
// LS-type array in bits
unsigned char *t=(unsigned char *)malloc(n/8+1);
// - this is bit array made of type unsigned char
int i, j;
// classify the type of each character
// the sentinel must be in s1, important!!!
tset(n-2, 0); // - make the bit down
tset(n-1, 1); // - make the bit up // The sentinel
// - The quotinent means the place of a byte
// - The remainder menas a bit
// - ex) n = 10
// - tset(10-2=8, 0)
// - t[8/8]=t[1]=
// - : ((~mask[(8)%8])&t[(8)/8] )
// - ((~mask[0])&t[1])
// - (0b01111111&0)
// - (0b00000000)
// - tset(10-1=9, 1)
// - t[9/8]=t[1]=
// - ? (mask[(9)%8])|t[(9)/8])
// - (mask[1]|t[1])
// - (0b01000000|0)
// - (0b01000000)
for(i=n-3; i>=0; i--){
tset(i, (chr(i)<chr(i+1)
|| (chr(i)==chr(i+1)
&& tget(i+1)==1))?1:0);
}
// stage 1: reduce the problem by at least 1/2
// sort all the S-substrings
// bucket array
int *bkt = (int *)malloc(sizeof(int)*(K+1));
// find ends of buckets
getBuckets(s, bkt, n, K, cs, true);
// - If end is true, bkt saves index of each end of alphabets for SA
// - If end is false , bkt saves index of each start of alphabets for SA
// - SA is n length
// - Initialize the suffix array with -1
for(i=0; i<n; i++) SA[i]=-1;
// - The first character in s is not LMS
// - Therefore, i always starts with 1, not 0
for(i=1; i<n; i++)
if(isLMS(i)) SA[--bkt[chr(i)]]=i;
// - Put the index of each LMS backward
// - in each container of bkt into SA
induceSAl(t, SA, s, bkt, n, K, cs, false);
induceSAs(t, SA, s, bkt, n, K, cs, true);
free(bkt);
// compact all the sorted substrings into
// the first n1 items of SA
// 2*n1 must be not larger than n (proveable)
int n1=0;
for(i=0; i<n; i++)
if(isLMS(SA[i])) SA[n1++]=SA[i];
// - Put LMS forward from the first index in SA
// - After that, initialize SA except for LMS (below)
// - SA is just working variable for this purpose
// find the lexicographic names of substrings
// init the name array buffer
for(i=n1; i<n; i++) SA[i]=-1;
int name=0, prev=-1;
for(i=0; i<n1; i++) {
int pos=SA[i];
bool diff=false;
for(int d=0; d<n; d++) {
if(prev==-1 || chr(pos+d)!=chr(prev+d) || tget(pos+d)!=tget(prev+d)) {
// - The first one (i=0) must come here due to prev==-1
diff=true;
break;
} else if (d>0 && (isLMS(pos+d) || isLMS(prev+d))) {
// - Get out of this loop at next LMS
break;
}
}
if(diff) {
name++; // - count the unique names
prev=pos;
}
pos=(pos%2==0)?pos/2:(pos-1)/2;
// - Why divide by 2?
SA[n1+pos]=name-1;
// - n1 is # of LMS in s
// - Remember that SA contains all LMS in the tail of it
// - This is like to copy from head to tail of SA
}
for(i=n-1, j=n-1; i>=n1; i--)
if(SA[i]>=0) SA[j--]=SA[i];
// stage 2: solve the reduced problem
// recurse if names are not yet unique
int *SA1=SA, *s1=SA+n-n1;
// - Be sure that s1 is int* and +(n-n1) moves the pointer
// - from the head of SA to (n-n1)*sizeof(int)
if(name<n1){
// - The last argument is sizeof(int)
// - Because s1 is int*, not unsigned char*
SA_IS((unsigned char*)s1, SA1, n1, name-1, sizeof(int));
}else{ // generate the suffix array of s1 directly
for(i=0; i<n1; i++) SA1[s1[i]] = i;
}
// stage 3: induce the result for
// the original problem
// bucket array
bkt = (int *)malloc(sizeof(int)*(K+1));
// put all the LMS characters into their buckets
// find ends of buckets
getBuckets(s, bkt, n, K, cs, true);
for(i=1, j=0; i<n; i++)
if(isLMS(i)) s1[j++]=i; // get p1
// get index in s
for(i=0; i<n1; i++) SA1[i]=s1[SA1[i]];
// init SA[n1..n-1]
for(i=n1; i<n; i++) SA[i]=-1;
for(i=n1-1; i>=0; i--) {
j=SA[i];
SA[i]=-1;
SA[--bkt[chr(j)]]=j;
}
induceSAl(t, SA, s, bkt, n, K, cs, false);
induceSAs(t, SA, s, bkt, n, K, cs, true);
free(bkt);
free(t);
}
int count_uniq(int num, unsigned char* input, unsigned char* names){
int i, ascii[127] = {0}, count = 0;
for(i=0;i<num;i++)
ascii[input[i]]++;
for(i=0;i<127;i++)
if(ascii[i]>0)
names[i] = ++count;
return count;
}
int main(int argc, char* argv[]){
if(argc!=2) return 1;
unsigned char* input = (unsigned char*)argv[1];
int n=0;
for(;input[n]!='\0';++n);
printf("Raw input string(%d) : %s\n",n,input);
unsigned char names[127] = {0};
unsigned char* s = malloc(sizeof(unsigned char*)*(n+1));
int K = count_uniq(n,input,names);
n++; // For sentinel
for(int i=0;i<n-1;i++)
s[i] = (unsigned char)names[input[i]];
s[n-1] = 0; // sentinel
printf("Named input string(%d) : ",n);
for(int i=0;i<n;i++)
printf("%d",s[i]);
printf("\n");
int* SA = malloc(sizeof(int*)*n);
SA_IS(s,SA,n,K,sizeof(unsigned char));
printf("The suffix array :\n");
// - The first one is always sentinel, so skip it.
for(int i=1;i<n;i++){
printf("%d:%d:",i,SA[i]);
for(int j=SA[i];j<n;j++){
for(int k=0;k<127;k++){
if(names[k]==s[j]){
printf("%c",(unsigned char)k);
break;
}
}
}
printf("\n");
}
}
@nishidy
Copy link
Author

nishidy commented Jan 9, 2017

$ ./sais mmiissiissiippii
Raw input string(16) : mmiissiissiippii
Named input string(17) : 22114411441133110
The suffix array :
1:15:i
2:14:ii
3:10:iippii
4:6:iissiippii
5:2:iissiissiippii
6:11:ippii
7:7:issiippii
8:3:issiissiippii
9:1:miissiissiippii
10:0:mmiissiissiippii
11:13:pii
12:12:ppii
13:9:siippii
14:5:siissiippii
15:8:ssiippii
16:4:ssiissiippii

@nishidy
Copy link
Author

nishidy commented Jan 9, 2017

$ ./sais abracadabra
Raw input string(11) : abracadabra
Named input string(12) : 125131412510
The suffix array :
1:10:a
2:7:abra
3:0:abracadabra
4:3:acadabra
5:5:adabra
6:8:bra
7:1:bracadabra
8:4:cadabra
9:6:dabra
10:9:ra
11:2:racadabra

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment