Last active
January 9, 2017 14:31
-
-
Save nishidy/6a846f2d174b2b3f8c31833513c67918 to your computer and use it in GitHub Desktop.
Working implementation with main function of SA-IS algorithm for building Suffix Array in C from the paper.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdbool.h> | |
unsigned char mask[]={0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01}; | |
#define tget(i) ( (t[(i)/8]&mask[(i)%8]) ? 1 : 0 ) | |
#define tset(i, b) t[(i)/8]=(b)\ | |
?(mask[(i)%8]|t[(i)/8])\ | |
:((~mask[(i)%8])&t[(i)/8]) | |
#define chr(i) (cs==sizeof(int)\ | |
?((int*)s)[i]\ | |
:((unsigned char *)s)[i]) | |
#define isLMS(i) (i>0 && tget(i) && !tget(i-1)) | |
// find the start or end of each bucket | |
// - cs is used in definition chr(i) | |
// - cs means the size(type) of each character in s | |
// - 'bool end' means you want to find end rather than start | |
void getBuckets(unsigned char *s, int *bkt, int n, int K, int cs, bool end) { | |
int i, sum=0; | |
// clear all buckets | |
for(i=0; i<=K; i++) bkt[i]=0; | |
// compute the size of each bucket | |
for(i=0; i<n; i++) bkt[chr(i)]++; | |
for(i=0; i<=K; i++) | |
{ sum+=bkt[i]; bkt[i]=end ? sum : sum-bkt[i]; } | |
} | |
// compute SAl | |
// - end is always false for SAl | |
void induceSAl(unsigned char *t, int *SA, | |
unsigned char *s, int *bkt, | |
int n, int K, int cs, bool end) { | |
int i, j; | |
// find starts of buckets | |
getBuckets(s, bkt, n, K, cs, end); | |
// - Here, forget the index of LMS in bkt | |
for(i=0; i<n; i++) { | |
j=SA[i]-1; // * The next to its left in s | |
if(j>=0 && !tget(j)) SA[bkt[chr(j)]++]=j; | |
// - !tget(j) means that this is for S-Type | |
} | |
} | |
// compute SAs | |
// - end is always true for SAs | |
void induceSAs(unsigned char *t, int *SA, | |
unsigned char *s, int *bkt, | |
int n, int K, int cs, bool end) { | |
int i, j; | |
// find ends of buckets | |
getBuckets(s, bkt, n, K, cs, end); | |
// find ends of buckets | |
for(i=n-1; i>=0; i--) { | |
j=SA[i]-1; | |
if(j>=0 && tget(j)) SA[--bkt[chr(j)]]=j; | |
// - tget(j) means that this is for L-Type | |
// - Be sure that LMS is overwritten here in SA! | |
} | |
} | |
// find the suffix array SA of s[0..n-1] in {1..K}ˆn | |
// require s[n-1]=0 (the sentinel!), n>=2 | |
// use a working space (excluding s and SA) of | |
// at most 2.25n+O(1) for a constant alphabet | |
// - K is # of alphabets in s | |
// - cs is character size (int or unsignec char) | |
// This works to use SA(int *) as string s(unsigned char*) | |
// in recursive call | |
// - s is represented by integers in ascii | |
// ex) XXYYZ -> 112230 (K=3) (0 is sentinel) | |
// - The interger is used as index of bkt | |
// - This is durable for recursive call | |
void SA_IS(unsigned char *s, int *SA, int n, int K, int cs) { | |
// LS-type array in bits | |
unsigned char *t=(unsigned char *)malloc(n/8+1); | |
// - this is bit array made of type unsigned char | |
int i, j; | |
// classify the type of each character | |
// the sentinel must be in s1, important!!! | |
tset(n-2, 0); // - make the bit down | |
tset(n-1, 1); // - make the bit up // The sentinel | |
// - The quotinent means the place of a byte | |
// - The remainder menas a bit | |
// - ex) n = 10 | |
// - tset(10-2=8, 0) | |
// - t[8/8]=t[1]= | |
// - : ((~mask[(8)%8])&t[(8)/8] ) | |
// - ((~mask[0])&t[1]) | |
// - (0b01111111&0) | |
// - (0b00000000) | |
// - tset(10-1=9, 1) | |
// - t[9/8]=t[1]= | |
// - ? (mask[(9)%8])|t[(9)/8]) | |
// - (mask[1]|t[1]) | |
// - (0b01000000|0) | |
// - (0b01000000) | |
for(i=n-3; i>=0; i--){ | |
tset(i, (chr(i)<chr(i+1) | |
|| (chr(i)==chr(i+1) | |
&& tget(i+1)==1))?1:0); | |
} | |
// stage 1: reduce the problem by at least 1/2 | |
// sort all the S-substrings | |
// bucket array | |
int *bkt = (int *)malloc(sizeof(int)*(K+1)); | |
// find ends of buckets | |
getBuckets(s, bkt, n, K, cs, true); | |
// - If end is true, bkt saves index of each end of alphabets for SA | |
// - If end is false , bkt saves index of each start of alphabets for SA | |
// - SA is n length | |
// - Initialize the suffix array with -1 | |
for(i=0; i<n; i++) SA[i]=-1; | |
// - The first character in s is not LMS | |
// - Therefore, i always starts with 1, not 0 | |
for(i=1; i<n; i++) | |
if(isLMS(i)) SA[--bkt[chr(i)]]=i; | |
// - Put the index of each LMS backward | |
// - in each container of bkt into SA | |
induceSAl(t, SA, s, bkt, n, K, cs, false); | |
induceSAs(t, SA, s, bkt, n, K, cs, true); | |
free(bkt); | |
// compact all the sorted substrings into | |
// the first n1 items of SA | |
// 2*n1 must be not larger than n (proveable) | |
int n1=0; | |
for(i=0; i<n; i++) | |
if(isLMS(SA[i])) SA[n1++]=SA[i]; | |
// - Put LMS forward from the first index in SA | |
// - After that, initialize SA except for LMS (below) | |
// - SA is just working variable for this purpose | |
// find the lexicographic names of substrings | |
// init the name array buffer | |
for(i=n1; i<n; i++) SA[i]=-1; | |
int name=0, prev=-1; | |
for(i=0; i<n1; i++) { | |
int pos=SA[i]; | |
bool diff=false; | |
for(int d=0; d<n; d++) { | |
if(prev==-1 || chr(pos+d)!=chr(prev+d) || tget(pos+d)!=tget(prev+d)) { | |
// - The first one (i=0) must come here due to prev==-1 | |
diff=true; | |
break; | |
} else if (d>0 && (isLMS(pos+d) || isLMS(prev+d))) { | |
// - Get out of this loop at next LMS | |
break; | |
} | |
} | |
if(diff) { | |
name++; // - count the unique names | |
prev=pos; | |
} | |
pos=(pos%2==0)?pos/2:(pos-1)/2; | |
// - Why divide by 2? | |
SA[n1+pos]=name-1; | |
// - n1 is # of LMS in s | |
// - Remember that SA contains all LMS in the tail of it | |
// - This is like to copy from head to tail of SA | |
} | |
for(i=n-1, j=n-1; i>=n1; i--) | |
if(SA[i]>=0) SA[j--]=SA[i]; | |
// stage 2: solve the reduced problem | |
// recurse if names are not yet unique | |
int *SA1=SA, *s1=SA+n-n1; | |
// - Be sure that s1 is int* and +(n-n1) moves the pointer | |
// - from the head of SA to (n-n1)*sizeof(int) | |
if(name<n1){ | |
// - The last argument is sizeof(int) | |
// - Because s1 is int*, not unsigned char* | |
SA_IS((unsigned char*)s1, SA1, n1, name-1, sizeof(int)); | |
}else{ // generate the suffix array of s1 directly | |
for(i=0; i<n1; i++) SA1[s1[i]] = i; | |
} | |
// stage 3: induce the result for | |
// the original problem | |
// bucket array | |
bkt = (int *)malloc(sizeof(int)*(K+1)); | |
// put all the LMS characters into their buckets | |
// find ends of buckets | |
getBuckets(s, bkt, n, K, cs, true); | |
for(i=1, j=0; i<n; i++) | |
if(isLMS(i)) s1[j++]=i; // get p1 | |
// get index in s | |
for(i=0; i<n1; i++) SA1[i]=s1[SA1[i]]; | |
// init SA[n1..n-1] | |
for(i=n1; i<n; i++) SA[i]=-1; | |
for(i=n1-1; i>=0; i--) { | |
j=SA[i]; | |
SA[i]=-1; | |
SA[--bkt[chr(j)]]=j; | |
} | |
induceSAl(t, SA, s, bkt, n, K, cs, false); | |
induceSAs(t, SA, s, bkt, n, K, cs, true); | |
free(bkt); | |
free(t); | |
} | |
int count_uniq(int num, unsigned char* input, unsigned char* names){ | |
int i, ascii[127] = {0}, count = 0; | |
for(i=0;i<num;i++) | |
ascii[input[i]]++; | |
for(i=0;i<127;i++) | |
if(ascii[i]>0) | |
names[i] = ++count; | |
return count; | |
} | |
int main(int argc, char* argv[]){ | |
if(argc!=2) return 1; | |
unsigned char* input = (unsigned char*)argv[1]; | |
int n=0; | |
for(;input[n]!='\0';++n); | |
printf("Raw input string(%d) : %s\n",n,input); | |
unsigned char names[127] = {0}; | |
unsigned char* s = malloc(sizeof(unsigned char*)*(n+1)); | |
int K = count_uniq(n,input,names); | |
n++; // For sentinel | |
for(int i=0;i<n-1;i++) | |
s[i] = (unsigned char)names[input[i]]; | |
s[n-1] = 0; // sentinel | |
printf("Named input string(%d) : ",n); | |
for(int i=0;i<n;i++) | |
printf("%d",s[i]); | |
printf("\n"); | |
int* SA = malloc(sizeof(int*)*n); | |
SA_IS(s,SA,n,K,sizeof(unsigned char)); | |
printf("The suffix array :\n"); | |
// - The first one is always sentinel, so skip it. | |
for(int i=1;i<n;i++){ | |
printf("%d:%d:",i,SA[i]); | |
for(int j=SA[i];j<n;j++){ | |
for(int k=0;k<127;k++){ | |
if(names[k]==s[j]){ | |
printf("%c",(unsigned char)k); | |
break; | |
} | |
} | |
} | |
printf("\n"); | |
} | |
} |
Author
nishidy
commented
Jan 9, 2017
•
$ ./sais abracadabra
Raw input string(11) : abracadabra
Named input string(12) : 125131412510
The suffix array :
1:10:a
2:7:abra
3:0:abracadabra
4:3:acadabra
5:5:adabra
6:8:bra
7:1:bracadabra
8:4:cadabra
9:6:dabra
10:9:ra
11:2:racadabra
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment