nishidy/sais_orig.c

## sais_orig.c
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

unsigned char mask[]={0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};

#define tget(i) ( (t[(i)/8]&mask[(i)%8]) ? 1 : 0 )

#define tset(i, b) t[(i)/8]=(b)\
                      ?(mask[(i)%8]|t[(i)/8])\
                      :((~mask[(i)%8])&t[(i)/8])

#define chr(i) (cs==sizeof(int)\
                      ?((int*)s)[i]\
                      :((unsigned char *)s)[i])

#define isLMS(i) (i>0 && tget(i) && !tget(i-1))

// find the start or end of each bucket
// - cs is used in definition chr(i)
// - cs means the size(type) of each character in s
// - 'bool end' means you want to find end rather than start
void getBuckets(unsigned char *s, int *bkt, int n, int K, int cs, bool end) {

  int i, sum=0;

  // clear all buckets
  for(i=0; i<=K; i++) bkt[i]=0;

  // compute the size of each bucket
  for(i=0; i<n; i++) bkt[chr(i)]++;

  for(i=0; i<=K; i++)
  { sum+=bkt[i]; bkt[i]=end ? sum : sum-bkt[i]; }

}

// compute SAl
// - end is always false for SAl
void induceSAl(unsigned char *t, int *SA,
               unsigned char *s, int *bkt,
               int n, int K, int cs, bool end) {
  int i, j;

  // find starts of buckets
  getBuckets(s, bkt, n, K, cs, end);
  // - Here, forget the index of LMS in bkt

  for(i=0; i<n; i++) {
      j=SA[i]-1; // * The next to its left in s
      if(j>=0 && !tget(j)) SA[bkt[chr(j)]++]=j;
      // - !tget(j) means that this is for S-Type
  }
}

// compute SAs
// - end is always true for SAs
void induceSAs(unsigned char *t, int *SA,
               unsigned char *s, int *bkt,
               int n, int K, int cs, bool end) {
  int i, j;

  // find ends of buckets
  getBuckets(s, bkt, n, K, cs, end);

  // find ends of buckets
  for(i=n-1; i>=0; i--) {
      j=SA[i]-1;
      if(j>=0 && tget(j)) SA[--bkt[chr(j)]]=j;
      // - tget(j) means that this is for L-Type
      // - Be sure that LMS is overwritten here in SA!
  }
}

// find the suffix array SA of s[0..n-1] in {1..K}ˆn
// require s[n-1]=0 (the sentinel!), n>=2
// use a working space (excluding s and SA) of
// at most 2.25n+O(1) for a constant alphabet

// - K is # of alphabets in s
// - cs is character size (int or unsignec char)
//   This works to use SA(int *) as string s(unsigned char*)
//   in recursive call
// - s is represented by integers in ascii
//   ex) XXYYZ -> 112230 (K=3) (0 is sentinel)
//   - The interger is used as index of bkt
//   - This is durable for recursive call

void SA_IS(unsigned char *s, int *SA, int n, int K, int cs) {

  // LS-type array in bits
  unsigned char *t=(unsigned char *)malloc(n/8+1);
  // - this is bit array made of type unsigned char

  int i, j;

  // classify the type of each character
  // the sentinel must be in s1, important!!!

  tset(n-2, 0); // - make the bit down
  tset(n-1, 1); // - make the bit up // The sentinel

  // - The quotinent means the place of a byte
  // - The remainder menas a bit

  // - ex) n = 10
  // - tset(10-2=8, 0)
  // - t[8/8]=t[1]=
  // - : ((~mask[(8)%8])&t[(8)/8] )
  // -   ((~mask[0])&t[1])
  // -   (0b01111111&0)
  // -   (0b00000000)

  // - tset(10-1=9, 1)
  // - t[9/8]=t[1]=
  // - ? (mask[(9)%8])|t[(9)/8])
  // -   (mask[1]|t[1])
  // -   (0b01000000|0)
  // -   (0b01000000)

  for(i=n-3; i>=0; i--){
    tset(i, (chr(i)<chr(i+1)
             || (chr(i)==chr(i+1)
             && tget(i+1)==1))?1:0);
  }

  // stage 1: reduce the problem by at least 1/2
  // sort all the S-substrings

  // bucket array
  int *bkt = (int *)malloc(sizeof(int)*(K+1));

  // find ends of buckets
  getBuckets(s, bkt, n, K, cs, true);
  // - If end is true, bkt saves index of each end of alphabets for SA
  // - If end is false , bkt saves index of each start of alphabets for SA

  // - SA is n length
  // - Initialize the suffix array with -1
  for(i=0; i<n; i++) SA[i]=-1;

  // - The first character in s is not LMS
  // - Therefore, i always starts with 1, not 0
  for(i=1; i<n; i++)
    if(isLMS(i)) SA[--bkt[chr(i)]]=i;
  // - Put the index of each LMS backward
  // - in each container of bkt into SA

  induceSAl(t, SA, s, bkt, n, K, cs, false);
  induceSAs(t, SA, s, bkt, n, K, cs, true);
  free(bkt);

  // compact all the sorted substrings into
  // the first n1 items of SA
  // 2*n1 must be not larger than n (proveable)
  int n1=0;
  for(i=0; i<n; i++)
    if(isLMS(SA[i])) SA[n1++]=SA[i];
  // - Put LMS forward from the first index in SA
  // - After that, initialize SA except for LMS (below)
  // - SA is just working variable for this purpose

  // find the lexicographic names of substrings
  // init the name array buffer
  for(i=n1; i<n; i++) SA[i]=-1;

  int name=0, prev=-1;
  for(i=0; i<n1; i++) {
    int pos=SA[i];
    bool diff=false;
    for(int d=0; d<n; d++) {
      if(prev==-1 || chr(pos+d)!=chr(prev+d) || tget(pos+d)!=tget(prev+d)) {
          // - The first one (i=0) must come here due to prev==-1
          diff=true;
          break;
      } else if (d>0 && (isLMS(pos+d) || isLMS(prev+d))) {
          // - Get out of this loop at next LMS
          break;
      }
    }

    if(diff) {
        name++; // - count the unique names
        prev=pos;
    }
    pos=(pos%2==0)?pos/2:(pos-1)/2;
    // - Why divide by 2?
    SA[n1+pos]=name-1;
    // - n1 is # of LMS in s
    // - Remember that SA contains all LMS in the tail of it
    // - This is like to copy from head to tail of SA
  }

  for(i=n-1, j=n-1; i>=n1; i--)
      if(SA[i]>=0) SA[j--]=SA[i];

  // stage 2: solve the reduced problem
  // recurse if names are not yet unique
  int *SA1=SA, *s1=SA+n-n1;
  // - Be sure that s1 is int* and +(n-n1) moves the pointer
  // - from the head of SA to (n-n1)*sizeof(int)

  if(name<n1){
    // - The last argument is sizeof(int)
    // - Because s1 is int*, not unsigned char*
    SA_IS((unsigned char*)s1, SA1, n1, name-1, sizeof(int));
  }else{ // generate the suffix array of s1 directly
    for(i=0; i<n1; i++) SA1[s1[i]] = i;
  }

  // stage 3: induce the result for
  // the original problem

  // bucket array
  bkt = (int *)malloc(sizeof(int)*(K+1));

  // put all the LMS characters into their buckets
  // find ends of buckets
  getBuckets(s, bkt, n, K, cs, true);

  for(i=1, j=0; i<n; i++)
    if(isLMS(i)) s1[j++]=i; // get p1

  // get index in s
  for(i=0; i<n1; i++) SA1[i]=s1[SA1[i]];

  // init SA[n1..n-1]
  for(i=n1; i<n; i++) SA[i]=-1;

  for(i=n1-1; i>=0; i--) {
      j=SA[i];
      SA[i]=-1;
      SA[--bkt[chr(j)]]=j;
  }

  induceSAl(t, SA, s, bkt, n, K, cs, false);
  induceSAs(t, SA, s, bkt, n, K, cs, true);
  free(bkt);
  free(t);
}

int count_uniq(int num, unsigned char* input, unsigned char* names){
    int i, ascii[127] = {0}, count = 0;
    for(i=0;i<num;i++)
        ascii[input[i]]++;
    for(i=0;i<127;i++)
        if(ascii[i]>0)
            names[i] = ++count;
    return count;
}

int main(int argc, char* argv[]){
    if(argc!=2) return 1;

    unsigned char* input = (unsigned char*)argv[1];
    int n=0;
    for(;input[n]!='\0';++n);

    printf("Raw input string(%d) : %s\n",n,input);

    unsigned char names[127] = {0};
    unsigned char* s = malloc(sizeof(unsigned char*)*(n+1));

    int K = count_uniq(n,input,names);

    n++; // For sentinel
    for(int i=0;i<n-1;i++)
        s[i] = (unsigned char)names[input[i]];
    s[n-1] = 0; // sentinel

    printf("Named input string(%d) : ",n);
    for(int i=0;i<n;i++)
        printf("%d",s[i]);
    printf("\n");

    int* SA = malloc(sizeof(int*)*n);
    SA_IS(s,SA,n,K,sizeof(unsigned char));

    printf("The suffix array :\n");
    // - The first one is always sentinel, so skip it.
    for(int i=1;i<n;i++){
        printf("%d:%d:",i,SA[i]);
        for(int j=SA[i];j<n;j++){
            for(int k=0;k<127;k++){
                if(names[k]==s[j]){
                    printf("%c",(unsigned char)k);
                    break;
                }
            }
        }
        printf("\n");
    }
}
	#include <stdio.h>
	#include <stdlib.h>
	#include <stdbool.h>

	unsigned char mask[]={0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};

	#define tget(i) ( (t[(i)/8]&mask[(i)%8]) ? 1 : 0 )

	#define tset(i, b) t[(i)/8]=(b)\
	?(mask[(i)%8]\|t[(i)/8])\
	:((~mask[(i)%8])&t[(i)/8])

	#define chr(i) (cs==sizeof(int)\
	?((int*)s)[i]\
	:((unsigned char *)s)[i])

	#define isLMS(i) (i>0 && tget(i) && !tget(i-1))

	// find the start or end of each bucket
	// - cs is used in definition chr(i)
	// - cs means the size(type) of each character in s
	// - 'bool end' means you want to find end rather than start
	void getBuckets(unsigned char s, int bkt, int n, int K, int cs, bool end) {

	int i, sum=0;

	// clear all buckets
	for(i=0; i<=K; i++) bkt[i]=0;

	// compute the size of each bucket
	for(i=0; i<n; i++) bkt[chr(i)]++;

	for(i=0; i<=K; i++)
	{ sum+=bkt[i]; bkt[i]=end ? sum : sum-bkt[i]; }

	}

	// compute SAl
	// - end is always false for SAl
	void induceSAl(unsigned char t, int SA,
	unsigned char s, int bkt,
	int n, int K, int cs, bool end) {
	int i, j;

	// find starts of buckets
	getBuckets(s, bkt, n, K, cs, end);
	// - Here, forget the index of LMS in bkt

	for(i=0; i<n; i++) {
	j=SA[i]-1; // * The next to its left in s
	if(j>=0 && !tget(j)) SA[bkt[chr(j)]++]=j;
	// - !tget(j) means that this is for S-Type
	}
	}

	// compute SAs
	// - end is always true for SAs
	void induceSAs(unsigned char t, int SA,
	unsigned char s, int bkt,
	int n, int K, int cs, bool end) {
	int i, j;

	// find ends of buckets
	getBuckets(s, bkt, n, K, cs, end);

	// find ends of buckets
	for(i=n-1; i>=0; i--) {
	j=SA[i]-1;
	if(j>=0 && tget(j)) SA[--bkt[chr(j)]]=j;
	// - tget(j) means that this is for L-Type
	// - Be sure that LMS is overwritten here in SA!
	}
	}

	// find the suffix array SA of s[0..n-1] in {1..K}ˆn
	// require s[n-1]=0 (the sentinel!), n>=2
	// use a working space (excluding s and SA) of
	// at most 2.25n+O(1) for a constant alphabet

	// - K is # of alphabets in s
	// - cs is character size (int or unsignec char)
	// This works to use SA(int ) as string s(unsigned char)
	// in recursive call
	// - s is represented by integers in ascii
	// ex) XXYYZ -> 112230 (K=3) (0 is sentinel)
	// - The interger is used as index of bkt
	// - This is durable for recursive call

	void SA_IS(unsigned char s, int SA, int n, int K, int cs) {

	// LS-type array in bits
	unsigned char t=(unsigned char )malloc(n/8+1);
	// - this is bit array made of type unsigned char

	int i, j;

	// classify the type of each character
	// the sentinel must be in s1, important!!!

	tset(n-2, 0); // - make the bit down
	tset(n-1, 1); // - make the bit up // The sentinel

	// - The quotinent means the place of a byte
	// - The remainder menas a bit

	// - ex) n = 10
	// - tset(10-2=8, 0)
	// - t[8/8]=t[1]=
	// - : ((~mask[(8)%8])&t[(8)/8] )
	// - ((~mask[0])&t[1])
	// - (0b01111111&0)
	// - (0b00000000)

	// - tset(10-1=9, 1)
	// - t[9/8]=t[1]=
	// - ? (mask[(9)%8])\|t[(9)/8])
	// - (mask[1]\|t[1])
	// - (0b01000000\|0)
	// - (0b01000000)

	for(i=n-3; i>=0; i--){
	tset(i, (chr(i)<chr(i+1)
	\|\| (chr(i)==chr(i+1)
	&& tget(i+1)==1))?1:0);
	}

	// stage 1: reduce the problem by at least 1/2
	// sort all the S-substrings

	// bucket array
	int bkt = (int )malloc(sizeof(int)*(K+1));

	// find ends of buckets
	getBuckets(s, bkt, n, K, cs, true);
	// - If end is true, bkt saves index of each end of alphabets for SA
	// - If end is false , bkt saves index of each start of alphabets for SA

	// - SA is n length
	// - Initialize the suffix array with -1
	for(i=0; i<n; i++) SA[i]=-1;

	// - The first character in s is not LMS
	// - Therefore, i always starts with 1, not 0
	for(i=1; i<n; i++)
	if(isLMS(i)) SA[--bkt[chr(i)]]=i;
	// - Put the index of each LMS backward
	// - in each container of bkt into SA

	induceSAl(t, SA, s, bkt, n, K, cs, false);
	induceSAs(t, SA, s, bkt, n, K, cs, true);
	free(bkt);

	// compact all the sorted substrings into
	// the first n1 items of SA
	// 2*n1 must be not larger than n (proveable)
	int n1=0;
	for(i=0; i<n; i++)
	if(isLMS(SA[i])) SA[n1++]=SA[i];
	// - Put LMS forward from the first index in SA
	// - After that, initialize SA except for LMS (below)
	// - SA is just working variable for this purpose

	// find the lexicographic names of substrings
	// init the name array buffer
	for(i=n1; i<n; i++) SA[i]=-1;

	int name=0, prev=-1;
	for(i=0; i<n1; i++) {
	int pos=SA[i];
	bool diff=false;
	for(int d=0; d<n; d++) {
	if(prev==-1 \|\| chr(pos+d)!=chr(prev+d) \|\| tget(pos+d)!=tget(prev+d)) {
	// - The first one (i=0) must come here due to prev==-1
	diff=true;
	break;
	} else if (d>0 && (isLMS(pos+d) \|\| isLMS(prev+d))) {
	// - Get out of this loop at next LMS
	break;
	}
	}

	if(diff) {
	name++; // - count the unique names
	prev=pos;
	}
	pos=(pos%2==0)?pos/2:(pos-1)/2;
	// - Why divide by 2?
	SA[n1+pos]=name-1;
	// - n1 is # of LMS in s
	// - Remember that SA contains all LMS in the tail of it
	// - This is like to copy from head to tail of SA
	}

	for(i=n-1, j=n-1; i>=n1; i--)
	if(SA[i]>=0) SA[j--]=SA[i];

	// stage 2: solve the reduced problem
	// recurse if names are not yet unique
	int SA1=SA, s1=SA+n-n1;
	// - Be sure that s1 is int* and +(n-n1) moves the pointer
	// - from the head of SA to (n-n1)*sizeof(int)

	if(name<n1){
	// - The last argument is sizeof(int)
	// - Because s1 is int, not unsigned char
	SA_IS((unsigned char*)s1, SA1, n1, name-1, sizeof(int));
	}else{ // generate the suffix array of s1 directly
	for(i=0; i<n1; i++) SA1[s1[i]] = i;
	}

	// stage 3: induce the result for
	// the original problem

	// bucket array
	bkt = (int )malloc(sizeof(int)(K+1));

	// put all the LMS characters into their buckets
	// find ends of buckets
	getBuckets(s, bkt, n, K, cs, true);

	for(i=1, j=0; i<n; i++)
	if(isLMS(i)) s1[j++]=i; // get p1

	// get index in s
	for(i=0; i<n1; i++) SA1[i]=s1[SA1[i]];

	// init SA[n1..n-1]
	for(i=n1; i<n; i++) SA[i]=-1;

	for(i=n1-1; i>=0; i--) {
	j=SA[i];
	SA[i]=-1;
	SA[--bkt[chr(j)]]=j;
	}

	induceSAl(t, SA, s, bkt, n, K, cs, false);
	induceSAs(t, SA, s, bkt, n, K, cs, true);
	free(bkt);
	free(t);
	}

	int count_uniq(int num, unsigned char* input, unsigned char* names){
	int i, ascii[127] = {0}, count = 0;
	for(i=0;i<num;i++)
	ascii[input[i]]++;
	for(i=0;i<127;i++)
	if(ascii[i]>0)
	names[i] = ++count;
	return count;
	}

	int main(int argc, char* argv[]){
	if(argc!=2) return 1;

	unsigned char* input = (unsigned char*)argv[1];
	int n=0;
	for(;input[n]!='\0';++n);

	printf("Raw input string(%d) : %s\n",n,input);

	unsigned char names[127] = {0};
	unsigned char* s = malloc(sizeof(unsigned char)(n+1));

	int K = count_uniq(n,input,names);

	n++; // For sentinel
	for(int i=0;i<n-1;i++)
	s[i] = (unsigned char)names[input[i]];
	s[n-1] = 0; // sentinel

	printf("Named input string(%d) : ",n);
	for(int i=0;i<n;i++)
	printf("%d",s[i]);
	printf("\n");

	int* SA = malloc(sizeof(int)n);
	SA_IS(s,SA,n,K,sizeof(unsigned char));

	printf("The suffix array :\n");
	// - The first one is always sentinel, so skip it.
	for(int i=1;i<n;i++){
	printf("%d:%d:",i,SA[i]);
	for(int j=SA[i];j<n;j++){
	for(int k=0;k<127;k++){
	if(names[k]==s[j]){
	printf("%c",(unsigned char)k);
	break;
	}
	}
	}
	printf("\n");
	}
	}