Skip to content

Instantly share code, notes, and snippets.

@tomonari-masada
Created February 28, 2017 08:47
Show Gist options
  • Save tomonari-masada/a2a744a5b8cb19e0727f4efa62577352 to your computer and use it in GitHub Desktop.
Save tomonari-masada/a2a744a5b8cb19e0727f4efa62577352 to your computer and use it in GitHub Desktop.
extracting maximal substrings from UTF8 Japanese strings
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#define SEPCHAR '_'
#define MAXLEN 32
#define BUFFSIZE 1000000
#define TOKENLEN 8
#define getUTF8CharSize(c) ((((c) & 0x80) == 0) ? 1 : ((((c) & 0xe0) == 0xc0) ? 2 : ((((c) & 0xf0) == 0xe0) ? 3 : ((((c) & 0xf8) == 0xf0) ? 0 : ((((c) & 0xfc) == 0xf8) ? 0 : ((((c) & 0xfe) == 0xfc) ? 0 : 0))))))
int nNumOfChars;
char **sChars;
int getUTF8CharInt(char *s)
{
char c = s[0];
if ((c & 0x80) == 0) return (int) (c & 0x7f);
else if ((c & 0xe0) == 0xc0)
return ((int) ((s[0] & 0x1f) >> 2)) * 0x100 + ((int) ((s[0] & 0x3) << 6)) + ((int) (s[1] & 0x3f));
else if ((c & 0xf0) == 0xe0)
return (((int) ((s[0] & 0xf) << 4)) + ((int) ((s[1] & 0x3c) >> 2))) * 0x100
+ ((int) ((s[1] & 0x3) << 6)) + ((int) (s[2] & 0x3f));
else if ((c & 0xf8) == 0xf0) return 0;
else if ((c & 0xfc) == 0xf8) return 0;
else if ((c & 0xfe) == 0xfc) return 0;
else return 0;
}
int getUTF8IntChar(int n, char *s)
{
if (n <= 0x7f) { s[0] = (char) n; return 1; }
else if (n <= 0x7ff) {
s[0] = ((char) (n / 0x40)) | 0xc0;
s[1] = ((char) (n & 0x3f)) | 0x80;
return 2; }
else if (n <= 0xffff) {
s[0] = ((char) (n / 0x1000)) | 0xe0;
s[1] = ((char) ((n & 0xfff) / 0x40)) | 0x80;
s[2] = ((char) (n & 0x3f)) | 0x80;
return 3; }
else
return 0;
}
void fprintUTF8(FILE *f, int *p, int n)
{
char s[6];
int i;
if (n == 0) n = 1;
for (i = 0; i < n; i ++) {
if ((*(p)) == 0) return;
memset(s, 0, 6);
getUTF8IntChar(*(p ++), s);
fprintf(f, "%s", s);
}
return;
}
int getUTF8str(char *t, int *p, int n)
{
char s[6];
int i, l;
if (n == 0) n = 1;
l = 0;
for (i = 0; i < n; i ++) {
if ((*(p)) == 0) break;
memset(s, 0, 6);
getUTF8IntChar(*(p ++), s);
l += strlen(s);
if (l >= MAXLEN) return 1;
strcat(t, s);
}
i = 0;
while (t[i] != 0) if (t[i] == SEPCHAR) return 1; else i ++;
return 0;
}
// https://sites.google.com/site/yuta256/sais
unsigned char mask[] = {0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
#define tget(i) ( (t[(i) / 8] & mask[(i) % 8]) ? 1 : 0 )
#define tset(i, b) t[(i) / 8] = (b) ? (mask[(i) % 8] | t[(i) / 8]) : ((~ mask[(i) % 8]) & t[(i) / 8])
#define chr(i) (cs == sizeof(int) ? ((int *) s)[i] : ((unsigned char *) s)[i])
#define isLMS(i) (i > 0 && tget(i) && ! tget(i - 1))
void getBuckets(unsigned char *s, int *bkt, int n, int K, int cs, bool end)
{
int i, sum = 0;
for (i = 0; i <= K; i ++) bkt[i] = 0;
for (i = 0; i < n; i ++) bkt[chr(i)] ++;
for (i = 0; i <= K; i ++) { sum += bkt[i]; bkt[i] = end ? sum : sum - bkt[i]; }
}
void induceSAl(unsigned char *t, int *SA, unsigned char *s, int *bkt, int n, int K, int cs)
{
int i, j;
getBuckets(s, bkt, n, K, cs, false);
for (i = 0; i < n; i ++) {
j = SA[i] - 1;
if (j >= 0 && ! tget(j)) SA[bkt[chr(j)] ++ ] = j;
}
}
void induceSAs(unsigned char *t, int *SA, unsigned char *s, int *bkt, int n, int K, int cs)
{
int i, j;
getBuckets(s, bkt, n, K, cs, true);
for (i = n - 1; i >= 0; i --) {
j = SA[i] - 1;
if (j >= 0 && tget(j)) SA[-- bkt[chr(j)]] = j;
}
}
void SAIS(unsigned char *s, int *SA, int n, int K, int cs)
{
int i, j;
unsigned char *t = (unsigned char *) malloc(n / 8 + 1);
tset(n - 2, 0); tset(n - 1, 1);
for (i = n - 3; i >= 0; i --)
tset(i, (chr(i) < chr(i + 1) || (chr(i) == chr(i + 1) && tget(i + 1) == 1)) ? 1 : 0);
int *bkt = (int *) malloc(sizeof(int) * (K + 1));
getBuckets(s, bkt, n, K, cs, true);
for (i = 0; i < n; i ++) SA[i] = -1;
for (i = 1; i < n; i ++)
if (isLMS(i)) SA[-- bkt[chr(i)]] = i;
induceSAl(t, SA, s, bkt, n, K, cs);
induceSAs(t, SA, s, bkt, n, K, cs);
free(bkt);
int n1 = 0;
for (i = 0; i < n; i ++)
if (isLMS(SA[i])) SA[n1 ++] = SA[i];
for (i = n1; i < n; i ++) SA[i] = -1;
int name = 0, prev = -1;
for (i = 0; i < n1; i ++) {
int pos = SA[i]; bool diff = false;
int d;
for (d = 0; d < n; d ++)
if (prev == -1 || chr(pos + d) != chr(prev + d) || tget(pos + d) != tget(prev + d))
{ diff = true; break; }
else if (d > 0 && (isLMS(pos + d) || isLMS(prev + d))) break;
if (diff) { name ++; prev = pos; }
pos = (pos % 2 == 0) ? pos / 2 : (pos - 1) / 2;
SA[n1 + pos] = name - 1;
}
for (i = n - 1, j = n - 1; i >= n1; i --)
if (SA[i] >= 0) SA[j -- ] = SA[i];
int *SA1; int *s1;
SA1 = SA; s1 = SA + n - n1;
if (name < n1)
SAIS((unsigned char *) s1, SA1, n1, name - 1, sizeof(int));
else
for (i = 0; i < n1; i ++) SA1[s1[i]] = i;
bkt = (int *) malloc(sizeof(int) * (K + 1));
getBuckets(s, bkt, n, K, cs, true);
for (i = 1, j = 0; i < n; i ++)
if (isLMS(i)) s1[j ++] = i;
for (i = 0; i < n1; i ++) SA1[i] = s1[SA1[i]];
for (i = n1; i < n; i ++) SA[i] = -1;
for (i = n1 - 1; i >= 0; i --) {
j = SA[i]; SA[i] = -1;
SA[-- bkt[chr(j)]] = j;
}
induceSAl(t, SA, s, bkt, n, K, cs);
induceSAs(t, SA, s, bkt, n, K, cs);
free(bkt); free(t);
}
void GetHeight(unsigned char *s, int *SA, int *Height, int n, int cs)
{
int i;
int *r = (int *) malloc(sizeof(int) * n); // rank in SA
for (i = 0; i < n; i ++) r[SA[i]] = i;
int h = 0;
for (i = 0; i < n; i ++)
if (r[i] > 0) {
int j = SA[r[i] - 1];
while (chr(i + h) == chr(j + h)) h ++;
Height[r[i] - 1] = h;
if (h > 0) h --;
}
Height[n - 1] = 0;
return;
}
void ChildTable(int *Height, int *Up, int *Down, int n)
{
int i;
int *stack = (int *) malloc(sizeof(int));
int size;
int last = -1;
for (i = 0; i < n; i ++) { Up[i] = Down[i] = -1; }
size = 1;
stack[size - 1] = 0;
for (i = 1; i < n; i ++) {
while (Height[i] < Height[stack[size - 1]]) {
last = stack[size - 1];
size --;
if ((Height[i] <= Height[stack[size - 1]]) && (Height[stack[size - 1]] != Height[last]))
Down[stack[size - 1]] = last;
}
if (last != -1) {
Up[i] = last;
last = -1;
}
size ++;
stack = (int *) realloc(stack, sizeof(int) * size);
stack[size - 1] = i;
}
free(stack);
return;
}
void NextIndex(int *Height, int *Next, int n)
{
int i;
int last;
int *stack = (int *) malloc(sizeof(int));
int size = 0;
for (i = 0; i < n; i ++) { Next[i] = -1; }
size = 1;
stack[size - 1] = 0;
for (i = 1; i < n; i ++) {
while (Height[i] < Height[stack[size - 1]]) size --;
if (Height[i] == Height[stack[size - 1]]) {
last = stack[size - 1];
size --;
Next[last] = i;
}
size ++;
stack = (int *) realloc(stack, sizeof(int) * size);
stack[size - 1] = i;
}
free(stack);
return;
}
int nDocs;
int *nPos2Doc;
int GetInterval(int *input, int *SA, int *BWT, int *height, int *up, int *down, int *next, int n1, int n2)
{
int k;
char sStr[MAXLEN];
if (n1 != n2) {
int i, j, k2;
if ((n1 <= up[n2]) && (up[n2] < n2))
i = up[n2];
else
i = down[n1 - 1];
k2 = BWT[n1];
k = GetInterval(input, SA, BWT, height, up, down, next, n1, i);
if ((k == 0) || (k != k2)) k = 0;
j = next[i];
while (j != -1) {
k2 = k;
k = GetInterval(input, SA, BWT, height, up, down, next, i + 1, j);
if ((k == 0) || (k != k2)) k = 0;
i = j;
j = next[i];
}
k2 = k;
k = GetInterval(input, SA, BWT, height, up, down, next, i + 1, n2);
if ((k == 0) || (k != k2)) k = 0;
if (k == 0) {
int l = height[n1];
i = n1 + 1;
while (i < n2) {
if (height[i] < l) l = height[i];
i ++;
}
for (i = n1; i <= n2; i ++) {
/*
printf("%d\t%d\t", nPos2Doc[SA[i]], SA[i]);
fprintUTF8(stdout, &(input[SA[n1]]), l);
printf("\n");
*/
memset(sStr, 0, MAXLEN);
if (! getUTF8str(sStr, &(input[SA[n1]]), l))
printf("%d\t%d\t%s\n", nPos2Doc[SA[i]], SA[i], sStr);
}
}
} else {
k = BWT[n1];
}
return k;
}
int main()
{
char sBuff[BUFFSIZE];
int nCharSize;
int nCount;
int nLen;
int *nInput;
int *nSA;
int *nBWT;
int *nHeight;
int *nUp;
int *nDown;
int *nNext;
nLen = 0;
nInput = (int *) malloc(sizeof(int));
nDocs = 0;
nPos2Doc = (int *) malloc(sizeof(int));
while (! feof(stdin)) {
memset(sBuff, 0, BUFFSIZE);
fgets(sBuff, BUFFSIZE - 1, stdin);
nCount = 0;
while (sBuff[nCount] != 0 && sBuff[nCount] != '\r' && sBuff[nCount] != '\n') {
nCharSize = getUTF8CharSize(sBuff[nCount]);
if (nCharSize > 3) { fprintf(stderr, "a character of length more than 3 bytes\n"); exit(1); }
else if (nCharSize > 0) {
nLen ++;
nInput = (int *) realloc(nInput, sizeof(int) * nLen);
nInput[nLen - 1] = getUTF8CharInt(&(sBuff[nCount]));
nCount += nCharSize;
nPos2Doc = (int *) realloc(nPos2Doc, sizeof(int) * nLen);
nPos2Doc[nLen - 1] = nDocs;
}
}
nLen ++;
nInput = (int *) realloc(nInput, sizeof(int) * nLen);
nInput[nLen - 1] = SEPCHAR;
nCount ++;
nPos2Doc = (int *) realloc(nPos2Doc, sizeof(int) * nLen);
nPos2Doc[nLen - 1] = nDocs;
nDocs ++;
}
nLen ++;
nInput = (int *) realloc(nInput, sizeof(int) * nLen);
nInput[nLen - 1] = 0;
nSA = (int *) malloc(sizeof(int) * nLen);
SAIS((unsigned char *) nInput, nSA, nLen, 0xffff, sizeof(int));
nBWT = (int *) malloc(sizeof(int) * nLen);
for (nCount = 0; nCount < nLen; nCount ++)
if (nSA[nCount] > 0) nBWT[nCount] = nInput[nSA[nCount] - 1];
else nBWT[nCount] = nInput[nLen - 1];
nHeight = (int *) malloc(sizeof(int) * nLen);
GetHeight((unsigned char *) nInput, nSA, nHeight, nLen, sizeof(int));
nUp = (int *) malloc(sizeof(int) * nLen);
nDown = (int *) malloc(sizeof(int) * nLen);
nNext = (int *) malloc(sizeof(int) * nLen);
ChildTable(nHeight, nUp, nDown, nLen);
NextIndex(nHeight, nNext, nLen);
/*
for (nCount = 0; nCount < nLen; nCount ++) {
char sChar[6];
memset(sChar, 0, 6);
getUTF8IntChar(nBWT[nCount], sChar);
printf("%d\t%d\t%s\t%d\t%d\t%d\t%d\t", nCount, nSA[nCount], sChar,
nHeight[nCount], nUp[nCount], nDown[nCount], nNext[nCount]);
fprintUTF8(stdout, &(nInput[nSA[nCount]]), 20);
printf("\n");
}
*/
nCount = 1;
do {
int nCount2 = nCount + 1;
while ((nCount2 < nLen) && (nInput[nSA[nCount]] == nInput[nSA[nCount2]]))
nCount2 ++;
GetInterval(nInput, nSA, nBWT, nHeight, nUp, nDown, nNext, nCount, nCount2 - 1);
nCount = nCount2;
} while (nCount < nLen);
return 0;
}
@tomonari-masada
Copy link
Author

Input format:
Each document in each line.

Output format:
[doc id] [position] [maximal substring]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment