Learn Burrows-Wheeler indexing (vanilla)
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#define L 14 // Length of the text. | |
// Global variables. | |
char TXT[L] = "GATGCGAGAGATG$"; | |
int SA[L] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13}; | |
char BWT[L] = {0}; | |
int C[4] = {0}; | |
int OCC[4][L] = {0}; | |
int NUM[256] = { | |
['A'] = 0, ['C'] = 1, ['G'] = 2, ['T'] = 3, | |
['a'] = 0, ['c'] = 1, ['g'] = 2, ['t'] = 3, | |
}; | |
int compare_suffixes (const void * a, const void * b) { | |
const char * suff_a = &TXT[*(int *)a]; | |
const char * suff_b = &TXT[*(int *)b]; | |
return strcmp(suff_a, suff_b); | |
} | |
void construct_BWT (void) { | |
for (int i = 0 ; i < L ; i++) { | |
if (SA[i] > 0) BWT[i] = TXT[SA[i]-1]; | |
else BWT[i] = '$'; | |
} | |
} | |
void construct_C_and_OCC (void) { | |
for (int i = 0 ; i < L ; i++) { | |
if (BWT[i] != '$') C[NUM[BWT[i]]]++; | |
for (int j = 0 ; j < 4 ; j++) OCC[j][i] = C[j]; | |
} | |
int tmp = 1; | |
for (int j = 0 ; j < 4 ; j++) { | |
tmp += C[j]; | |
C[j] = tmp - C[j]; | |
} | |
} | |
void backward_search (char * query) { | |
int bot = 0; | |
int top = L-1; | |
for (int pos = strlen(query)-1 ; pos > -1 ; pos--) { | |
int c = NUM[query[pos]]; | |
bot = C[c] + (bot > 0 ? OCC[c][bot-1] : 0); | |
top = C[c] + OCC[c][top]-1; | |
if (top < bot) break; | |
} | |
for (int i = bot ; i <= top ; i++) { | |
printf("%s: %d\n", query, SA[i]); | |
} | |
} | |
int main(void) { | |
qsort(SA, L, sizeof(int), compare_suffixes); | |
construct_BWT(); | |
construct_C_and_OCC(); | |
backward_search("GAGA"); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This comment has been minimized.
See blog post commenting the code at the following link.
http://blog.thegrandlocus.com/2017/05/a-tutorial-on-burrows-wheeler-indexing-methods-2
Compile with
gcc -std=c99 -g -O0 learn_bwt_indexing_vanilla.c -o run
and execute with./run
.Start debug session with
gdb run
. In the session typeb main
and thenr
.