Skip to content

Instantly share code, notes, and snippets.

@gui11aume
Last active February 13, 2021 22:19
Show Gist options
  • Save gui11aume/b6b941f8e61a98d1b304dab8322cf319 to your computer and use it in GitHub Desktop.
Save gui11aume/b6b941f8e61a98d1b304dab8322cf319 to your computer and use it in GitHub Desktop.
Learn Burrows-Wheeler indexing (compression)
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
struct occ_t { uint32_t bits; uint32_t smpl; };
typedef struct occ_t occ_t;
#define L 14 // Length of the text.
#define L4 ((L+3) / 4 ) // Length down-sampled 4 times.
#define L16 ((L+15) / 16) // Length down-sampled 16 times.
#define L32 ((L+31) / 32) // Length down-sampled 32 times.
// Global variables.
char TXT[L] = "GATGCGAGAGATG$";
int SA[L] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13};
int CSA[L16] = {0};
char BWT[L4] = {0};
int C[4] = {0};
occ_t OCC[4][L32] = {0};
int po$; // Position of the terminator in BWT.
int NUM[256] = {
['A'] = 0, ['C'] = 1, ['G'] = 2, ['T'] = 3,
['a'] = 0, ['c'] = 1, ['g'] = 2, ['t'] = 3,
};
int compare_suffixes (const void * a, const void * b) {
const char * suff_a = &TXT[*(int *)a];
const char * suff_b = &TXT[*(int *)b];
return strcmp(suff_a, suff_b);
}
void construct_BWT (void) {
char tmp = 0;
for (int i = 0 ; i < L ; i++) {
if (SA[i] > 0) tmp |= (NUM[TXT[SA[i]-1]] << (2 * (i % 4)));
else po$ = i;
if (i % 4 == 3) {
BWT[i/4] = tmp; // Write 4 characters per byte.
tmp = 0;
}
}
if (L % 4 != 3) BWT[L4] = tmp; // Write the last byte if needed.
}
int get_BWT (int pos) {
if (pos == po$) return -1;
return (BWT[pos/4] >> (2 * (pos % 4))) & 0b11;
}
void construct_C_and_OCC (void) {
occ_t entry[4] = {0};
for (int i = 0 ; i < L ; i++) {
if (po$ != i) {
entry[get_BWT(i)].smpl++;
entry[get_BWT(i)].bits |= (1 << (31 - (i % 32)));
}
if (i % 32 == 31) {
for (int j = 0 ; j < 4 ; j++) {
OCC[j][i/32] = entry[j]; // Write entry to 'OCC'.
entry[j].bits = 0;
}
}
}
if (L % 32 != 31) { // Write the last entry if needed.
for (int j = 0 ; j < 4 ; j++) OCC[j][L32-1] = entry[j];
}
int tmp = 1;
for (int j = 0 ; j < 4 ; j++) {
tmp += entry[j].smpl;
C[j] = tmp - entry[j].smpl;
}
}
void compress_SA (void) {
for (int i = 0 ; i <= L ; i += 16) {
CSA[i/16] = SA[i];
}
}
int get_rank (int c, int pos) {
return C[c] + OCC[c][pos/32].smpl -
__builtin_popcount(OCC[c][pos/32].bits << (1 + (pos % 32)));
}
int query_CSA (int pos) {
if (pos % 16 == 0) return CSA[pos/16];
if (pos == po$) return 0;
return 1 + query_CSA(get_rank(get_BWT(pos),pos-1));
}
void backward_search (char * query) {
int bot = 0;
int top = L-1;
for (int pos = strlen(query)-1 ; pos > -1 ; pos--) {
int c = NUM[query[pos]];
bot = bot > 0 ? get_rank(c, bot-1): C[c];
top = get_rank(c, top)-1;
if (top < bot) break;
}
for (int i = bot ; i <= top ; i++) {
printf("%s:%d\n", query, query_CSA(i));
}
}
int main(void) {
qsort(SA, L, sizeof(int), compare_suffixes);
construct_BWT();
construct_C_and_OCC();
compress_SA(); // Can delete 'SA' from here.
backward_search("GAGA");
}
@gui11aume
Copy link
Author

gui11aume commented May 14, 2017

See blog post commenting the code at the following link.
http://blog.thegrandlocus.com/2017/05/a-tutorial-on-burrows-wheeler-indexing-methods-3

Compile with gcc -std=c99 -g -O0 learn_bwt_indexing_compression.c -o run and execute with ./run.
Start debug session with gdb run. In the session type b main and then r.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment