Last active
February 13, 2021 22:19
-
-
Save gui11aume/b6b941f8e61a98d1b304dab8322cf319 to your computer and use it in GitHub Desktop.
Learn Burrows-Wheeler indexing (compression)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
struct occ_t { uint32_t bits; uint32_t smpl; }; | |
typedef struct occ_t occ_t; | |
#define L 14 // Length of the text. | |
#define L4 ((L+3) / 4 ) // Length down-sampled 4 times. | |
#define L16 ((L+15) / 16) // Length down-sampled 16 times. | |
#define L32 ((L+31) / 32) // Length down-sampled 32 times. | |
// Global variables. | |
char TXT[L] = "GATGCGAGAGATG$"; | |
int SA[L] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13}; | |
int CSA[L16] = {0}; | |
char BWT[L4] = {0}; | |
int C[4] = {0}; | |
occ_t OCC[4][L32] = {0}; | |
int po$; // Position of the terminator in BWT. | |
int NUM[256] = { | |
['A'] = 0, ['C'] = 1, ['G'] = 2, ['T'] = 3, | |
['a'] = 0, ['c'] = 1, ['g'] = 2, ['t'] = 3, | |
}; | |
int compare_suffixes (const void * a, const void * b) { | |
const char * suff_a = &TXT[*(int *)a]; | |
const char * suff_b = &TXT[*(int *)b]; | |
return strcmp(suff_a, suff_b); | |
} | |
void construct_BWT (void) { | |
char tmp = 0; | |
for (int i = 0 ; i < L ; i++) { | |
if (SA[i] > 0) tmp |= (NUM[TXT[SA[i]-1]] << (2 * (i % 4))); | |
else po$ = i; | |
if (i % 4 == 3) { | |
BWT[i/4] = tmp; // Write 4 characters per byte. | |
tmp = 0; | |
} | |
} | |
if (L % 4 != 3) BWT[L4] = tmp; // Write the last byte if needed. | |
} | |
int get_BWT (int pos) { | |
if (pos == po$) return -1; | |
return (BWT[pos/4] >> (2 * (pos % 4))) & 0b11; | |
} | |
void construct_C_and_OCC (void) { | |
occ_t entry[4] = {0}; | |
for (int i = 0 ; i < L ; i++) { | |
if (po$ != i) { | |
entry[get_BWT(i)].smpl++; | |
entry[get_BWT(i)].bits |= (1 << (31 - (i % 32))); | |
} | |
if (i % 32 == 31) { | |
for (int j = 0 ; j < 4 ; j++) { | |
OCC[j][i/32] = entry[j]; // Write entry to 'OCC'. | |
entry[j].bits = 0; | |
} | |
} | |
} | |
if (L % 32 != 31) { // Write the last entry if needed. | |
for (int j = 0 ; j < 4 ; j++) OCC[j][L32-1] = entry[j]; | |
} | |
int tmp = 1; | |
for (int j = 0 ; j < 4 ; j++) { | |
tmp += entry[j].smpl; | |
C[j] = tmp - entry[j].smpl; | |
} | |
} | |
void compress_SA (void) { | |
for (int i = 0 ; i <= L ; i += 16) { | |
CSA[i/16] = SA[i]; | |
} | |
} | |
int get_rank (int c, int pos) { | |
return C[c] + OCC[c][pos/32].smpl - | |
__builtin_popcount(OCC[c][pos/32].bits << (1 + (pos % 32))); | |
} | |
int query_CSA (int pos) { | |
if (pos % 16 == 0) return CSA[pos/16]; | |
if (pos == po$) return 0; | |
return 1 + query_CSA(get_rank(get_BWT(pos),pos-1)); | |
} | |
void backward_search (char * query) { | |
int bot = 0; | |
int top = L-1; | |
for (int pos = strlen(query)-1 ; pos > -1 ; pos--) { | |
int c = NUM[query[pos]]; | |
bot = bot > 0 ? get_rank(c, bot-1): C[c]; | |
top = get_rank(c, top)-1; | |
if (top < bot) break; | |
} | |
for (int i = bot ; i <= top ; i++) { | |
printf("%s:%d\n", query, query_CSA(i)); | |
} | |
} | |
int main(void) { | |
qsort(SA, L, sizeof(int), compare_suffixes); | |
construct_BWT(); | |
construct_C_and_OCC(); | |
compress_SA(); // Can delete 'SA' from here. | |
backward_search("GAGA"); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
See blog post commenting the code at the following link.
http://blog.thegrandlocus.com/2017/05/a-tutorial-on-burrows-wheeler-indexing-methods-3
Compile with
gcc -std=c99 -g -O0 learn_bwt_indexing_compression.c -o run
and execute with./run
.Start debug session with
gdb run
. In the session typeb main
and thenr
.