Skip to content

Instantly share code, notes, and snippets.

@Wollw
Created March 1, 2012 04:54
Show Gist options
  • Save Wollw/1947371 to your computer and use it in GitHub Desktop.
Save Wollw/1947371 to your computer and use it in GitHub Desktop.
Wavelet Tree
/*
* An implementation of the Wavelet Tree data structure.
* It is similar to a binary search tree using bits.
*
* More info here:
* http://siganakis.com/challenge-design-a-data-structure-thats-small
* http://www.alexbowe.com/wavelet-trees
*/
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <string.h>
#include <assert.h>
#define SET_BIT(a, i) (a[i/(sizeof(*a)*CHAR_BIT)]\
|= 1 << (i%(sizeof(*a)*CHAR_BIT)))
#define CLR_BIT(a, i) (a[i/(sizeof(*a)*CHAR_BIT)]\
&= ~(1 << (i%(sizeof(*a)*CHAR_BIT))))
typedef struct wavelet_t {
unsigned char *table;
struct wavelet_t *left;
struct wavelet_t *right;
} wavelet_t;
wavelet_t * wavelet_encode(char *str, size_t len, char min, char max);
char * wavelet_decode(char *str, wavelet_t *w);
char wavelet_decode_index(wavelet_t *w, size_t i, char min, char max);
void wavelet_free(wavelet_t *w);
int main(int argc, char **argv) {
char *str = "Hello, world!";
if (argc > 1) {
str = argv[1];
}
wavelet_t *w = wavelet_encode(str, strlen(str)+1, 0, 127);
str = malloc(strlen(str)+1 * sizeof(char));
assert(str != NULL);
puts(wavelet_decode(str, w));
free(str);
wavelet_free(w);
return 0;
}
wavelet_t * wavelet_encode(char *str, size_t len, char min, char max) {
wavelet_t *w = malloc(sizeof(wavelet_t));
assert(w != NULL);
w->left = NULL;
w->right = NULL;
w->table = malloc(len * sizeof(char));
assert(w->table != NULL);
char *str_l, *str_r;
size_t len_l, len_r;
str_l = malloc(len * sizeof(char));
assert(str_l != NULL);
str_r = malloc(len * sizeof(char));
assert(str_r != NULL);
len_l = 0;
len_r = 0;
/*
* Each char in the allocated space can hold CHAR_BIT bits
* so for each character being encoded we divide by CHAR_BIT
* to get the current byte to store in. (i % CHAR_BIT) gives
* the offset into that byte we need to store the next bit at.
*
* The characters themselves are sorted into left and right char
* arrays that will be passed to the next call of this function
* to repeat until there are only two values to encode.
*/
size_t i;
char mid = min+(max-min)/2;
for (i = 0; i < len; i++) {
if (str[i] > mid) {
SET_BIT(w->table, i);
str_r[len_r++] = str[i];
} else {
CLR_BIT(w->table, i);
str_l[len_l++] = str[i];
}
}
if (max - min > 1) {
w->left = wavelet_encode(str_l, len_l, min, mid);
w->right = wavelet_encode(str_r, len_r, mid + 1, max);
}
free(str_l);
free(str_r);
return w;
}
char * wavelet_decode(char *str, wavelet_t *w) {
size_t i = 0;
while ((str[i] = wavelet_decode_index(w, i, 0, 127))) i++;
return str;
}
/*
* We keep track of the min and max values so that we can return the
* character code for the index being decoded based on it. I doubt this is
* the best way to do it but it's what I came up with.
*/
char wavelet_decode_index(wavelet_t *w, size_t i, char min, char max) {
char mid = min + (max-min)/2;
char bit = w->table[i/CHAR_BIT] & 1 << (i % CHAR_BIT) ? 1 : 0;
char b;
size_t j;
size_t k = 0;
for (j = 0; j < i; j++) {
b = w->table[j/CHAR_BIT] & 1 << (j % CHAR_BIT) ? 1 : 0;
k += b == bit ? 1 : 0;
}
w = bit ? w->right : w->left;
if (w == NULL)
return min + (bit ? 1 : 0);
else if (bit)
return wavelet_decode_index(w, k, mid+1, max);
else
return wavelet_decode_index(w, k, min, mid);
}
void wavelet_free(wavelet_t *w) {
if (w->left) {
wavelet_free(w->left);
w->left = NULL;
}
if (w->right) {
wavelet_free(w->right);
w->right = NULL;
}
free(w->table);
free(w);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment