Skip to content

Instantly share code, notes, and snippets.

@dorianim
Last active November 30, 2022 12:24
Show Gist options
  • Save dorianim/f2d61e9ae836dd073c0162547667282d to your computer and use it in GitHub Desktop.
Save dorianim/f2d61e9ae836dd073c0162547667282d to your computer and use it in GitHub Desktop.
Super simple Huffman text compression in Python and c

Huffman from Python to C

This can be used to encode a string using huffman compression in Python and decode it in Python or c.

class HuffmanTree:
def __init__(self, char: str, count: int, left=None, right=None):
self.char = char
self.count = count
self.left: HuffmanTree | None = left
self.rigth: HuffmanTree | None = right
def __repr__(self) -> str:
return f"Node('{self.char}'({ord(self.char)}), {self.count})"
def as_byte_list(self):
if self.char == "\x00":
return [ord(self.char)] + self.left.as_byte_list() + self.rigth.as_byte_list()
return [ord(self.char)]
@staticmethod
def __count_chars(string: str) -> dict:
counts = {}
for char in string:
if char in counts:
counts[char] += 1
else:
counts[char] = 1
return counts
@staticmethod
def from_string(string: str):
char_counts = HuffmanTree.__count_chars(string)
nodes = []
for char in char_counts:
nodes.append(HuffmanTree(char, char_counts[char]))
while len(nodes) > 1:
nodes.sort(key=lambda node: node.count, reverse=True)
right = nodes.pop()
left = nodes.pop()
parent_node = HuffmanTree("\x00",
left.count + right.count, left, right)
nodes.append(parent_node)
return nodes[0]
def __calculate_lut(tree: HuffmanTree, dict={}, val=[]) -> dict:
if tree.char != "\x00":
dict[tree.char] = val
else:
dict = __calculate_lut(tree.left, dict, val + [0b1])
dict = __calculate_lut(tree.rigth, dict, val + [0b0])
return dict
def __group_bits_as_bytes(bits: list) -> list:
byte_list = []
while len(bits) > 0:
byte = 0b0
for _ in range(8):
byte = (byte << 1)
if len(bits) > 0:
byte |= bits.pop(0)
byte_list.append(byte)
return byte_list
def encode(string) -> tuple:
"""Encode a string as a huffman compressed list of bytes
The string must not conatain \x00 and \x01!
Args:
string (str): the string to encode
Returns:
tuple: the list and the HuffmanTree
"""
if "\x00" in string or "\x01" in string:
raise Exception("string must not conatain \\x00 or \\x01!")
string += "\x01"
tree = HuffmanTree.from_string(string)
lut = __calculate_lut(tree)
encoded = []
for char in string:
encoded += lut[char]
encoded = __group_bits_as_bytes(encoded)
return encoded, tree
def __find_char(tree: HuffmanTree, encoded: list, current_index: list) -> str:
# the current index is a list with one element, so it acts like a pointer
if tree.char != "\x00":
return tree.char
current_bit = (encoded[int(current_index[0] / 8)] >>
(7 - (current_index[0] % 8))) & 1
current_index[0] += 1
if current_bit == 1:
tree = tree.left
else:
tree = tree.rigth
return __find_char(tree, encoded, current_index)
def decode(encoded: list, tree: HuffmanTree) -> str:
"""Decode a huffman compressed list of bytes to a string
Args:
code (list): the encoded list
tree (HuffmanTree): the tree
Returns:
str: the decoded string
"""
string = ""
current_index = [0]
while True:
c = __find_char(tree, encoded, current_index)
if c == "\x01":
break
string += c
return string
def byte_list_to_c(name: str, bytes: list) -> str:
# remove last comma
headerString = "const uint8_t " + name + "[] = {\n "
counter = 0
for byte in bytes:
headerString += f"{hex(byte).zfill(2)}, "
counter += 1
if counter % 16 == 0:
headerString += "\n "
# remove last comma
headerString = headerString[:-2]
headerString += "\n};"
return headerString
if __name__ == "__main__":
string = "Hello World!"
encoded, tree = encode(string)
print(byte_list_to_c("huffman_tree", tree.as_byte_list()))
print(byte_list_to_c("encoded_string", encoded))
decoded = decode(encoded, tree)
assert decoded == string
print(
f"Without compression: {len(string)}, with compression: {len(encoded)}, gain: {int((1 - len(encoded) / len(string)) * 100)}%")
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
const uint8_t huffman_tree[] = {
0x0, 0x0, 0x0, 0x0, 0x20, 0x57, 0x0, 0x48, 0x65, 0x0, 0x0, 0x21, 0x1, 0x0, 0x72, 0x64,
0x0, 0x6c, 0x6f};
const uint8_t encoded_string[] = {
0xdc, 0x53, 0xf8, 0x96, 0x2e, 0x80};
typedef struct Node
{
char c;
struct Node *p;
struct Node *l;
struct Node *r;
} Node;
Node *build_tree(const uint8_t *raw_tree)
{
Node *root = NULL;
Node *last_node = NULL;
int children_missing = 0;
do
{
Node *current_node = (Node *)malloc(sizeof(Node));
current_node->p = last_node;
current_node->c = *raw_tree;
raw_tree++;
if (current_node->c == 0)
{
children_missing += 2;
last_node = current_node;
}
if (root == NULL)
{
root = current_node;
continue;
}
if (current_node->p->l == NULL)
{
current_node->p->l = current_node;
}
else
{
current_node->p->r = current_node;
}
children_missing--;
while (last_node != NULL && last_node->r != NULL)
{
last_node = last_node->p;
}
} while (children_missing > 0);
return root;
}
void print_tree(Node *tree, int level)
{
if (tree == NULL)
return;
level += 5;
print_tree(tree->r, level);
for (int i = 5; i < level; i++)
printf(" ");
if (tree->c == 0)
printf("<\n");
else if (tree->c == ' ')
printf("␣\n");
else
printf("%c\n", tree->c);
print_tree(tree->l, level);
}
char find_char(Node *tree, const uint8_t encoded[], size_t *current_index)
{
if (tree->c != 0)
return tree->c;
uint8_t current_bit = (encoded[*current_index / 8] >> (7 - (*current_index % 8))) & 1;
(*current_index)++;
if (current_bit == 1)
tree = tree->l;
else
tree = tree->r;
return find_char(tree, encoded, current_index);
}
char *decode(Node *tree, const uint8_t encoded[])
{
char *string = (char *)malloc(sizeof(char));
size_t current_length = 1;
size_t current_index = 0;
for (;;)
{
char c = find_char(tree, encoded, &current_index);
if (c == 1)
{
string[current_length - 1] = 0;
break;
}
string[current_length - 1] = c;
string = (char *)realloc((void *)string, sizeof(char) * current_length++);
}
return string;
}
int main(int argc, char **argv)
{
Node *tree = build_tree(huffman_tree);
char *string = decode(tree, encoded_string);
printf("%s\n", string);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment