Last active
July 12, 2019 16:57
-
-
Save twonds/8051854955d03833a232b385cca2fae7 to your computer and use it in GitHub Desktop.
Code that compresses data using variable length encoding: https://www.techiedelight.com/huffman-coding/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import bisect | |
from collections import defaultdict | |
import sys | |
class HuffmanNode: | |
""" | |
A node class for encoding data. It provides a way to created a weighted tree | |
and then produced an encoded string from the tree. | |
""" | |
def __init__(self, weight, character=""): | |
self.weight = weight | |
self.character = character | |
self.zero_node = None | |
self.one_node = None | |
self.value = "" | |
def __eq__(self, other): | |
return self.character == other.character and self.weight == other.weight | |
def __ne__(self, other): | |
return not self == other | |
def __gt__(self, other): | |
return (self.weight, self.character) > (other.weight, other.character) | |
def __lt__(self, other): | |
return (self.weight, self.character) < (other.weight, other.character) | |
def __ge__(self, other): | |
return (self.weight, self.character) >= (other.weight, other.character) | |
def __le__(self, other): | |
return (self.weight, self.character) <= (other.weight, other.character) | |
def __repr__(self): | |
return "HuffmanNode: {} {}".format(self.character, self.weight) | |
def encode_str(self, value, code_map): | |
if self.zero_node: | |
self.zero_node.encode_str(value+"0", code_map) | |
if self.one_node: | |
self.one_node.encode_str(value+"1", code_map) | |
if self.zero_node is None and self.one_node is None: | |
self.value = value | |
code_map[self.character] = value | |
class Huffman: | |
def __init__(self): | |
self.character_map = defaultdict(int) | |
self.node = None | |
self.encoded_string = None | |
self.original_string = None | |
self.decoded_string = None | |
self.code_map = {} | |
self.decode_map = {} | |
def get_encoded_string(self): | |
self.node.encode_str("", self.code_map) | |
self.encoded_string = "" | |
for c in self.original_string: | |
self.encoded_string += self.code_map[c] | |
return self.encoded_string | |
def encode(self, decoded_string): | |
""" | |
Encode bytes | |
""" | |
self.original_string = decoded_string | |
# find frequency | |
for c in decoded_string: | |
self.character_map[c] += 1 | |
# create priority queue, use list for now | |
queue = sorted((HuffmanNode(val, c) for c, val in self.character_map.items())) | |
while queue: | |
right = queue.pop(0) | |
left = queue.pop(0) | |
new_weight = left.weight + right.weight | |
new_node = HuffmanNode(new_weight) | |
new_node.zero_node = left | |
new_node.one_node = right | |
if len(queue) > 1: | |
# We could do our own insert into a sorted list | |
bisect.insort(queue, new_node) | |
else: | |
left_node = queue.pop() | |
self.node = HuffmanNode(left_node.weight+new_node.weight) | |
self.node.zero_node = left_node | |
self.node.one_node = new_node | |
break | |
return self.get_encoded_string() | |
def get_decoded_chr(self, node, encoded_string): | |
if node is None: | |
return | |
if node.zero_node is None and node.one_node is None: | |
self.decoded_string += node.character | |
return | |
self._index += 1 | |
if encoded_string[self._index] == '0': | |
self.get_decoded_chr(node.zero_node, encoded_string) | |
else: | |
self.get_decoded_chr(node.one_node, encoded_string) | |
def decode(self, encoded_string): | |
""" | |
Given an huffman encoded string decode it back to its original. | |
""" | |
self.decoded_string = "" | |
# Have to have a tree created | |
str_len = len(encoded_string) | |
self._index = -1 | |
while self._index < str_len - 2: | |
self.get_decoded_chr(self.node, encoded_string) | |
return self.decoded_string | |
if __name__ == "__main__": | |
hf = Huffman() | |
input_string = input() | |
encoded_string = hf.encode(input_string) | |
decoded_string = hf.decode(encoded_string) | |
print(encoded_string) | |
print(int(encoded_string, 2)) | |
print("0x%x" % int(encoded_string, 2)) | |
print(decoded_string) | |
print(sys.getsizeof(int(encoded_string, 2)), sys.getsizeof(decoded_string)) | |
assert(input_string == decoded_string) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
flake8 | |
nose |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unittest | |
import huffman | |
class HuffmanEncodingTestCase(unittest.TestCase): | |
def test_encode(self): | |
hf = huffman.Huffman() | |
encoded_string = hf.encode("AAAABBBCCCDDE") | |
self.assertIsNotNone(encoded_string) | |
def test_decode(self): | |
hf = huffman.Huffman() | |
input_string = "Can I compress this? AAAABBBCCCDDE" | |
encoded_string = hf.encode(input_string) | |
decoded_string = hf.decode(encoded_string) | |
self.assertEqual(input_string, decoded_string) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment