Skip to content

Instantly share code, notes, and snippets.

@twonds
Last active July 12, 2019 16:57
Show Gist options
  • Save twonds/8051854955d03833a232b385cca2fae7 to your computer and use it in GitHub Desktop.
Save twonds/8051854955d03833a232b385cca2fae7 to your computer and use it in GitHub Desktop.
Code that compresses data using variable length encoding: https://www.techiedelight.com/huffman-coding/
import argparse
import bisect
from collections import defaultdict
import sys
class HuffmanNode:
"""
A node class for encoding data. It provides a way to created a weighted tree
and then produced an encoded string from the tree.
"""
def __init__(self, weight, character=""):
self.weight = weight
self.character = character
self.zero_node = None
self.one_node = None
self.value = ""
def __eq__(self, other):
return self.character == other.character and self.weight == other.weight
def __ne__(self, other):
return not self == other
def __gt__(self, other):
return (self.weight, self.character) > (other.weight, other.character)
def __lt__(self, other):
return (self.weight, self.character) < (other.weight, other.character)
def __ge__(self, other):
return (self.weight, self.character) >= (other.weight, other.character)
def __le__(self, other):
return (self.weight, self.character) <= (other.weight, other.character)
def __repr__(self):
return "HuffmanNode: {} {}".format(self.character, self.weight)
def encode_str(self, value, code_map):
if self.zero_node:
self.zero_node.encode_str(value+"0", code_map)
if self.one_node:
self.one_node.encode_str(value+"1", code_map)
if self.zero_node is None and self.one_node is None:
self.value = value
code_map[self.character] = value
class Huffman:
def __init__(self):
self.character_map = defaultdict(int)
self.node = None
self.encoded_string = None
self.original_string = None
self.decoded_string = None
self.code_map = {}
self.decode_map = {}
def get_encoded_string(self):
self.node.encode_str("", self.code_map)
self.encoded_string = ""
for c in self.original_string:
self.encoded_string += self.code_map[c]
return self.encoded_string
def encode(self, decoded_string):
"""
Encode bytes
"""
self.original_string = decoded_string
# find frequency
for c in decoded_string:
self.character_map[c] += 1
# create priority queue, use list for now
queue = sorted((HuffmanNode(val, c) for c, val in self.character_map.items()))
while queue:
right = queue.pop(0)
left = queue.pop(0)
new_weight = left.weight + right.weight
new_node = HuffmanNode(new_weight)
new_node.zero_node = left
new_node.one_node = right
if len(queue) > 1:
# We could do our own insert into a sorted list
bisect.insort(queue, new_node)
else:
left_node = queue.pop()
self.node = HuffmanNode(left_node.weight+new_node.weight)
self.node.zero_node = left_node
self.node.one_node = new_node
break
return self.get_encoded_string()
def get_decoded_chr(self, node, encoded_string):
if node is None:
return
if node.zero_node is None and node.one_node is None:
self.decoded_string += node.character
return
self._index += 1
if encoded_string[self._index] == '0':
self.get_decoded_chr(node.zero_node, encoded_string)
else:
self.get_decoded_chr(node.one_node, encoded_string)
def decode(self, encoded_string):
"""
Given an huffman encoded string decode it back to its original.
"""
self.decoded_string = ""
# Have to have a tree created
str_len = len(encoded_string)
self._index = -1
while self._index < str_len - 2:
self.get_decoded_chr(self.node, encoded_string)
return self.decoded_string
if __name__ == "__main__":
hf = Huffman()
input_string = input()
encoded_string = hf.encode(input_string)
decoded_string = hf.decode(encoded_string)
print(encoded_string)
print(int(encoded_string, 2))
print("0x%x" % int(encoded_string, 2))
print(decoded_string)
print(sys.getsizeof(int(encoded_string, 2)), sys.getsizeof(decoded_string))
assert(input_string == decoded_string)
import unittest
import huffman
class HuffmanEncodingTestCase(unittest.TestCase):
def test_encode(self):
hf = huffman.Huffman()
encoded_string = hf.encode("AAAABBBCCCDDE")
self.assertIsNotNone(encoded_string)
def test_decode(self):
hf = huffman.Huffman()
input_string = "Can I compress this? AAAABBBCCCDDE"
encoded_string = hf.encode(input_string)
decoded_string = hf.decode(encoded_string)
self.assertEqual(input_string, decoded_string)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment