Skip to content

Instantly share code, notes, and snippets.

@almer-t
Created February 19, 2021 15:48
Show Gist options
  • Save almer-t/58fd749b230c6b90b6054930fb8b519a to your computer and use it in GitHub Desktop.
Save almer-t/58fd749b230c6b90b6054930fb8b519a to your computer and use it in GitHub Desktop.
Conceptual/Didactic Huffman Encoding/Decoding
#!/usr/bin/env python3
#
# Huffman.py: A basic (didactic) implementation of Huffman encoding.
#
# Copyright (C) 2021 Almer S. Tigelaar
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, under version 3 of the License.
#
from collections import defaultdict
import heapq
import textwrap
import itertools
class Node:
"""Represents a Node in the Huffman Tree.
Includes support for comparing for usage in the heap.
"""
def __init__(self, label, value, left=None, right=None):
"""Create a new Node.
Args:
label: A label for this node (used for output/decoding)
value: The count for this node.
left: Left subtree of type Node (or None).
right: Right subtree of type Node (or None).
"""
self.label = label
self.value = value
self.left = left
self.right = right
def __eq__(self, other):
return self.value == other.value
def __lt__(self, other):
return self.value < other.value
class HuffmanEncoderDecoder:
"""Encodes and Decodes messages using Static Huffman Encoding."""
def _get_counts(self, text):
"""Returns the count for each unique character in text.
Args:
text: The text to examine.
Returns:
A dictionary with characters as keys and counts as values.
For example: { 'a': 10, 'b': 5, ... }
"""
assert text
counts = defaultdict(int)
for c in text:
counts[c] += 1
return counts
def _transform_to_heap(self, counts):
"""Given a dictionary with counts, creates childless nodes and inserts them in a heap.
Args:
counts: A dictionary with characters as keys and counts as values.
Returns:
A heap with Node elements that can be manipulated with heapq calls.
"""
assert counts
heap = [ Node(k, v) for k, v in counts.items() ]
heapq.heapify(heap)
return heap
def _heap_to_encoding_tree(self, heap):
"""Transforms a heap with Nodes into a tree by building
a bottom-up style Huffman tree.
Args:
heap: The with nodes to transform to a tree.
Returns:
The root element of the tree.
"""
assert heap
while len(heap) > 1:
a = heapq.heappop(heap)
b = heapq.heappop(heap)
parent = Node(None, a.value + b.value, a, b)
heapq.heappush(heap, parent)
return heap[0]
def _print_tree(self, tree, prefix='', level=0):
"""Recursively prints a tree of Nodes.
Args:
tree: The current node to print.
prefix: Any prefix no pass on and output.
level: The current level (depth) in the tree.
"""
# Output
if not tree.label:
label = "[*]"
elif tree.label == ' ':
label = "[SPC]"
else:
label = tree.label
if not prefix:
text = "[R] {}".format(label, tree.value)
else:
text = "{} {} = {}".format(label, tree.value, prefix)
print(textwrap.indent(text, ' ' * level))
# Recurse
if tree.left:
self._print_tree(tree.left, prefix + '0', level + 1)
if tree.right:
self._print_tree(tree.right, prefix + '1', level + 1)
def _build_encoding_table(self, node, prefix=[]):
"""Recursively transform a tree into an encoding table.
Args:
node: The current node to consider.
prefix: The current prefix of 0s/1s (expresses the position in the tree).
Returns:
A dictionary with a mapping from character to a code (list of 0s/1s).
Only produces this output with hitting leaf nodes in the tree.
"""
assert node
table = {}
if not node.left and not node.right:
table[node.label] = prefix
else:
if node.left:
table.update(self._build_encoding_table(node.left, prefix + [ 0 ]))
if node.right:
table.update(self._build_encoding_table(node.right, prefix + [ 1 ]))
return table
def _encode_bits(self, text, tree):
"""Given a text and an encoding tree, this Huffman encodes the text.
Args:
text: The text to encode.
tree: The root of the encoding tree.
Returns:
Text encoded to bits using tree as a list of 0s and 1s.
"""
assert text
assert tree
result = []
table = self._build_encoding_table(tree)
for character in text:
result.append(table[character])
return list(itertools.chain(*result))
def _decode_bits(self, bits, tree):
"""Decodes a series of bits into the original symbols given a Huffman tree.
Args:
bits: The bit sequence to decode (list of 0s and 1s).
tree: The root of the tree to use for decoding.
"""
output = []
current = tree
for b in bits:
if b == 0:
current = current.left
else:
current = current.right
if current.label:
output.append(current.label)
current = tree
return "".join(output)
def encode(self, text):
# Get the counts and convert this into a tree
counts = self._get_counts(text)
heap = self._transform_to_heap(counts)
root = self._heap_to_encoding_tree(heap)
# Encode the message
bits = self._encode_bits(text, root)
return bits, root, counts
def decode(self, bits, tree):
# No pre-work needed, we can simply invoke _decode_bits
return self._decode_bits(bits, tree)
class HuffmanTester:
"""Simple inspection-based tester for HuffmanEncoderDecoder."""
def test(self):
# Input text
text = "An optimum method of coding an ensemble of messages consisting of a finite number of members is developed. A minimum-redudancy code is one constructed in such a way that the average number of coding digits per message is minimized."
text = text.lower()
# Encode and print
coder = HuffmanEncoderDecoder()
bits, tree, counts = coder.encode(text)
# Print counts and the tree and coding table for inspection
# This redoes some work, but that's okay for the purposes of this script.
debug = True
if debug:
print("Counts:")
print(list(sorted([(v, k) for k, v in counts.items()])))
print("\nTree:")
coder._print_tree(tree)
print("\nCoding Table:")
print(coder._build_encoding_table(tree))
# Output encoded form
print("\nEncoded:")
print(bits)
# Decode and print
print("\nDecoded:")
print(coder.decode(bits, tree))
# Run the HuffmanTester
ht = HuffmanTester()
ht.test()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment