-
-
Save almer-t/58fd749b230c6b90b6054930fb8b519a to your computer and use it in GitHub Desktop.
Conceptual/Didactic Huffman Encoding/Decoding
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# | |
# Huffman.py: A basic (didactic) implementation of Huffman encoding. | |
# | |
# Copyright (C) 2021 Almer S. Tigelaar | |
# | |
# This program is free software: you can redistribute it and/or modify | |
# it under the terms of the GNU General Public License as published by | |
# the Free Software Foundation, under version 3 of the License. | |
# | |
from collections import defaultdict | |
import heapq | |
import textwrap | |
import itertools | |
class Node: | |
"""Represents a Node in the Huffman Tree. | |
Includes support for comparing for usage in the heap. | |
""" | |
def __init__(self, label, value, left=None, right=None): | |
"""Create a new Node. | |
Args: | |
label: A label for this node (used for output/decoding) | |
value: The count for this node. | |
left: Left subtree of type Node (or None). | |
right: Right subtree of type Node (or None). | |
""" | |
self.label = label | |
self.value = value | |
self.left = left | |
self.right = right | |
def __eq__(self, other): | |
return self.value == other.value | |
def __lt__(self, other): | |
return self.value < other.value | |
class HuffmanEncoderDecoder: | |
"""Encodes and Decodes messages using Static Huffman Encoding.""" | |
def _get_counts(self, text): | |
"""Returns the count for each unique character in text. | |
Args: | |
text: The text to examine. | |
Returns: | |
A dictionary with characters as keys and counts as values. | |
For example: { 'a': 10, 'b': 5, ... } | |
""" | |
assert text | |
counts = defaultdict(int) | |
for c in text: | |
counts[c] += 1 | |
return counts | |
def _transform_to_heap(self, counts): | |
"""Given a dictionary with counts, creates childless nodes and inserts them in a heap. | |
Args: | |
counts: A dictionary with characters as keys and counts as values. | |
Returns: | |
A heap with Node elements that can be manipulated with heapq calls. | |
""" | |
assert counts | |
heap = [ Node(k, v) for k, v in counts.items() ] | |
heapq.heapify(heap) | |
return heap | |
def _heap_to_encoding_tree(self, heap): | |
"""Transforms a heap with Nodes into a tree by building | |
a bottom-up style Huffman tree. | |
Args: | |
heap: The with nodes to transform to a tree. | |
Returns: | |
The root element of the tree. | |
""" | |
assert heap | |
while len(heap) > 1: | |
a = heapq.heappop(heap) | |
b = heapq.heappop(heap) | |
parent = Node(None, a.value + b.value, a, b) | |
heapq.heappush(heap, parent) | |
return heap[0] | |
def _print_tree(self, tree, prefix='', level=0): | |
"""Recursively prints a tree of Nodes. | |
Args: | |
tree: The current node to print. | |
prefix: Any prefix no pass on and output. | |
level: The current level (depth) in the tree. | |
""" | |
# Output | |
if not tree.label: | |
label = "[*]" | |
elif tree.label == ' ': | |
label = "[SPC]" | |
else: | |
label = tree.label | |
if not prefix: | |
text = "[R] {}".format(label, tree.value) | |
else: | |
text = "{} {} = {}".format(label, tree.value, prefix) | |
print(textwrap.indent(text, ' ' * level)) | |
# Recurse | |
if tree.left: | |
self._print_tree(tree.left, prefix + '0', level + 1) | |
if tree.right: | |
self._print_tree(tree.right, prefix + '1', level + 1) | |
def _build_encoding_table(self, node, prefix=[]): | |
"""Recursively transform a tree into an encoding table. | |
Args: | |
node: The current node to consider. | |
prefix: The current prefix of 0s/1s (expresses the position in the tree). | |
Returns: | |
A dictionary with a mapping from character to a code (list of 0s/1s). | |
Only produces this output with hitting leaf nodes in the tree. | |
""" | |
assert node | |
table = {} | |
if not node.left and not node.right: | |
table[node.label] = prefix | |
else: | |
if node.left: | |
table.update(self._build_encoding_table(node.left, prefix + [ 0 ])) | |
if node.right: | |
table.update(self._build_encoding_table(node.right, prefix + [ 1 ])) | |
return table | |
def _encode_bits(self, text, tree): | |
"""Given a text and an encoding tree, this Huffman encodes the text. | |
Args: | |
text: The text to encode. | |
tree: The root of the encoding tree. | |
Returns: | |
Text encoded to bits using tree as a list of 0s and 1s. | |
""" | |
assert text | |
assert tree | |
result = [] | |
table = self._build_encoding_table(tree) | |
for character in text: | |
result.append(table[character]) | |
return list(itertools.chain(*result)) | |
def _decode_bits(self, bits, tree): | |
"""Decodes a series of bits into the original symbols given a Huffman tree. | |
Args: | |
bits: The bit sequence to decode (list of 0s and 1s). | |
tree: The root of the tree to use for decoding. | |
""" | |
output = [] | |
current = tree | |
for b in bits: | |
if b == 0: | |
current = current.left | |
else: | |
current = current.right | |
if current.label: | |
output.append(current.label) | |
current = tree | |
return "".join(output) | |
def encode(self, text): | |
# Get the counts and convert this into a tree | |
counts = self._get_counts(text) | |
heap = self._transform_to_heap(counts) | |
root = self._heap_to_encoding_tree(heap) | |
# Encode the message | |
bits = self._encode_bits(text, root) | |
return bits, root, counts | |
def decode(self, bits, tree): | |
# No pre-work needed, we can simply invoke _decode_bits | |
return self._decode_bits(bits, tree) | |
class HuffmanTester: | |
"""Simple inspection-based tester for HuffmanEncoderDecoder.""" | |
def test(self): | |
# Input text | |
text = "An optimum method of coding an ensemble of messages consisting of a finite number of members is developed. A minimum-redudancy code is one constructed in such a way that the average number of coding digits per message is minimized." | |
text = text.lower() | |
# Encode and print | |
coder = HuffmanEncoderDecoder() | |
bits, tree, counts = coder.encode(text) | |
# Print counts and the tree and coding table for inspection | |
# This redoes some work, but that's okay for the purposes of this script. | |
debug = True | |
if debug: | |
print("Counts:") | |
print(list(sorted([(v, k) for k, v in counts.items()]))) | |
print("\nTree:") | |
coder._print_tree(tree) | |
print("\nCoding Table:") | |
print(coder._build_encoding_table(tree)) | |
# Output encoded form | |
print("\nEncoded:") | |
print(bits) | |
# Decode and print | |
print("\nDecoded:") | |
print(coder.decode(bits, tree)) | |
# Run the HuffmanTester | |
ht = HuffmanTester() | |
ht.test() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment