Skip to content

Instantly share code, notes, and snippets.

@ryanwitt
Last active November 29, 2016 14:41
  • Star 1 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save ryanwitt/3167463 to your computer and use it in GitHub Desktop.
silly huffman coding example
__all__ = (
'frequency_histogram',
'endode_simple',
'decode_simple',
'huffman_tree',
'encode_character',
'encode_huffman',
'hist2dot',
'tree2dot',
)
phrase = 'A man a plan a canal Panama'.lower()
def frequency_histogram(phrase):
"""
>>> frequency_histogram(phrase)
[(10, 'a'), (6, ' '), (4, 'n'), (2, 'p'), (2, 'm'), (2, 'l'), (1, 'c')]
"""
d = {}
for c in phrase:
d[c] = d.get(c, 0) + 1
return list(reversed(sorted(zip(d.values(), d.keys()))))
def encode_simple(phrase, tree=None):
"""
>>> ''.join(encode_simple(phrase))
'01011110011010010111011111001101001011111100110011111010111001100111100'
"""
if tree is None:
tree = frequency_histogram(phrase)
for c in phrase:
for count, character in tree:
if character == c:
yield '0'
break
else:
yield '1'
def decode_simple(phrase, tree):
"""
>>> ''.join(decode_simple('01011110011010010111011111001101001011111100110011111010111001100111100', frequency_histogram(phrase)))
'a man a plan a canal panama'
"""
cur = iter(tree)
for c in phrase:
if c == '1':
cur.next()
elif c == '0':
yield cur.next()[1]
cur = iter(tree)
import heapq
def huffman_tree(phrase):
"""
>>> huffman_tree(phrase)
(27, None, (10, 'a', None, None), (17, None, (7, None, (3, None, (1, 'c', None, None), (2, 'l', None, None)), (4, None, (2, 'm', None, None), (2, 'p', None, None))), (10, None, (4, 'n', None, None), (6, ' ', None, None))))
"""
tree = [(freq, char, None, None) for freq, char in frequency_histogram(phrase)] # last 2 args are left and right children
heapq.heapify(tree)
while len(tree) > 1:
p1, c1, r1, l1 = heapq.heappop(tree)
p2, c2, r2, l2 = heapq.heappop(tree)
heapq.heappush(tree, (p1 + p2, None, (p1, c1, r1, l1), (p2, c2, r2, l2)))
return tree[0]
def encode_character(c, tree, append=''):
"""
>>> encode_character('a', huffman_tree(phrase))
'0'
"""
if not tree:
return string_so_far
freq, character, right_child, left_child = tree
if c == character:
return append
if right_child:
r = encode_character(c, right_child, append='1')
if r:
return append + r
if left_child:
l = encode_character(c, left_child, append='0')
if l:
return append + l
return None
def encode_huffman(phrase, tree=None):
"""
>>> encode_huffman(phrase)
'10000101100100010000100011010010001000011110011011000001001001101011'
"""
if tree is None:
tree = huffman_tree(phrase)
return ''.join(encode_character(c, tree) for c in phrase)
def hist2dot(hist):
print 'digraph g {'
last = 'root'
i = 0
for freq, c in hist:
print last, '-> ', c.replace(' ','space'), ' [label=0];'
new = bin(i).replace('b','')
print last, '->', new, '[label=1];'
last = new
i += 1
print '}'
print
def tree2dot(tree):
total, char, left, right = tree
titles = {}
def get_title(node):
freq, char, left, right = node
if id(node) not in titles:
count = len(titles)
titles[id(node)] = count
return titles[id(node)]
def get_node(node):
freq, char, left, right = node
title = get_title(node)
yield '\t%s [label="%sp=%0.3f"];' % (
title,
'%s\\n' % repr(char).encode('unicode-escape').replace('"', '\\"') if char else '',
1.0 * freq / total,
)
if right:
yield '\t%s -> %s [label=1];' % (title, get_title(right))
if left:
yield '\t%s -> %s [label=0];' % (title, get_title(left))
def inner(node):
freq, char, left, right = node
text = list(get_node(node))
if left:
text.extend(inner(left))
if right:
text.extend(inner(right))
return text
text = ['digraph g {']
text.extend(inner(tree))
text.append('}')
return '\n'.join(text);
if __name__ == '__main__':
import sys
if '--help' in sys.argv:
print >>sys.stderr, 'usage: huffman.py --help # print this message'
print >>sys.stderr, ' huffman.py --test # run tests'
print >>sys.stderr, ' huffman.py < file # output the string representation of binary encoding of file'
print >>sys.stderr, ' huffman.py --tree < file # output the tree (in dot format) generated from file'
raise SystemExit(0)
if '--test' in sys.argv:
def test_character(c, phrase):
print c, encode_character(c, huffman_tree(phrase)), phrase
def test_phrase(phrase):
print "Phrase: %s" % phrase
print "Length: %d" % (len(phrase) * 8)
encoded = encode_huffman(phrase)
print "Encoded: %s" % encoded
print "Code Length: %d" % len(encoded)
print "Compression: %f" % (1.0 * len(encoded) / (len(phrase) * 8))
test_character('a', 'a'*50 + 'b'*25 + 'c'*13 + 'd'*12)
test_character('b', 'a'*50 + 'b'*25 + 'c'*13 + 'd'*12)
test_character('c', 'a'*50 + 'b'*25 + 'c'*13 + 'd'*12)
test_character('d', 'a'*50 + 'b'*25 + 'c'*13 + 'd'*12)
test_phrase(phrase)
test_phrase('a'*50 + 'b'*25 + 'c'*13 + 'd'*12)
test_character('a', phrase)
test_character(' ', phrase)
test_character('m', phrase)
test_character('n', phrase)
test_character('p', phrase)
test_character('c', phrase)
test_character('l', phrase)
raise SystemExit(0)
def calculate_compression(original, encoded):
original_bits = len(original.decode('UTF-8')*8)
encoded_bits = len(encoded)
print >>sys.stderr, "Original bits: %d" % original_bits
print >>sys.stderr, "Encoded bits: %d" % encoded_bits
print >>sys.stderr, "Compression: %f" % (1.0 * encoded_bits / original_bits)
original = sys.stdin.read()
tree = huffman_tree(original)
encoded = encode_huffman(original, tree)
calculate_compression(original, encoded)
if '--tree' in sys.argv:
digraph = tree2dot(tree)
print digraph
else:
print encoded
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment