Skip to content

Instantly share code, notes, and snippets.

@samolisov
Last active March 25, 2019 11:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save samolisov/bdd493e8d3fffe0d0a3ed8a0b2bad51d to your computer and use it in GitHub Desktop.
Save samolisov/bdd493e8d3fffe0d0a3ed8a0b2bad51d to your computer and use it in GitHub Desktop.
Cormen et al, 16.3 Huffman code (test for German)
import heapq
def main():
freqmap = {'a':45, 'b':13, 'c':12, 'd':16, 'e':9, 'f':5}
char_to_code = huffman(freqmap)
print(char_to_code)
print(invert_dictionary(char_to_code))
print()
# see https://en.wikipedia.org/wiki/Letter_frequency#Relative_frequencies_of_letters_in_other_languages
freqmap_german = {'a':6.516, 'b':1.886, 'c':2.732, 'd':5.076, 'e':16.396, 'f':1.656, 'g':3.009, 'h':4.577, 'i':6.550,
'j':0.268, 'k':1.417, 'l':3.437, 'm':2.534, 'n':9.776, 'o':2.594, 'p':0.670, 'q':0.018, 'r':7.003,
's':7.270, 't':6.154, 'u':4.166, 'v':0.846, 'w':1.921, 'x':0.034, 'y':0.039, 'z':1.134,
'ß':0.307, 'ä':0.578, 'ö':0.443, 'ü':0.995}
char_to_code = huffman(freqmap_german)
print(char_to_code)
print()
text = "aaaa"
print(text)
char_to_code, encoded = huffman_encode(text)
print(char_to_code)
decoded = huffman_decode(invert_dictionary(char_to_code), encoded)
print(decoded)
print("Is text encoded/decoded correctly?:", (decoded == text))
print()
text = "Du grosses Gestirn! Was wäre dein Glück, wenn du nicht Die hättest, welchen du leuchtest!"
print(text)
char_to_code, encoded = huffman_encode(text)
decoded = huffman_decode(invert_dictionary(char_to_code), encoded)
print(decoded)
print("Is text encoded/decoded correctly?:", (decoded == text))
def huffman(freqmap):
"""
Builds a code map emloyeing Huffman's algorithm (see Cormen et al, chapter 16.3). The frequences are given in the 'freqmap'
parameter. The function returns a map of "char" - "code". Because we are using a queue implemented as a binary min-heap (see the
documentation for the standard library: https://docs.python.org/3/library/heapq.html), the total running time of the algorithm
on a set of n characters is O(n lg n). We can reduce the running time to O(n lg lg n) by replacing the binary min-heap with
a van Emde Boas tree (see Cormen et al, chapter 20).
Args:
-----
freqmap: a map of a character to its frequency in a text, dictionary. For example,
{'a':45, 'b':13, 'c':12, 'd':16, 'e':9, 'f':5}.
Returns:
--------
a map of a charater to its Huffman's prefix-code, for example,
{'a': '0', 'b': '101', 'c': '100', 'd': '111', 'e': '1101', 'f': '1100'}.
"""
class Node:
def __init__(self, freq, char, left = None, right = None):
self.__left = left
self.__right = right
self.__freq = freq
self.__char = char
def addLeft(self, left):
self.__left = left
def addRight(self, right):
self.__right = right
def freq(self):
return self.__freq
def char(self):
return self.__char
def hasLeft(self):
return self.__left != None
def hasRight(self):
return self.__right != None
def isTerminal(self):
return not(self.hasLeft()) and not(self.hasRight())
def left(self):
return self.__left
def right(self):
return self.__right
def __lt__(self, other):
return self.freq() < other.freq()
queue = [Node(freq, key) for key, freq in freqmap.items()]
heapq.heapify(queue)
for i in range(len(freqmap) - 1):
left_node = heapq.heappop(queue)
right_node = heapq.heappop(queue)
heapq.heappush(queue, Node(left_node.freq() + right_node.freq(), None, left_node, right_node))
root = heapq.heappop(queue)
# special case: freqmap with one element only
if root.isTerminal():
return {root.char(): '0'}
stack = [(root, "")]
char_to_code = {}
while len(stack) > 0:
node, code = stack.pop()
if node.hasLeft():
stack.append((node.left(), code + '0'))
if node.hasRight():
stack.append((node.right(), code + '1'))
if node.isTerminal():
char_to_code[node.char()] = code
return dict(sorted(char_to_code.items())) # sort the dict for better readability
def huffman_encode(text):
"""
Encodes the text 'text' to a binary representation (each binary number encoded as a character '1' or '0' for
the demonstration purposes) using a based on character frequency variable-length prefix-code. The prefix-code
is called as Huffman code (see the function 'huffman' that is used to build the Huffman code).
Args:
-----
text (string): a text for encoding.
Returns:
--------
char_to_code: a map of a charater to its Huffman's prefix-code, for example,
{'a': '0', 'b': '101', 'c': '100', 'd': '111', 'e': '1101', 'f': '1100'}.
encoded: the encoded text 'text'.
"""
def build_freqmap(text):
"""
Analyses the 'text' text and builds a frequency map for the text: a map of a character to a normalized
number of how many times the character appears in the text.
Args:
-----
text (string): a text for analysis.
Returns:
--------
a map of a charater to a normaliized number of occuriences of the character in the text,
e.g. {'a':45, 'b':13, 'c':12, 'd':16, 'e':9, 'f':5}
"""
length = len(text)
freqmap = {}
# get the frequences
for i in text:
freqmap[i] = freqmap.get(i, 0) + 1
# normalize the frequences
for key in freqmap.keys():
freqmap[key] = (freqmap[key] / length) * 100
return freqmap
char_to_code = huffman(build_freqmap(text))
encoded_symbols = []
for x in text:
encoded_symbols.append(char_to_code[x])
return char_to_code, ''.join(encoded_symbols)
def huffman_decode(code_to_char, encoded):
"""
Decodes an encoded using a variable-length prefix-code called as Huffman code (see the function 'huffman' that
is used to build the Huffman code) text.
Args:
-----
code_to_char: a map of a Huffman's prefix-code of a character to the character, for example,
{'0': 'a', '101': 'b', '100': 'c', '111': 'd', '1101': 'e', '1100': 'f'}.
encoded (string): an encoded text to be decoded.
Returns:
--------
the decoded text (string).
"""
decoded_symbols = []
symbol_start = 0
symbol_end = 1
length = len(encoded)
while symbol_end <= length:
key = encoded[symbol_start:symbol_end]
if key in code_to_char:
decoded_symbols.append(code_to_char[key])
symbol_start = symbol_end
else:
symbol_end = symbol_end + 1
if symbol_start < length:
raise ValueError("The encoded string cannot be recognized")
return ''.join(decoded_symbols)
def invert_dictionary(dictionary):
return dict(map(reversed, dictionary.items()))
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment