samolisov/huffman_code.py

## huffman_code.py
import heapq

def main():
    freqmap = {'a':45, 'b':13, 'c':12, 'd':16, 'e':9, 'f':5}
    char_to_code = huffman(freqmap)
    print(char_to_code)
    print(invert_dictionary(char_to_code))
    print()

    # see https://en.wikipedia.org/wiki/Letter_frequency#Relative_frequencies_of_letters_in_other_languages
    freqmap_german = {'a':6.516, 'b':1.886, 'c':2.732, 'd':5.076, 'e':16.396, 'f':1.656, 'g':3.009, 'h':4.577, 'i':6.550,
                      'j':0.268, 'k':1.417, 'l':3.437, 'm':2.534, 'n':9.776, 'o':2.594, 'p':0.670, 'q':0.018, 'r':7.003,
                      's':7.270, 't':6.154, 'u':4.166, 'v':0.846, 'w':1.921, 'x':0.034, 'y':0.039, 'z':1.134,
                      'ß':0.307, 'ä':0.578, 'ö':0.443, 'ü':0.995}
    char_to_code = huffman(freqmap_german)
    print(char_to_code)
    print()

    text = "aaaa"
    print(text)
    char_to_code, encoded = huffman_encode(text)
    print(char_to_code)
    decoded = huffman_decode(invert_dictionary(char_to_code), encoded)
    print(decoded)
    print("Is text encoded/decoded correctly?:", (decoded == text))
    print()

    text = "Du grosses Gestirn! Was wäre dein Glück, wenn du nicht Die hättest, welchen du leuchtest!"
    print(text)
    char_to_code, encoded = huffman_encode(text)
    decoded = huffman_decode(invert_dictionary(char_to_code), encoded)
    print(decoded)
    print("Is text encoded/decoded correctly?:", (decoded == text))

def huffman(freqmap):
    """
    Builds a code map emloyeing Huffman's algorithm (see Cormen et al, chapter 16.3). The frequences are given in the 'freqmap'
    parameter. The function returns a map of "char" - "code". Because we are using a queue implemented as a binary min-heap (see the
    documentation for the standard library: https://docs.python.org/3/library/heapq.html), the total running time of the algorithm
    on a set of n characters is O(n lg n). We can reduce the running time to O(n lg lg n) by replacing the binary min-heap with
    a van Emde Boas tree (see Cormen et al, chapter 20).

    Args:
    -----
        freqmap: a map of a character to its frequency in a text, dictionary. For example,
        {'a':45, 'b':13, 'c':12, 'd':16, 'e':9, 'f':5}.
    Returns:
    --------
        a map of a charater to its Huffman's prefix-code, for example,
        {'a': '0', 'b': '101', 'c': '100', 'd': '111', 'e': '1101', 'f': '1100'}.
    """
    class Node:
        def __init__(self, freq, char, left = None, right = None):
            self.__left = left
            self.__right = right
            self.__freq = freq
            self.__char = char

        def addLeft(self, left):
            self.__left = left

        def addRight(self, right):
            self.__right = right

        def freq(self):
            return self.__freq

        def char(self):
            return self.__char

        def hasLeft(self):
            return self.__left != None

        def hasRight(self):
            return self.__right != None

        def isTerminal(self):
            return not(self.hasLeft()) and not(self.hasRight())

        def left(self):
            return self.__left

        def right(self):
            return self.__right

        def __lt__(self, other):
            return self.freq() < other.freq()

    queue = [Node(freq, key) for key, freq in freqmap.items()]
    heapq.heapify(queue)
    for i in range(len(freqmap) - 1):
        left_node = heapq.heappop(queue)
        right_node = heapq.heappop(queue)
        heapq.heappush(queue, Node(left_node.freq() + right_node.freq(), None, left_node, right_node))

    root = heapq.heappop(queue)

    # special case: freqmap with one element only
    if root.isTerminal():
        return {root.char(): '0'}

    stack = [(root, "")]
    char_to_code = {}
    while len(stack) > 0:
        node, code = stack.pop()
        if node.hasLeft():
            stack.append((node.left(), code + '0'))

        if node.hasRight():
            stack.append((node.right(), code + '1'))

        if node.isTerminal():
            char_to_code[node.char()] = code

    return dict(sorted(char_to_code.items())) # sort the dict for better readability

def huffman_encode(text):
    """
    Encodes the text 'text' to a binary representation (each binary number encoded as a character '1' or '0' for
    the demonstration purposes) using a based on character frequency variable-length prefix-code. The prefix-code
    is called as Huffman code (see the function 'huffman' that is used to build the Huffman code).

    Args:
    -----
        text (string): a text for encoding.
    Returns:
    --------
        char_to_code: a map of a charater to its Huffman's prefix-code, for example,
        {'a': '0', 'b': '101', 'c': '100', 'd': '111', 'e': '1101', 'f': '1100'}.
        encoded: the encoded text 'text'.
    """

    def build_freqmap(text):
        """
        Analyses the 'text' text and builds a frequency map for the text: a map of a character to a normalized
        number of how many times the character appears in the text.

        Args:
        -----
        text (string): a text for analysis.

        Returns:
        --------
        a map of a charater to a normaliized number of occuriences of the character in the text,
        e.g. {'a':45, 'b':13, 'c':12, 'd':16, 'e':9, 'f':5}
        """

        length = len(text)
        freqmap = {}

        # get the frequences
        for i in text:
            freqmap[i] = freqmap.get(i, 0) + 1

        # normalize the frequences
        for key in freqmap.keys():
            freqmap[key] = (freqmap[key] / length) * 100

        return freqmap

    char_to_code = huffman(build_freqmap(text))
    encoded_symbols = []
    for x in text:
        encoded_symbols.append(char_to_code[x])
    return char_to_code, ''.join(encoded_symbols)

def huffman_decode(code_to_char, encoded):
    """
    Decodes an encoded using a variable-length prefix-code called as Huffman code (see the function 'huffman' that
    is used to build the Huffman code) text.

    Args:
    -----
        code_to_char: a map of a Huffman's prefix-code of a character to the character, for example,
        {'0': 'a', '101': 'b', '100': 'c', '111': 'd', '1101': 'e', '1100': 'f'}.
        encoded (string): an encoded text to be decoded.
    Returns:
    --------
        the decoded text (string).
    """

    decoded_symbols = []
    symbol_start = 0
    symbol_end = 1
    length = len(encoded)
    while symbol_end <= length:
        key = encoded[symbol_start:symbol_end]
        if key in code_to_char:
            decoded_symbols.append(code_to_char[key])
            symbol_start = symbol_end
        else:
            symbol_end = symbol_end + 1

    if symbol_start < length:
        raise ValueError("The encoded string cannot be recognized")

    return ''.join(decoded_symbols)

def invert_dictionary(dictionary):
    return dict(map(reversed, dictionary.items()))

if __name__ == "__main__":
    main()
	import heapq

	def main():
	freqmap = {'a':45, 'b':13, 'c':12, 'd':16, 'e':9, 'f':5}
	char_to_code = huffman(freqmap)
	print(char_to_code)
	print(invert_dictionary(char_to_code))
	print()

	# see https://en.wikipedia.org/wiki/Letter_frequency#Relative_frequencies_of_letters_in_other_languages
	freqmap_german = {'a':6.516, 'b':1.886, 'c':2.732, 'd':5.076, 'e':16.396, 'f':1.656, 'g':3.009, 'h':4.577, 'i':6.550,
	'j':0.268, 'k':1.417, 'l':3.437, 'm':2.534, 'n':9.776, 'o':2.594, 'p':0.670, 'q':0.018, 'r':7.003,
	's':7.270, 't':6.154, 'u':4.166, 'v':0.846, 'w':1.921, 'x':0.034, 'y':0.039, 'z':1.134,
	'ß':0.307, 'ä':0.578, 'ö':0.443, 'ü':0.995}
	char_to_code = huffman(freqmap_german)
	print(char_to_code)
	print()

	text = "aaaa"
	print(text)
	char_to_code, encoded = huffman_encode(text)
	print(char_to_code)
	decoded = huffman_decode(invert_dictionary(char_to_code), encoded)
	print(decoded)
	print("Is text encoded/decoded correctly?:", (decoded == text))
	print()

	text = "Du grosses Gestirn! Was wäre dein Glück, wenn du nicht Die hättest, welchen du leuchtest!"
	print(text)
	char_to_code, encoded = huffman_encode(text)
	decoded = huffman_decode(invert_dictionary(char_to_code), encoded)
	print(decoded)
	print("Is text encoded/decoded correctly?:", (decoded == text))

	def huffman(freqmap):
	"""
	Builds a code map emloyeing Huffman's algorithm (see Cormen et al, chapter 16.3). The frequences are given in the 'freqmap'
	parameter. The function returns a map of "char" - "code". Because we are using a queue implemented as a binary min-heap (see the
	documentation for the standard library: https://docs.python.org/3/library/heapq.html), the total running time of the algorithm
	on a set of n characters is O(n lg n). We can reduce the running time to O(n lg lg n) by replacing the binary min-heap with
	a van Emde Boas tree (see Cormen et al, chapter 20).

	Args:
	-----
	freqmap: a map of a character to its frequency in a text, dictionary. For example,
	{'a':45, 'b':13, 'c':12, 'd':16, 'e':9, 'f':5}.
	Returns:
	--------
	a map of a charater to its Huffman's prefix-code, for example,
	{'a': '0', 'b': '101', 'c': '100', 'd': '111', 'e': '1101', 'f': '1100'}.
	"""
	class Node:
	def __init__(self, freq, char, left = None, right = None):
	self.__left = left
	self.__right = right
	self.__freq = freq
	self.__char = char

	def addLeft(self, left):
	self.__left = left

	def addRight(self, right):
	self.__right = right

	def freq(self):
	return self.__freq

	def char(self):
	return self.__char

	def hasLeft(self):
	return self.__left != None

	def hasRight(self):
	return self.__right != None

	def isTerminal(self):
	return not(self.hasLeft()) and not(self.hasRight())

	def left(self):
	return self.__left

	def right(self):
	return self.__right

	def __lt__(self, other):
	return self.freq() < other.freq()

	queue = [Node(freq, key) for key, freq in freqmap.items()]
	heapq.heapify(queue)
	for i in range(len(freqmap) - 1):
	left_node = heapq.heappop(queue)
	right_node = heapq.heappop(queue)
	heapq.heappush(queue, Node(left_node.freq() + right_node.freq(), None, left_node, right_node))

	root = heapq.heappop(queue)

	# special case: freqmap with one element only
	if root.isTerminal():
	return {root.char(): '0'}

	stack = [(root, "")]
	char_to_code = {}
	while len(stack) > 0:
	node, code = stack.pop()
	if node.hasLeft():
	stack.append((node.left(), code + '0'))

	if node.hasRight():
	stack.append((node.right(), code + '1'))

	if node.isTerminal():
	char_to_code[node.char()] = code

	return dict(sorted(char_to_code.items())) # sort the dict for better readability

	def huffman_encode(text):
	"""
	Encodes the text 'text' to a binary representation (each binary number encoded as a character '1' or '0' for
	the demonstration purposes) using a based on character frequency variable-length prefix-code. The prefix-code
	is called as Huffman code (see the function 'huffman' that is used to build the Huffman code).

	Args:
	-----
	text (string): a text for encoding.
	Returns:
	--------
	char_to_code: a map of a charater to its Huffman's prefix-code, for example,
	{'a': '0', 'b': '101', 'c': '100', 'd': '111', 'e': '1101', 'f': '1100'}.
	encoded: the encoded text 'text'.
	"""

	def build_freqmap(text):
	"""
	Analyses the 'text' text and builds a frequency map for the text: a map of a character to a normalized
	number of how many times the character appears in the text.

	Args:
	-----
	text (string): a text for analysis.

	Returns:
	--------
	a map of a charater to a normaliized number of occuriences of the character in the text,
	e.g. {'a':45, 'b':13, 'c':12, 'd':16, 'e':9, 'f':5}
	"""

	length = len(text)
	freqmap = {}

	# get the frequences
	for i in text:
	freqmap[i] = freqmap.get(i, 0) + 1

	# normalize the frequences
	for key in freqmap.keys():
	freqmap[key] = (freqmap[key] / length) * 100

	return freqmap

	char_to_code = huffman(build_freqmap(text))
	encoded_symbols = []
	for x in text:
	encoded_symbols.append(char_to_code[x])
	return char_to_code, ''.join(encoded_symbols)

	def huffman_decode(code_to_char, encoded):
	"""
	Decodes an encoded using a variable-length prefix-code called as Huffman code (see the function 'huffman' that
	is used to build the Huffman code) text.

	Args:
	-----
	code_to_char: a map of a Huffman's prefix-code of a character to the character, for example,
	{'0': 'a', '101': 'b', '100': 'c', '111': 'd', '1101': 'e', '1100': 'f'}.
	encoded (string): an encoded text to be decoded.
	Returns:
	--------
	the decoded text (string).
	"""

	decoded_symbols = []
	symbol_start = 0
	symbol_end = 1
	length = len(encoded)
	while symbol_end <= length:
	key = encoded[symbol_start:symbol_end]
	if key in code_to_char:
	decoded_symbols.append(code_to_char[key])
	symbol_start = symbol_end
	else:
	symbol_end = symbol_end + 1

	if symbol_start < length:
	raise ValueError("The encoded string cannot be recognized")

	return ''.join(decoded_symbols)

	def invert_dictionary(dictionary):
	return dict(map(reversed, dictionary.items()))

	if __name__ == "__main__":
	main()