metula/huffman.py

## huffman.py
import http.client
import random

def random_string(n):
  return "".join(chr(random.randrange(128)) for i in range(n))

def str_to_bin(text):
  """ translates text to binary reprersentation using
  7 bits per character. Non ASCII characters are discarded"""
  return ''.join('{:07b}'.format(ord(ch)) for ch in text if ord(ch) < 128)

def bin_to_bytes(bin_string):
  """ translates a string of 0s and 1s to raw bytes """
  return bytes(int(bin_string[i : i + 8], 2) for i in range(0, len(bin_string), 8))

def purify(text):
  """ Discards non ascii characters from text"""
  return ''.join(ch for ch in text if ord(ch) < 128)

def char_count(text):
  """ counts the number of occurances of ASCII characters -
  ord(ch)<128, in a text. Returns a dictionary, with keys
  being the observed characters, values being the counts """
  d = {}
  for ch in text:
    if ord(ch) >= 128: continue
    if ch in d:
      d[ch] = 1 + d[ch]
    else:
      d[ch] = 1
  return d

def build_huffman_tree(letter_count):
  """ recieves dictionary with char:count entries
      generates a list of letters representing
      the binary Huffman encoding tree"""
  queue = sorted(letter_count.items(), key=lambda elem: elem[1])
  # sort by the count
  while len(queue) >= 2:
  # combine two smallest elements
    ax, ay = queue.pop(0)   # smallest in queue
    bx, by = queue.pop(0)   # next smallest
    chars = [ax,bx]
    weight = ay+by # combined weight
    queue.append((chars,weight)) # insert new pair at beginning of queue
    queue.sort(key=lambda elem: elem[1]) # re-arange (sort) queue
  return queue[0][0]  # peels off the irrelevant data (total weight)

def generate_code(tree_node, prefix=""):
  """ recieves a tree node with embedded encoding,
      and a prefix of encodings.
      returns a dictionary where characters are
      keys and associated binary strings are values.
      *** assumes more than a single leaf *** """
  if isinstance(tree_node, str):  # a leaf
    return {tree_node: prefix}
  else:
    lchild, rchild = tree_node
    codebook = {}

    codebook.update( generate_code(lchild, prefix + '0'))
    codebook.update( generate_code(rchild, prefix + '1'))
    # oh, the beauty of recursion...
    return codebook

def build_decoding_dict(encoding_dict):
  """build the "reverse" of encoding dictionary"""
  return {y:x for x,y in encoding_dict.items()}

def compress(text, encoding_dict):
  """ compress text using encoding dictionary """
  assert isinstance(text, str)
  return ''.join(encoding_dict[ch] for ch in text if ord(ch) <= 128)

def decompress(bits, decoding_dict):
  """ decompress encoded bits using decoding dictionary """
  prefix = ''
  result = []
  for bit in bits:
    prefix += bit
    if prefix not in decoding_dict:
      continue
    result.append(decoding_dict[prefix])
    prefix = ''
  assert prefix == ''  # must finish last codeword
  return ''.join(result)  # converts list of chars to a string

# Requires the TreeNode class from trees.py.
def convert_to_tree(lst, prefix=''):
  """Given a Huffman tree, represented as a list,
     returns its TreeNode equivalent."""
  if len(lst) == 1:
    return TreeNode(lst[0], prefix)
  left = convert_to_tree(lst[0], prefix + '0')
  right = convert_to_tree(lst[1], prefix + '1')
  return TreeNode(left.key + right.key, prefix, left, right)

def download_wiki():
  HOST = "en.wikipedia.org"
  GET_METHOD = "GET"
  URL = "/wiki/Huffman_coding"
  ENCODING = "utf-8"

  # Get the HTML raw data
  conn = http.client.HTTPConnection(HOST)
  conn.request(GET_METHOD, URL)
  data = conn.getresponse().read()
  conn.close()
  return str(data, ENCODING)
	import http.client
	import random

	def random_string(n):
	return "".join(chr(random.randrange(128)) for i in range(n))

	def str_to_bin(text):
	""" translates text to binary reprersentation using
	7 bits per character. Non ASCII characters are discarded"""
	return ''.join('{:07b}'.format(ord(ch)) for ch in text if ord(ch) < 128)

	def bin_to_bytes(bin_string):
	""" translates a string of 0s and 1s to raw bytes """
	return bytes(int(bin_string[i : i + 8], 2) for i in range(0, len(bin_string), 8))

	def purify(text):
	""" Discards non ascii characters from text"""
	return ''.join(ch for ch in text if ord(ch) < 128)

	def char_count(text):
	""" counts the number of occurances of ASCII characters -
	ord(ch)<128, in a text. Returns a dictionary, with keys
	being the observed characters, values being the counts """
	d = {}
	for ch in text:
	if ord(ch) >= 128: continue
	if ch in d:
	d[ch] = 1 + d[ch]
	else:
	d[ch] = 1
	return d

	def build_huffman_tree(letter_count):
	""" recieves dictionary with char:count entries
	generates a list of letters representing
	the binary Huffman encoding tree"""
	queue = sorted(letter_count.items(), key=lambda elem: elem[1])
	# sort by the count
	while len(queue) >= 2:
	# combine two smallest elements
	ax, ay = queue.pop(0) # smallest in queue
	bx, by = queue.pop(0) # next smallest
	chars = [ax,bx]
	weight = ay+by # combined weight
	queue.append((chars,weight)) # insert new pair at beginning of queue
	queue.sort(key=lambda elem: elem[1]) # re-arange (sort) queue
	return queue[0][0] # peels off the irrelevant data (total weight)

	def generate_code(tree_node, prefix=""):
	""" recieves a tree node with embedded encoding,
	and a prefix of encodings.
	returns a dictionary where characters are
	keys and associated binary strings are values.
	* assumes more than a single leaf * """
	if isinstance(tree_node, str): # a leaf
	return {tree_node: prefix}
	else:
	lchild, rchild = tree_node
	codebook = {}

	codebook.update( generate_code(lchild, prefix + '0'))
	codebook.update( generate_code(rchild, prefix + '1'))
	# oh, the beauty of recursion...
	return codebook

	def build_decoding_dict(encoding_dict):
	"""build the "reverse" of encoding dictionary"""
	return {y:x for x,y in encoding_dict.items()}

	def compress(text, encoding_dict):
	""" compress text using encoding dictionary """
	assert isinstance(text, str)
	return ''.join(encoding_dict[ch] for ch in text if ord(ch) <= 128)

	def decompress(bits, decoding_dict):
	""" decompress encoded bits using decoding dictionary """
	prefix = ''
	result = []
	for bit in bits:
	prefix += bit
	if prefix not in decoding_dict:
	continue
	result.append(decoding_dict[prefix])
	prefix = ''
	assert prefix == '' # must finish last codeword
	return ''.join(result) # converts list of chars to a string

	# Requires the TreeNode class from trees.py.
	def convert_to_tree(lst, prefix=''):
	"""Given a Huffman tree, represented as a list,
	returns its TreeNode equivalent."""
	if len(lst) == 1:
	return TreeNode(lst[0], prefix)
	left = convert_to_tree(lst[0], prefix + '0')
	right = convert_to_tree(lst[1], prefix + '1')
	return TreeNode(left.key + right.key, prefix, left, right)

	def download_wiki():
	HOST = "en.wikipedia.org"
	GET_METHOD = "GET"
	URL = "/wiki/Huffman_coding"
	ENCODING = "utf-8"

	# Get the HTML raw data
	conn = http.client.HTTPConnection(HOST)
	conn.request(GET_METHOD, URL)
	data = conn.getresponse().read()
	conn.close()
	return str(data, ENCODING)