Skip to content

Instantly share code, notes, and snippets.

@metula
Last active August 6, 2016 14:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save metula/9595064 to your computer and use it in GitHub Desktop.
Save metula/9595064 to your computer and use it in GitHub Desktop.
import http.client
import random
def random_string(n):
return "".join(chr(random.randrange(128)) for i in range(n))
def str_to_bin(text):
""" translates text to binary reprersentation using
7 bits per character. Non ASCII characters are discarded"""
return ''.join('{:07b}'.format(ord(ch)) for ch in text if ord(ch) < 128)
def bin_to_bytes(bin_string):
""" translates a string of 0s and 1s to raw bytes """
return bytes(int(bin_string[i : i + 8], 2) for i in range(0, len(bin_string), 8))
def purify(text):
""" Discards non ascii characters from text"""
return ''.join(ch for ch in text if ord(ch) < 128)
def char_count(text):
""" counts the number of occurances of ASCII characters -
ord(ch)<128, in a text. Returns a dictionary, with keys
being the observed characters, values being the counts """
d = {}
for ch in text:
if ord(ch) >= 128: continue
if ch in d:
d[ch] = 1 + d[ch]
else:
d[ch] = 1
return d
def build_huffman_tree(letter_count):
""" recieves dictionary with char:count entries
generates a list of letters representing
the binary Huffman encoding tree"""
queue = sorted(letter_count.items(), key=lambda elem: elem[1])
# sort by the count
while len(queue) >= 2:
# combine two smallest elements
ax, ay = queue.pop(0) # smallest in queue
bx, by = queue.pop(0) # next smallest
chars = [ax,bx]
weight = ay+by # combined weight
queue.append((chars,weight)) # insert new pair at beginning of queue
queue.sort(key=lambda elem: elem[1]) # re-arange (sort) queue
return queue[0][0] # peels off the irrelevant data (total weight)
def generate_code(tree_node, prefix=""):
""" recieves a tree node with embedded encoding,
and a prefix of encodings.
returns a dictionary where characters are
keys and associated binary strings are values.
*** assumes more than a single leaf *** """
if isinstance(tree_node, str): # a leaf
return {tree_node: prefix}
else:
lchild, rchild = tree_node
codebook = {}
codebook.update( generate_code(lchild, prefix + '0'))
codebook.update( generate_code(rchild, prefix + '1'))
# oh, the beauty of recursion...
return codebook
def build_decoding_dict(encoding_dict):
"""build the "reverse" of encoding dictionary"""
return {y:x for x,y in encoding_dict.items()}
def compress(text, encoding_dict):
""" compress text using encoding dictionary """
assert isinstance(text, str)
return ''.join(encoding_dict[ch] for ch in text if ord(ch) <= 128)
def decompress(bits, decoding_dict):
""" decompress encoded bits using decoding dictionary """
prefix = ''
result = []
for bit in bits:
prefix += bit
if prefix not in decoding_dict:
continue
result.append(decoding_dict[prefix])
prefix = ''
assert prefix == '' # must finish last codeword
return ''.join(result) # converts list of chars to a string
# Requires the TreeNode class from trees.py.
def convert_to_tree(lst, prefix=''):
"""Given a Huffman tree, represented as a list,
returns its TreeNode equivalent."""
if len(lst) == 1:
return TreeNode(lst[0], prefix)
left = convert_to_tree(lst[0], prefix + '0')
right = convert_to_tree(lst[1], prefix + '1')
return TreeNode(left.key + right.key, prefix, left, right)
def download_wiki():
HOST = "en.wikipedia.org"
GET_METHOD = "GET"
URL = "/wiki/Huffman_coding"
ENCODING = "utf-8"
# Get the HTML raw data
conn = http.client.HTTPConnection(HOST)
conn.request(GET_METHOD, URL)
data = conn.getresponse().read()
conn.close()
return str(data, ENCODING)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment