Last active
August 6, 2016 14:16
-
-
Save metula/9595064 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import http.client | |
import random | |
def random_string(n): | |
return "".join(chr(random.randrange(128)) for i in range(n)) | |
def str_to_bin(text): | |
""" translates text to binary reprersentation using | |
7 bits per character. Non ASCII characters are discarded""" | |
return ''.join('{:07b}'.format(ord(ch)) for ch in text if ord(ch) < 128) | |
def bin_to_bytes(bin_string): | |
""" translates a string of 0s and 1s to raw bytes """ | |
return bytes(int(bin_string[i : i + 8], 2) for i in range(0, len(bin_string), 8)) | |
def purify(text): | |
""" Discards non ascii characters from text""" | |
return ''.join(ch for ch in text if ord(ch) < 128) | |
def char_count(text): | |
""" counts the number of occurances of ASCII characters - | |
ord(ch)<128, in a text. Returns a dictionary, with keys | |
being the observed characters, values being the counts """ | |
d = {} | |
for ch in text: | |
if ord(ch) >= 128: continue | |
if ch in d: | |
d[ch] = 1 + d[ch] | |
else: | |
d[ch] = 1 | |
return d | |
def build_huffman_tree(letter_count): | |
""" recieves dictionary with char:count entries | |
generates a list of letters representing | |
the binary Huffman encoding tree""" | |
queue = sorted(letter_count.items(), key=lambda elem: elem[1]) | |
# sort by the count | |
while len(queue) >= 2: | |
# combine two smallest elements | |
ax, ay = queue.pop(0) # smallest in queue | |
bx, by = queue.pop(0) # next smallest | |
chars = [ax,bx] | |
weight = ay+by # combined weight | |
queue.append((chars,weight)) # insert new pair at beginning of queue | |
queue.sort(key=lambda elem: elem[1]) # re-arange (sort) queue | |
return queue[0][0] # peels off the irrelevant data (total weight) | |
def generate_code(tree_node, prefix=""): | |
""" recieves a tree node with embedded encoding, | |
and a prefix of encodings. | |
returns a dictionary where characters are | |
keys and associated binary strings are values. | |
*** assumes more than a single leaf *** """ | |
if isinstance(tree_node, str): # a leaf | |
return {tree_node: prefix} | |
else: | |
lchild, rchild = tree_node | |
codebook = {} | |
codebook.update( generate_code(lchild, prefix + '0')) | |
codebook.update( generate_code(rchild, prefix + '1')) | |
# oh, the beauty of recursion... | |
return codebook | |
def build_decoding_dict(encoding_dict): | |
"""build the "reverse" of encoding dictionary""" | |
return {y:x for x,y in encoding_dict.items()} | |
def compress(text, encoding_dict): | |
""" compress text using encoding dictionary """ | |
assert isinstance(text, str) | |
return ''.join(encoding_dict[ch] for ch in text if ord(ch) <= 128) | |
def decompress(bits, decoding_dict): | |
""" decompress encoded bits using decoding dictionary """ | |
prefix = '' | |
result = [] | |
for bit in bits: | |
prefix += bit | |
if prefix not in decoding_dict: | |
continue | |
result.append(decoding_dict[prefix]) | |
prefix = '' | |
assert prefix == '' # must finish last codeword | |
return ''.join(result) # converts list of chars to a string | |
# Requires the TreeNode class from trees.py. | |
def convert_to_tree(lst, prefix=''): | |
"""Given a Huffman tree, represented as a list, | |
returns its TreeNode equivalent.""" | |
if len(lst) == 1: | |
return TreeNode(lst[0], prefix) | |
left = convert_to_tree(lst[0], prefix + '0') | |
right = convert_to_tree(lst[1], prefix + '1') | |
return TreeNode(left.key + right.key, prefix, left, right) | |
def download_wiki(): | |
HOST = "en.wikipedia.org" | |
GET_METHOD = "GET" | |
URL = "/wiki/Huffman_coding" | |
ENCODING = "utf-8" | |
# Get the HTML raw data | |
conn = http.client.HTTPConnection(HOST) | |
conn.request(GET_METHOD, URL) | |
data = conn.getresponse().read() | |
conn.close() | |
return str(data, ENCODING) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment