Created
April 12, 2018 13:36
-
-
Save philip-sterne/2e0f27e8981b17094a28475d6190d3f6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import heapq | |
from urllib.request import urlopen | |
import shutil | |
import gzip | |
import os | |
from collections import defaultdict | |
from bitarray import bitarray | |
import pickle | |
# Download the file if need be: | |
def download_file(url, filename): | |
if not os.path.exists(filename): | |
response = urlopen(url + filename) | |
shutil.copyfileobj( | |
gzip.GzipFile(fileobj=response), open(filename, 'wb')) | |
# build a frequency table: | |
def build_freq(filename): | |
freq = defaultdict(int) | |
with open(filename, 'rb') as f: | |
for line in f: | |
for char in line: | |
freq[char] += 1 | |
total = float(sum(freq.values())) | |
return {char: count / total for (char, count) in freq.items()} | |
# Now build the Huffman encoding: | |
def encode(symb2freq): | |
"""Huffman encode the given dict mapping symbols to weights. | |
Accept a dictionary which maps a symbol to a probability. | |
Return a new dictionary which maps a symbol to a bitarray.""" | |
raise NotImplementedError | |
# Now compress the file: | |
def compress(filename, encoding, compressed_name=None): | |
if compressed_name is None: | |
compressed_name = filename + ".huff" | |
output = bitarray() | |
with open(filename, 'rb') as f: | |
for line in f: | |
for char in line: | |
output.extend(encoding[char]) | |
N = len(output) | |
with open(compressed_name, 'wb') as f: | |
pickle.dump(N, f) | |
pickle.dump(encoding, f) | |
output.tofile(f) | |
# Now decompress the file: | |
def decompress(filename, decompressed_name=None): | |
if decompressed_name is None: | |
decompressed_name = filename + ".dehuff" | |
with open(filename, 'rb') as f: | |
N = pickle.load(f) | |
encoding = pickle.load(f) | |
bits = bitarray() | |
bits.fromfile(f) | |
bits = bits[:N] | |
# Totally cheating here and using a builtin method: | |
output = bits.decode(encoding) | |
with open(decompressed_name, 'wb') as f: | |
f.write(bytes(output)) | |
url = "http://www.gutenberg.org/ebooks/" | |
filename = "100.txt.utf-8" | |
download_file(url, filename) | |
freq = build_freq(filename) | |
encoding = encode(freq) | |
compress(filename, encoding) | |
decompress(filename + ".huff") | |
# Do you get identical files? |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment