Skip to content

Instantly share code, notes, and snippets.

@philip-sterne
Created April 12, 2018 13:36
Show Gist options
  • Save philip-sterne/2e0f27e8981b17094a28475d6190d3f6 to your computer and use it in GitHub Desktop.
Save philip-sterne/2e0f27e8981b17094a28475d6190d3f6 to your computer and use it in GitHub Desktop.
import heapq
from urllib.request import urlopen
import shutil
import gzip
import os
from collections import defaultdict
from bitarray import bitarray
import pickle
# Download the file if need be:
def download_file(url, filename):
if not os.path.exists(filename):
response = urlopen(url + filename)
shutil.copyfileobj(
gzip.GzipFile(fileobj=response), open(filename, 'wb'))
# build a frequency table:
def build_freq(filename):
freq = defaultdict(int)
with open(filename, 'rb') as f:
for line in f:
for char in line:
freq[char] += 1
total = float(sum(freq.values()))
return {char: count / total for (char, count) in freq.items()}
# Now build the Huffman encoding:
def encode(symb2freq):
"""Huffman encode the given dict mapping symbols to weights.
Accept a dictionary which maps a symbol to a probability.
Return a new dictionary which maps a symbol to a bitarray."""
raise NotImplementedError
# Now compress the file:
def compress(filename, encoding, compressed_name=None):
if compressed_name is None:
compressed_name = filename + ".huff"
output = bitarray()
with open(filename, 'rb') as f:
for line in f:
for char in line:
output.extend(encoding[char])
N = len(output)
with open(compressed_name, 'wb') as f:
pickle.dump(N, f)
pickle.dump(encoding, f)
output.tofile(f)
# Now decompress the file:
def decompress(filename, decompressed_name=None):
if decompressed_name is None:
decompressed_name = filename + ".dehuff"
with open(filename, 'rb') as f:
N = pickle.load(f)
encoding = pickle.load(f)
bits = bitarray()
bits.fromfile(f)
bits = bits[:N]
# Totally cheating here and using a builtin method:
output = bits.decode(encoding)
with open(decompressed_name, 'wb') as f:
f.write(bytes(output))
url = "http://www.gutenberg.org/ebooks/"
filename = "100.txt.utf-8"
download_file(url, filename)
freq = build_freq(filename)
encoding = encode(freq)
compress(filename, encoding)
decompress(filename + ".huff")
# Do you get identical files?
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment