Skip to content

Instantly share code, notes, and snippets.

@SCOTT-HAMILTON
Last active February 5, 2022 21:41
Show Gist options
  • Save SCOTT-HAMILTON/568e619115343d17cd5045b36b3b3900 to your computer and use it in GitHub Desktop.
Save SCOTT-HAMILTON/568e619115343d17cd5045b36b3b3900 to your computer and use it in GitHub Desktop.
from anytree import Node, RenderTree
from functools import reduce
from itertools import groupby
import numpy as np
import os
import struct
class InvalidFileHeader(Exception):
pass
def make_occurrences_table(cleartext):
return sorted(
map(lambda g: (g[0], len(list(g[1]))), groupby(sorted(cleartext), lambda x: x)),
key=lambda g: g[1],
)
def make_bin_tree(left, right):
n = Node("")
if left:
left.parent = n
if right:
right.parent = n
return n
def tree_2_path_dict(huff_tree):
def node_2_bin_path(node, orig=None, p=0, n=0):
if orig == None:
orig = node.name
bro = node.parent.children[0]
if node != bro:
b = p
p = p | (1 << n)
n += 1
parent = node.parent
if parent and len(parent.siblings) > 0:
return node_2_bin_path(parent, orig, p, n)
else:
p = p | (1 << n)
return (orig, p)
return dict(map(node_2_bin_path, huff_tree.leaves))
def encode(cleartext, path_dict):
def concat_bits(a, b):
return (a << (b.bit_length() - 1)) | (b & (2 ** (b.bit_length() - 1) - 1))
return reduce(concat_bits, map(lambda c: path_dict[c], cleartext))
def left(node):
if len(node.children) >= 1:
return node.children[0]
else:
return None
def right(node):
if len(node.children) >= 2:
return node.children[1]
else:
return None
def make_huffman_tree_2(oc_table):
def make_huff_tree_iter(nl):
if len(nl) % 2 == 0:
cnl = nl
last = None
else:
last = nl[-1]
cnl = nl[: len(nl) - 1]
n = list(
map(
lambda x: make_bin_tree(x[0], x[1]),
np.asarray(cnl).reshape(int(len(cnl) / 2), 2).tolist(),
)
)
if last:
n += [last]
return n
def make_complete_tree(nl):
while len(nl) > 1:
nl = make_huff_tree_iter(nl)
return nl[0]
def group2tree(g):
return make_complete_tree(list(map(lambda x: Node(x[0]), g)))
return make_complete_tree(
[group2tree(g) for _, g in groupby(oc_table, lambda x: x[1])]
)
def encode_to_file(text, out_file):
oc_table = make_occurrences_table(text)
huff_tree = make_huffman_tree_2(oc_table)
path_dict = tree_2_path_dict(huff_tree)
encoded = encode(text, path_dict)
l = (
np.array(
list(map(lambda item: [ord(item[0]), item[1]], oc_table)) + [[0, 0]],
dtype="int32",
)
.flatten()
.tolist()
)
b = b"".join(list(map(lambda n: struct.pack("I", n), l)))
with open(out_file, "wb") as file:
file.write(b)
size = int(encoded.bit_length() / 8) + 1
file.write(encoded.to_bytes(size, "little"))
def decode_from_file(in_file):
def read_oc_table(file):
prev = 0x01
l = []
while True:
try:
one = file.read(2)
two = file.read(2)
except OSError:
raise InvalidFileHeader
if one == b"\x00\x00" and two == b"\x00\x00":
break
else:
l += [struct.unpack("I", one + two)[0]]
if len(l) % 2 != 0:
raise InvalidFileHeader
else:
nchunks = int(len(l) / 2)
return list(
map(
lambda g: (chr(g[0]), g[1]),
(np.asarray(l).reshape(nchunks, 2).tolist()),
)
)
def read_data(file):
try:
while file.read(1) == b"\x00":
pass
file.seek(file.tell() - 1)
data = file.read()
return data
except OSError:
raise InvalidFileHeader
with open(in_file, "rb") as file:
oc_table = read_oc_table(file)
data = int.from_bytes(read_data(file), "little")
huff_tree = make_huffman_tree_2(oc_table)
cnode = huff_tree
decoded = ""
for c in range(2, data.bit_length() + 1):
def bit(n, a):
return (a >> (a.bit_length() - n)) & 1
cb = bit(c, data)
p1 = cnode.path
cnode = right(cnode) if cb else left(cnode)
if cnode.is_leaf:
decoded += cnode.name
cnode = huff_tree
return decoded
text = """"
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Amet tellus cras adipiscing enim eu turpis egestas pretium. In ante metus dictum at tempor commodo ullamcorper. Nulla pharetra diam sit amet nisl suscipit adipiscing bibendum. Turpis nunc eget lorem dolor sed viverra. Egestas dui id ornare arcu odio ut sem. Commodo quis imperdiet massa tincidunt nunc pulvinar sapien et. Nunc non blandit massa enim. Morbi tincidunt ornare massa eget. Sed adipiscing diam donec adipiscing tristique risus nec feugiat in. Interdum varius sit amet mattis vulputate enim. In hendrerit gravida rutrum quisque non. Sed blandit libero volutpat sed cras. Posuere ac ut consequat semper viverra. Et ultrices neque ornare aenean euismod elementum nisi quis eleifend. Imperdiet sed euismod nisi porta lorem mollis aliquam. Adipiscing vitae proin sagittis nisl rhoncus. Vulputate eu scelerisque felis imperdiet proin fermentum leo vel orci.
"""
file = "out.bin"
encode_to_file(text, file)
decoded = decode_from_file(file)
print(f"decoded text=`{decoded}`")
fs = os.path.getsize(file)
pc = int(fs/len(text)*100)
print(f"{len(text)} bytes compressed to {fs} bytes ({pc}%).")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment