Skip to content

Instantly share code, notes, and snippets.

@Shaunwei
Last active January 14, 2016 02:19
Show Gist options
  • Save Shaunwei/83d984ab4b424c8407fe to your computer and use it in GitHub Desktop.
Save Shaunwei/83d984ab4b424c8407fe to your computer and use it in GitHub Desktop.
String compressions - Simple compression and Huffman compression - compare with unpack/pack string problem
"""
Use serveral ways to compress string `everyday is awesome!`
1. use simple bits to replace ASCII value
2. use huffman coding
"""
def get_rate(compressed_binary, uncompressed_bits):
return len(compressed_binary) * 100 / uncompressed_bits
class SimpleCompression:
def __init__(self, string):
self.symbols = set(string)
self.bit_len = 1
while 2**self.bit_len < len(self.symbols):
self.bit_len += 1
self.string = string
self.s2b = {}
self.b2s = {}
i = 0
for s in self.symbols:
b = bin(i)[2:]
if len(b) < self.bit_len:
b = (self.bit_len - len(b)) * '0' + b
self.s2b[s] = b
self.b2s[b] = s
i += 1
def compress(self):
bits = ''
for s in self.string:
bits += self.s2b[s]
return bits
def uncompress(self, bits):
string = ''
for i in xrange(0, len(bits), self.bit_len):
string += self.b2s[bits[i:i + self.bit_len]]
return string
import heapq
import collections
class HuffmanCompression:
class Trie:
def __init__(self, val, char=''):
self.val = val
self.char = char
self.coding = ''
self.left = self.right = None
def __cmp__(self, other):
return self.val - other.val
def __init__(self, string):
self.string = string
counter = collections.Counter(string)
heap = []
for char, cnt in counter.items():
heapq.heappush(heap, HuffmanCompression.Trie(cnt, char))
while len(heap) != 1:
left = heapq.heappop(heap)
right = heapq.heappop(heap)
trie = HuffmanCompression.Trie(left.val + right.val)
trie.left, trie.right = left, right
heapq.heappush(heap, trie)
self.root = heap[0]
self.s2b = {}
self.bfs_encode(self.root, self.s2b)
def bfs_encode(self, root, s2b):
queue = collections.deque()
queue.append(root)
while queue:
node = queue.popleft()
if node.char:
s2b[node.char] = node.coding
continue
if node.left:
node.left.coding = node.coding + '0'
queue.append(node.left)
if node.right:
node.right.coding = node.coding + '1'
queue.append(node.right)
def compress(self):
bits = ''
for char in self.string:
bits += self.s2b[char]
return bits
def uncompress(self, bits):
string = ''
root = self.root
for bit in bits:
if bit == '0':
root = root.left
else:
root = root.right
if root.char:
string += root.char
root = self.root
return string
if __name__ == '__main__':
s = 'everyday is awesome!'
# ASCII
bits = len(s) * 8
print('Total bits: %d' % bits)
# simple compression
sc = SimpleCompression(s)
compressed = sc.compress()
print('Compressed binary: ' + compressed)
print('Uncompressed: ' + sc.uncompress(compressed))
print(sc.s2b)
print('Simple Compression-compress rate: %d%%' % get_rate(compressed, bits))
print('===================')
# huffman compression
hc = HuffmanCompression(s)
compressed = hc.compress()
print('Compressed binary: ' + compressed)
print('Uncompressed: ' + hc.uncompress(compressed))
print(hc.s2b)
print('Huffman Compression-compress rate: %d%%' % get_rate(compressed, bits))
"""
Total bits: 160
Compressed binary: 00101011001010001100001100001100000101000111000100001010001001110110010100101001
Uncompressed: everyday is awesome!
{'a': '0000', ' ': '0001', 'e': '0010', 'd': '0011', 'i': '0100', 'm': '0101', 'o': '0110', 's': '0111', 'r': '1000', '!': '1001', 'w': '1010', 'v': '1011', 'y': '1100'}
Simple Compression-compress rate: 50%
===================
Compressed binary: 011001011011110011010111111000010000111000111111010011110100011011010001
Uncompressed: everyday is awesome!
{'!': '0001', ' ': '001', 'e': '01', 'd': '11010', 'i': '0000', 'm': '11011', 'o': '1000', 's': '1110', 'r': '1011', 'a': '1111', 'w': '1010', 'v': '1001', 'y': '1100'}
Huffman Compression-compress rate: 45%
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment