Skip to content

Instantly share code, notes, and snippets.

@ricsi98
Last active December 12, 2021 18:54
Show Gist options
  • Save ricsi98/5454136f1fc212ada4dc0781df61be2a to your computer and use it in GitHub Desktop.
Save ricsi98/5454136f1fc212ada4dc0781df61be2a to your computer and use it in GitHub Desktop.
Lempel-Ziv-Welch algorithm in python
from typing import Dict, List
def match_length(data: bytearray, ptr: int, word: bytearray):
offset = 0
while offset < len(word) and offset + ptr < len(data):
if data[ptr + offset] != word[offset]: break
offset += 1
return offset
def find_longest_match(data: bytearray, ptr: int, dictionary: Dict[int, bytearray]):
if len(dictionary) == 0: return -1, 0
longest_idx, longest_match = -1, 0
for i, word in dictionary.items():
if type(word) == int:
word = [word]
l = match_length(data, ptr, word)
if l > longest_match:
longest_idx = i
longest_match = l
return longest_idx, longest_match
class LZW:
"""Implementation of Lempel-Ziv-Welch algorithm with a maximal dictionary size"""
def __init__(self, max_dict_len: int):
self.max_dict_len = max_dict_len
def encode(self, data: bytearray) -> List[int]:
D = {}
characters = set(data)
current_idx = 1
for c in characters:
D[current_idx] = c.to_bytes(1, 'big')
current_idx += 1
start_dict = {k: v for k, v in D.items()}
ptr = 0
out = []
while ptr < len(data):
i, l = find_longest_match(data, ptr, D)
out += [i]
if current_idx < self.max_dict_len:
D[current_idx] = data[ptr:ptr + l + 1]
current_idx += 1
ptr += l
return start_dict, out
def decode(self, dictionary: Dict[int, bytearray], code: List[int]) -> bytearray:
out = bytearray("", "utf-8")
current_idx = max(list(dictionary.keys())) + 1
# copy dict
dictionary = {k: v for k, v in dictionary.items()}
for i, j in zip(code, code[1:] + [-1]):
cword = dictionary[i]
out += cword
if j == -1: # stop token
break
if current_idx < self.max_dict_len:
if j < current_idx:
nword = dictionary[j]
elif j == current_idx:
nword = dictionary[i]
nword_first = nword[0]
new_dict_word = cword + nword_first.to_bytes(1, 'big')
dictionary[current_idx] = new_dict_word
current_idx += 1
return out
if __name__ == "__main__":
c = LZW(256)
with open("data.txt", "r") as f:
as_string = bytearray(f.read(), "utf-8")
cdict, comp = c.encode(as_string)
print(len(cdict), len(comp))
decomp = c.decode(cdict, comp)
print(decomp == as_string)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment