Last active
December 12, 2021 18:54
-
-
Save ricsi98/5454136f1fc212ada4dc0781df61be2a to your computer and use it in GitHub Desktop.
Lempel-Ziv-Welch algorithm in python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Dict, List | |
def match_length(data: bytearray, ptr: int, word: bytearray): | |
offset = 0 | |
while offset < len(word) and offset + ptr < len(data): | |
if data[ptr + offset] != word[offset]: break | |
offset += 1 | |
return offset | |
def find_longest_match(data: bytearray, ptr: int, dictionary: Dict[int, bytearray]): | |
if len(dictionary) == 0: return -1, 0 | |
longest_idx, longest_match = -1, 0 | |
for i, word in dictionary.items(): | |
if type(word) == int: | |
word = [word] | |
l = match_length(data, ptr, word) | |
if l > longest_match: | |
longest_idx = i | |
longest_match = l | |
return longest_idx, longest_match | |
class LZW: | |
"""Implementation of Lempel-Ziv-Welch algorithm with a maximal dictionary size""" | |
def __init__(self, max_dict_len: int): | |
self.max_dict_len = max_dict_len | |
def encode(self, data: bytearray) -> List[int]: | |
D = {} | |
characters = set(data) | |
current_idx = 1 | |
for c in characters: | |
D[current_idx] = c.to_bytes(1, 'big') | |
current_idx += 1 | |
start_dict = {k: v for k, v in D.items()} | |
ptr = 0 | |
out = [] | |
while ptr < len(data): | |
i, l = find_longest_match(data, ptr, D) | |
out += [i] | |
if current_idx < self.max_dict_len: | |
D[current_idx] = data[ptr:ptr + l + 1] | |
current_idx += 1 | |
ptr += l | |
return start_dict, out | |
def decode(self, dictionary: Dict[int, bytearray], code: List[int]) -> bytearray: | |
out = bytearray("", "utf-8") | |
current_idx = max(list(dictionary.keys())) + 1 | |
# copy dict | |
dictionary = {k: v for k, v in dictionary.items()} | |
for i, j in zip(code, code[1:] + [-1]): | |
cword = dictionary[i] | |
out += cword | |
if j == -1: # stop token | |
break | |
if current_idx < self.max_dict_len: | |
if j < current_idx: | |
nword = dictionary[j] | |
elif j == current_idx: | |
nword = dictionary[i] | |
nword_first = nword[0] | |
new_dict_word = cword + nword_first.to_bytes(1, 'big') | |
dictionary[current_idx] = new_dict_word | |
current_idx += 1 | |
return out | |
if __name__ == "__main__": | |
c = LZW(256) | |
with open("data.txt", "r") as f: | |
as_string = bytearray(f.read(), "utf-8") | |
cdict, comp = c.encode(as_string) | |
print(len(cdict), len(comp)) | |
decomp = c.decode(cdict, comp) | |
print(decomp == as_string) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment