ricsi98/lzw.py

## lzw.py
from typing import Dict, List


def match_length(data: bytearray, ptr: int, word: bytearray):
    offset = 0
    while offset < len(word) and offset + ptr < len(data):
        if data[ptr + offset] != word[offset]: break
        offset += 1
    return offset

def find_longest_match(data: bytearray, ptr: int, dictionary: Dict[int, bytearray]):
    if len(dictionary) == 0: return -1, 0
    longest_idx, longest_match = -1, 0
    for i, word in dictionary.items():
        if type(word) == int:
            word = [word]
        l = match_length(data, ptr, word)
        if l > longest_match:
            longest_idx = i
            longest_match = l
    return longest_idx, longest_match

class LZW:
    """Implementation of Lempel-Ziv-Welch algorithm with a maximal dictionary size"""

    def __init__(self, max_dict_len: int):
        self.max_dict_len = max_dict_len

    def encode(self, data: bytearray) -> List[int]:
        D = {}
        characters = set(data)
        current_idx = 1
        for c in characters:
            D[current_idx] = c.to_bytes(1, 'big')
            current_idx += 1

        start_dict = {k: v for k, v in D.items()}

        ptr = 0
        out = []
        while ptr < len(data):
            i, l = find_longest_match(data, ptr, D)
            out += [i]
            if current_idx < self.max_dict_len:
                D[current_idx] = data[ptr:ptr + l + 1]
                current_idx += 1
            ptr += l

        return start_dict, out

    def decode(self, dictionary: Dict[int, bytearray], code: List[int]) -> bytearray:
        out = bytearray("", "utf-8")
        current_idx = max(list(dictionary.keys())) + 1
        # copy dict
        dictionary = {k: v for k, v in dictionary.items()}

        for i, j in zip(code, code[1:] + [-1]):
            cword = dictionary[i]
            out += cword

            if j == -1: # stop token
                break

            if current_idx < self.max_dict_len:
                if j < current_idx:
                    nword = dictionary[j]
                elif j == current_idx:
                    nword = dictionary[i]
                nword_first = nword[0]
                new_dict_word = cword + nword_first.to_bytes(1, 'big')
                dictionary[current_idx] = new_dict_word

                current_idx += 1

        return out


if __name__ == "__main__":
    c = LZW(256)
    with open("data.txt", "r") as f:
        as_string = bytearray(f.read(), "utf-8")
    cdict, comp = c.encode(as_string)
    print(len(cdict), len(comp))
    decomp = c.decode(cdict, comp)
    print(decomp == as_string)
	from typing import Dict, List


	def match_length(data: bytearray, ptr: int, word: bytearray):
	offset = 0
	while offset < len(word) and offset + ptr < len(data):
	if data[ptr + offset] != word[offset]: break
	offset += 1
	return offset

	def find_longest_match(data: bytearray, ptr: int, dictionary: Dict[int, bytearray]):
	if len(dictionary) == 0: return -1, 0
	longest_idx, longest_match = -1, 0
	for i, word in dictionary.items():
	if type(word) == int:
	word = [word]
	l = match_length(data, ptr, word)
	if l > longest_match:
	longest_idx = i
	longest_match = l
	return longest_idx, longest_match

	class LZW:
	"""Implementation of Lempel-Ziv-Welch algorithm with a maximal dictionary size"""

	def __init__(self, max_dict_len: int):
	self.max_dict_len = max_dict_len

	def encode(self, data: bytearray) -> List[int]:
	D = {}
	characters = set(data)
	current_idx = 1
	for c in characters:
	D[current_idx] = c.to_bytes(1, 'big')
	current_idx += 1

	start_dict = {k: v for k, v in D.items()}

	ptr = 0
	out = []
	while ptr < len(data):
	i, l = find_longest_match(data, ptr, D)
	out += [i]
	if current_idx < self.max_dict_len:
	D[current_idx] = data[ptr:ptr + l + 1]
	current_idx += 1
	ptr += l

	return start_dict, out

	def decode(self, dictionary: Dict[int, bytearray], code: List[int]) -> bytearray:
	out = bytearray("", "utf-8")
	current_idx = max(list(dictionary.keys())) + 1
	# copy dict
	dictionary = {k: v for k, v in dictionary.items()}

	for i, j in zip(code, code[1:] + [-1]):
	cword = dictionary[i]
	out += cword

	if j == -1: # stop token
	break

	if current_idx < self.max_dict_len:
	if j < current_idx:
	nword = dictionary[j]
	elif j == current_idx:
	nword = dictionary[i]
	nword_first = nword[0]
	new_dict_word = cword + nword_first.to_bytes(1, 'big')
	dictionary[current_idx] = new_dict_word

	current_idx += 1

	return out


	if __name__ == "__main__":
	c = LZW(256)
	with open("data.txt", "r") as f:
	as_string = bytearray(f.read(), "utf-8")
	cdict, comp = c.encode(as_string)
	print(len(cdict), len(comp))
	decomp = c.decode(cdict, comp)
	print(decomp == as_string)