Dhravya/compress.py

## compress.py
from rich import print
import time

FORBIDDEN_CHARS = ["⍼", "⳧"]

text= "really really really long text that has a lot of repeating stuff so that it can be coompressed"


def find_all_non_overlapping_repeating_substrings(text: str):
    text = (text + ' ') if text[-1] != ' ' else text
    repeating_substrings = {}
    f = text

    for i in range(len(text)):
        for j in range(i + 2, min(i + 15, len(text))):
            substring = text[i:j]
            if substring in text[j:]:
                if substring in repeating_substrings:
                    repeating_substrings[substring] += 1
                else:
                    repeating_substrings[substring] = 2

    for substring in list(repeating_substrings.keys()):
        if repeating_substrings[substring] < 3:
            repeating_substrings.pop(substring)

    for substring in list(repeating_substrings.keys()):
        for substring2 in list(repeating_substrings.keys()):
            if substring != substring2 and substring in substring2:
                repeating_substrings.pop(substring)
                break

    n_replace_ran = 0
    final_list = []

    def replace_shit(txt):
        nonlocal n_replace_ran
        nonlocal f

        n_replace_ran += 1
        if f.count(txt) == value:
            f = f.replace(txt, "")
            final_list.append(txt)
            n_replace_ran = 0
        else:
            if 30 > n_replace_ran >= 15:
                replace_shit(txt[:-1])
            elif n_replace_ran >= 30:
                return
            else:
                replace_shit(txt[1:])

    for key in repeating_substrings:
        value = repeating_substrings[key]
        replace_shit(key)

    return final_list


def find_all_unused_unicode(text: str, number: int = 10000):
    # return [chr(i) for i in range(0, 10000) if chr(i) not in text]
    found = []
    for i in range(500, 10000):
        if chr(i) not in text:
            found.append(chr(i))
        if len(found) >= number:
            break

    return found


def compress(text: str):
    if any(char in text for char in FORBIDDEN_CHARS):
        raise ValueError("Text contains forbidden characters")
    substrings = find_all_non_overlapping_repeating_substrings(text)
    unused_chars = find_all_unused_unicode(text, len(substrings))
    mapping = {}

    for i in range(len(substrings)):
        mapping[substrings[i]] = unused_chars[i]

    for key in mapping:
        text = text.replace(key, mapping[key])
    reference_string = ""
    for key in mapping:
        reference_string += f"{key}⳧{mapping[key]}⍼"

    if len(substrings) == 0:
        return text

    return reference_string + "⍼⍼⍼" + text


def decompress(text: str):
    if "⍼⍼⍼" not in text:
        return text
    reference_string, text = text.split("⍼⍼⍼⍼")
    mapping = {}
    for pair in reference_string.split("⍼"):
        if pair != "":
            try:
                key, value = pair.split("⳧")
                mapping[value] = key
            except ValueError:
                pass

    for key in mapping:
        text = text.replace(key, mapping[key])

    return text


def test(text):

    compress_time = time.time()
    compressed = compress(text)
    print(f"Compression took {time.time() - compress_time:.2f} seconds")

    decompress_time = time.time()
    decompressed = decompress(compressed)
    print(f"Decompression took {time.time() - decompress_time:.2f} seconds")

    print("Compression/decompression successful:", decompressed == text)
    if (decompressed != text):
        loss = (len(text) - len(decompressed)) / len(text) * 100
        print("Loss in quality: ", loss, "%")
        print(decompressed)

    print("Is compressed smaller than original:", len(compressed) < len(text))
    print(
        f"Compressed text is smaller by {(len(text) - len(compressed)) / len(text) * 100:.2f}%")


test(text)
	from rich import print
	import time

	FORBIDDEN_CHARS = ["⍼", "⳧"]

	text= "really really really long text that has a lot of repeating stuff so that it can be coompressed"


	def find_all_non_overlapping_repeating_substrings(text: str):
	text = (text + ' ') if text[-1] != ' ' else text
	repeating_substrings = {}
	f = text

	for i in range(len(text)):
	for j in range(i + 2, min(i + 15, len(text))):
	substring = text[i:j]
	if substring in text[j:]:
	if substring in repeating_substrings:
	repeating_substrings[substring] += 1
	else:
	repeating_substrings[substring] = 2

	for substring in list(repeating_substrings.keys()):
	if repeating_substrings[substring] < 3:
	repeating_substrings.pop(substring)

	for substring in list(repeating_substrings.keys()):
	for substring2 in list(repeating_substrings.keys()):
	if substring != substring2 and substring in substring2:
	repeating_substrings.pop(substring)
	break

	n_replace_ran = 0
	final_list = []

	def replace_shit(txt):
	nonlocal n_replace_ran
	nonlocal f

	n_replace_ran += 1
	if f.count(txt) == value:
	f = f.replace(txt, "")
	final_list.append(txt)
	n_replace_ran = 0
	else:
	if 30 > n_replace_ran >= 15:
	replace_shit(txt[:-1])
	elif n_replace_ran >= 30:
	return
	else:
	replace_shit(txt[1:])

	for key in repeating_substrings:
	value = repeating_substrings[key]
	replace_shit(key)

	return final_list


	def find_all_unused_unicode(text: str, number: int = 10000):
	# return [chr(i) for i in range(0, 10000) if chr(i) not in text]
	found = []
	for i in range(500, 10000):
	if chr(i) not in text:
	found.append(chr(i))
	if len(found) >= number:
	break

	return found


	def compress(text: str):
	if any(char in text for char in FORBIDDEN_CHARS):
	raise ValueError("Text contains forbidden characters")
	substrings = find_all_non_overlapping_repeating_substrings(text)
	unused_chars = find_all_unused_unicode(text, len(substrings))
	mapping = {}

	for i in range(len(substrings)):
	mapping[substrings[i]] = unused_chars[i]

	for key in mapping:
	text = text.replace(key, mapping[key])
	reference_string = ""
	for key in mapping:
	reference_string += f"{key}⳧{mapping[key]}⍼"

	if len(substrings) == 0:
	return text

	return reference_string + "⍼⍼⍼" + text


	def decompress(text: str):
	if "⍼⍼⍼" not in text:
	return text
	reference_string, text = text.split("⍼⍼⍼⍼")
	mapping = {}
	for pair in reference_string.split("⍼"):
	if pair != "":
	try:
	key, value = pair.split("⳧")
	mapping[value] = key
	except ValueError:
	pass

	for key in mapping:
	text = text.replace(key, mapping[key])

	return text


	def test(text):

	compress_time = time.time()
	compressed = compress(text)
	print(f"Compression took {time.time() - compress_time:.2f} seconds")

	decompress_time = time.time()
	decompressed = decompress(compressed)
	print(f"Decompression took {time.time() - decompress_time:.2f} seconds")

	print("Compression/decompression successful:", decompressed == text)
	if (decompressed != text):
	loss = (len(text) - len(decompressed)) / len(text) * 100
	print("Loss in quality: ", loss, "%")
	print(decompressed)

	print("Is compressed smaller than original:", len(compressed) < len(text))
	print(
	f"Compressed text is smaller by {(len(text) - len(compressed)) / len(text) * 100:.2f}%")


	test(text)