Skip to content

Instantly share code, notes, and snippets.

@Dhravya
Created February 6, 2023 20:45
Show Gist options
  • Save Dhravya/a8406fee264c7f9ce2944a7bea3e878e to your computer and use it in GitHub Desktop.
Save Dhravya/a8406fee264c7f9ce2944a7bea3e878e to your computer and use it in GitHub Desktop.
from rich import print
import time
FORBIDDEN_CHARS = ["⍼", "⳧"]
text= "really really really long text that has a lot of repeating stuff so that it can be coompressed"
def find_all_non_overlapping_repeating_substrings(text: str):
text = (text + ' ') if text[-1] != ' ' else text
repeating_substrings = {}
f = text
for i in range(len(text)):
for j in range(i + 2, min(i + 15, len(text))):
substring = text[i:j]
if substring in text[j:]:
if substring in repeating_substrings:
repeating_substrings[substring] += 1
else:
repeating_substrings[substring] = 2
for substring in list(repeating_substrings.keys()):
if repeating_substrings[substring] < 3:
repeating_substrings.pop(substring)
for substring in list(repeating_substrings.keys()):
for substring2 in list(repeating_substrings.keys()):
if substring != substring2 and substring in substring2:
repeating_substrings.pop(substring)
break
n_replace_ran = 0
final_list = []
def replace_shit(txt):
nonlocal n_replace_ran
nonlocal f
n_replace_ran += 1
if f.count(txt) == value:
f = f.replace(txt, "")
final_list.append(txt)
n_replace_ran = 0
else:
if 30 > n_replace_ran >= 15:
replace_shit(txt[:-1])
elif n_replace_ran >= 30:
return
else:
replace_shit(txt[1:])
for key in repeating_substrings:
value = repeating_substrings[key]
replace_shit(key)
return final_list
def find_all_unused_unicode(text: str, number: int = 10000):
# return [chr(i) for i in range(0, 10000) if chr(i) not in text]
found = []
for i in range(500, 10000):
if chr(i) not in text:
found.append(chr(i))
if len(found) >= number:
break
return found
def compress(text: str):
if any(char in text for char in FORBIDDEN_CHARS):
raise ValueError("Text contains forbidden characters")
substrings = find_all_non_overlapping_repeating_substrings(text)
unused_chars = find_all_unused_unicode(text, len(substrings))
mapping = {}
for i in range(len(substrings)):
mapping[substrings[i]] = unused_chars[i]
for key in mapping:
text = text.replace(key, mapping[key])
reference_string = ""
for key in mapping:
reference_string += f"{key}⳧{mapping[key]}⍼"
if len(substrings) == 0:
return text
return reference_string + "⍼⍼⍼" + text
def decompress(text: str):
if "⍼⍼⍼" not in text:
return text
reference_string, text = text.split("⍼⍼⍼⍼")
mapping = {}
for pair in reference_string.split("⍼"):
if pair != "":
try:
key, value = pair.split("⳧")
mapping[value] = key
except ValueError:
pass
for key in mapping:
text = text.replace(key, mapping[key])
return text
def test(text):
compress_time = time.time()
compressed = compress(text)
print(f"Compression took {time.time() - compress_time:.2f} seconds")
decompress_time = time.time()
decompressed = decompress(compressed)
print(f"Decompression took {time.time() - decompress_time:.2f} seconds")
print("Compression/decompression successful:", decompressed == text)
if (decompressed != text):
loss = (len(text) - len(decompressed)) / len(text) * 100
print("Loss in quality: ", loss, "%")
print(decompressed)
print("Is compressed smaller than original:", len(compressed) < len(text))
print(
f"Compressed text is smaller by {(len(text) - len(compressed)) / len(text) * 100:.2f}%")
test(text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment