Skip to content

Instantly share code, notes, and snippets.

@sentenzo
Created July 13, 2024 11:45
Show Gist options
  • Save sentenzo/9cf2f48f781610e0d17238ca9f93578b to your computer and use it in GitHub Desktop.
Save sentenzo/9cf2f48f781610e0d17238ca9f93578b to your computer and use it in GitHub Desktop.
Скрипт для сбора статистики по RLE
import os
from collections import Counter, defaultdict
from app.packager import Packager
from app.transformations import BWT, HFC, MTF, RlePackBits
# Chunk size, in bytes, for each binary read of an input file (128 KiB).
BUFFER_SIZE = 128 * 1024

# Directory that the per-category sub-directories are joined onto.
ROOT_PATH = r".\local\data"

# Runs strictly longer than this many bytes are additionally recorded,
# together with the byte value that produced them.
PEC_TH = 3

# (category label, sub-directory under ROOT_PATH) -> file names to analyse.
# The identifier keeps its original spelling because other code refers to it.
CATHEGORIES = {
    ("cath_1", "dir_1"): [
        "file1.txt",
        "file2.txt",
        "file3.txt",
        "file4.txt",
        "file5.txt",
    ],
    ("cath_2", "dir_2"): [
        "file1.txt",
        "file2.txt",
        "file3.txt",
        "file4.txt",
        "file5.txt",
    ],
}
def byte_repeats_counter(
    path_to_file: str,
    buffer_size: "int | None" = None,
    threshold: "int | None" = None,
) -> tuple[Counter, dict]:
    """Count runs of identical consecutive bytes in a file.

    The file is read in binary mode, chunk by chunk; a run that straddles a
    chunk boundary is still counted as a single run.

    Args:
        path_to_file: Path of the file to scan.
        buffer_size: Number of bytes requested per read. Defaults to the
            module-level ``BUFFER_SIZE``.
        threshold: Runs strictly longer than this are also recorded in the
            second return value. Defaults to the module-level ``PEC_TH``.

    Returns:
        A pair ``(counter, peculiarities)``. ``counter`` maps a run length to
        the number of runs of that length. ``peculiarities`` maps every run
        length above ``threshold`` to the set of byte values (ints) that
        formed a run of that length. An empty file yields two empty mappings.

    Raises:
        ValueError: If ``buffer_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    if buffer_size is None:
        buffer_size = BUFFER_SIZE
    if threshold is None:
        threshold = PEC_TH
    if buffer_size <= 0:
        raise ValueError("buffer_size must be a positive number of bytes")

    counter: Counter = Counter()
    peculiarities: dict = defaultdict(set)
    run_byte = None  # byte value of the run in progress; None before any data
    run_length = 0  # zero means "no run has started yet"

    with open(path_to_file, "rb") as file:
        while block := file.read(buffer_size):
            for byte in block:
                if byte == run_byte:
                    run_length += 1
                    continue
                # A different byte closes the current run. Before the very
                # first byte there is no run to close, so nothing is recorded
                # (recording here would add a phantom run of length 1).
                if run_length:
                    counter[run_length] += 1
                    if run_length > threshold:
                        peculiarities[run_length].add(run_byte)
                run_byte = byte
                run_length = 1

    # Flush the last run; it is absent only when the file was empty.
    if run_length:
        counter[run_length] += 1
        if run_length > threshold:
            peculiarities[run_length].add(run_byte)
    return counter, peculiarities
def print_counter(counter: Counter, name: str, padding: int = 0) -> None:
    """Print a run-length table, ordered by how many bytes each length covers.

    The first printed line is ``name`` indented by ``padding`` spaces. Each
    following line, indented two spaces further, shows one run length, how
    many runs had that length, the bytes they cover (length * count) and that
    figure as a percentage of all bytes covered by ``counter``.

    Args:
        counter: Mapping of run length to number of runs of that length.
        name: Heading printed above the table.
        padding: Number of leading spaces for the heading.
    """
    rows = [
        (length, count, length * count) for length, count in counter.items()
    ]
    grand_total = sum(total for _, _, total in rows)
    # Largest share first. The sort is stable, so lengths covering the same
    # number of bytes keep the order in which the counter yields them.
    rows.sort(key=lambda row: row[2], reverse=True)

    print(" " * padding, name, sep="")
    indent = " " * (padding + 2)
    for length, count, total in rows:
        # A counter holding only zero counts has a zero grand total; report
        # 0 % instead of dividing by zero.
        percent = 100 * total / grand_total if grand_total else 0.0
        print(
            indent,
            f"rep: {length:>6}, cnt: {count:>12}, "
            f"total: {total:>12}, percent: {percent:>7.4f} %",
            sep="",
        )
def prep_data(file_path):
    """Encode ``file_path`` and return the path given for the encoded output.

    The output path is the input path with ``".enc"`` appended. The encoding
    pipeline is ``BWT() >> MTF()`` (presumably Burrows-Wheeler followed by
    move-to-front; confirm against ``app.transformations``).
    """
    encoded_path = file_path + ".enc"
    # To study MTF on its own, build the packager as Packager(MTF()) instead.
    pipeline = BWT() >> MTF()
    Packager(pipeline).apply_encoding(file_path, encoded_path)
    return encoded_path
def make_stats() -> None:
    """Print run-length statistics for every configured file and category.

    For each entry of ``CATHEGORIES`` with a non-empty file list, prints the
    category label, then for every file: encodes it with ``prep_data``,
    counts byte runs in the encoded file, prints that file's table and the
    long-run mapping returned by ``byte_repeats_counter``. A combined table
    for the whole category is printed last.
    """
    for (label, subdir), names in CATHEGORIES.items():
        if not names:
            # Nothing listed for this category: print no header at all.
            continue

        print(label)
        category_totals: Counter = Counter()
        for name in names:
            encoded_path = prep_data(os.path.join(ROOT_PATH, subdir, name))
            runs, long_runs = byte_repeats_counter(encoded_path)
            print_counter(runs, name, 4)
            category_totals += runs
            print(">> ", long_runs)

        print_counter(category_totals, f"ALL {label}", 2)
# Run the statistics pass only when executed as a script, not when imported.
if __name__ == "__main__":
    make_stats()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment