Last active
March 22, 2021 02:16
-
-
Save capezotte/9a60c8cd1cd8267606fda62448c23a6c to your computer and use it in GitHub Desktop.
Testing multiple compression algorithms
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3 | |
# Finds files in the "in" subfolder of the current working directory,
#
# loads them into memory,
# dumps them into a compressor program's stdin,
# loads the compressor's stdout into memory,
# then dumps that again into a decompressor's stdin.
#
# Logs ratio, compression and decompression speeds
# to stdout (per file, in MiB/s) and to a log.csv (averages, in B/s) in the current working directory
# (if this sounds like a NIH version of lzbench, it likely is)
from os import stat, walk, path # File functions | |
def fsize(f):
    """Return the size in bytes that ``os.stat`` reports for *f*."""
    info = stat(f)
    return info.st_size
from mmap import mmap, PROT_READ # load into memory so disk speed doesn't affect comparison | |
# Open and time processes
from subprocess import Popen, PIPE, DEVNULL | |
from time import time | |
# Write CSV with averages | |
from csv import DictWriter | |
# Compressor executables to benchmark. Each one is invoked below with
# gzip-style arguments: "-kc -" to compress and "-dc -" to decompress.
# pixz is left out: the version in my repos seems to not like writing to stdout.
# For debugging, swap in [ 'catz' ] - a program that is merely
# "#! /bin/sh shift; cat".
methods = [
    'gzip',
    'compress',
    'zstd',
    'bzip2',
    'pigz',
    'lz4',
    'lzop',
    'xz',
]
# Metrics recorded for every file and later averaged per method.
cols = ["ratio", "comp_speed", "decomp_speed"]
# Results table: data[method][file] -> {metric: value}.
# Each method gets its own (initially empty) dict.
data = {name: {} for name in methods}
# Benchmark: run every compressor in `methods` over every file below ./in
# and record ratio / compression speed / decompression speed in `data`.

# perf_counter is monotonic and high-resolution; time() follows the wall
# clock and can jump (NTP, manual adjustment) in the middle of a measurement.
from time import perf_counter


def _timed_pipe(argv, payload, stdout):
    """Run *argv*, feed *payload* to its stdin and time the whole exchange.

    The clock starts before the process is spawned and stops once
    ``communicate`` returns, i.e. after the child has exited.

    Args:
        argv: Command line as a list (program name first).
        payload: Bytes written to the child's stdin.
        stdout: ``PIPE`` to capture the child's output, ``DEVNULL`` to drop it.

    Returns:
        ``(output, seconds)`` where *output* is the captured stdout bytes
        (``None`` when stdout was not a pipe) and *seconds* the elapsed time.

    Raises:
        FileNotFoundError: If the program in ``argv[0]`` cannot be found.
    """
    started = perf_counter()
    proc = Popen(argv, stdin=PIPE, stdout=stdout, stderr=DEVNULL)
    output = proc.communicate(input=payload)[0]
    return output, perf_counter() - started


for root, dirs, files in walk('in'):
    for fname in files:
        curfile = path.join(root, fname)
        # Pull the whole file into memory *before* any timer starts, so disk
        # speed is never charged to whichever compressor happens to run first.
        with open(curfile, 'rb') as f:
            payload = f.read()
        source_s = len(payload)
        if source_s == 0:
            # Nothing to compress: the ratio would be meaningless and the
            # speeds would be 0 B/s, dragging every average down.
            print("Skipping {f}: empty file.".format(f=curfile))
            continue
        # Well, they are supposed to be compressors, if they make the file bigger,
        # sucks to be 'em
        for method in methods:
            # Compression
            try:
                comp_bytes, comp_time = _timed_pipe([method, '-kc', '-'], payload, PIPE)
            except FileNotFoundError:
                print("Skipping {m} on {f}: program not found.".format(m=method, f=fname))
                continue
            if not comp_bytes:
                # stderr is discarded, so an empty stdout is the only visible
                # sign that the compressor failed; a ratio cannot be computed.
                print("Skipping {m} on {f}: compressor produced no output.".format(m=method, f=fname))
                continue
            # Decompression (output is thrown away, only the time matters)
            _, decomp_time = _timed_pipe([method, '-dc', '-'], comp_bytes, DEVNULL)
            dmf = {
                "ratio": source_s / len(comp_bytes),
                "comp_speed": source_s / comp_time,
                "decomp_speed": source_s / decomp_time,
            }
            # Keyed by the full relative path: the bare file name would let
            # same-named files in different subfolders overwrite each other.
            data[method][curfile] = dmf
            print("Ran {m} on {f}. Compressed with ratio {r:.2f}, consumes {c:.2f} MiB/s, decompression generates {d:.2f} MiB/s.".format(
                m=method, f=fname, r=dmf["ratio"], c=dmf["comp_speed"]/(1024**2), d=dmf["decomp_speed"]/(1024**2),
            ))
# Average every metric per method over all benchmarked files and write the
# resulting table (one row per method) to log.csv in the working directory.
avgs = {}
for m, per_file in data.items():
    if not per_file:
        # No file was measured for this method (e.g. ./in is empty);
        # averaging over zero files would divide by zero, so emit no row.
        continue
    avg = {'method': m}
    for col in cols:
        avg[col] = sum(result[col] for result in per_file.values()) / len(per_file)
    avgs[m] = avg
# newline='' lets the csv module control line endings itself; without it
# an extra blank line appears after each row on platforms that translate \n.
with open('log.csv', 'w', newline='') as f:
    csv = DictWriter(f, fieldnames=['method'] + cols)
    csv.writeheader()
    csv.writerows(avgs.values())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment