Skip to content

Instantly share code, notes, and snippets.

@capezotte
Last active March 22, 2021 02:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save capezotte/9a60c8cd1cd8267606fda62448c23a6c to your computer and use it in GitHub Desktop.
Save capezotte/9a60c8cd1cd8267606fda62448c23a6c to your computer and use it in GitHub Desktop.
Testing multiple compression algorithms
#! /usr/bin/env python3
# Finds files in the "in" subfolder of the current working directory,
#
# loads them into memory,
# pipes them into a compressor program's stdin,
# loads the compressor's stdout into memory,
# then pipes that output into a decompressor's stdin.
#
# Logs ratio, compression speed and decompression speed
# to stdout (per file, in MiB/s) and to log.csv (per-method averages, in B/s) in the current working directory
# (if this sounds like a NIH version of lzbench, it likely is)
from os import stat, walk, path # File functions
def fsize(f):
    """Return the size in bytes of the file at path *f*, as reported by stat()."""
    file_info = stat(f)
    return file_info.st_size
from mmap import mmap, PROT_READ # load into memory so disk speed doesn't affect comparison
# Open and time processes
from subprocess import Popen, PIPE, DEVNULL
from time import time
# Write CSV with averages
from csv import DictWriter
# Compressor executables to benchmark. Each one is invoked as
# "<tool> -kc -" to compress and "<tool> -dc -" to decompress, with the data
# fed through stdin.
methods = [
    'gzip', 'compress', 'zstd', 'bzip2', 'pigz',
    # pixz, - Version in my repos seem to not like writing to stdout.
    'lz4', 'lzop', 'xz',
]
# methods = ['catz']  # debug program, merely: #! /bin/sh  shift; cat

# Metrics recorded for every (method, file) pair and averaged per method.
cols = ["ratio", "comp_speed", "decomp_speed"]


def _run_timed(argv, payload, capture_output):
    """Feed *payload* to the process *argv* and time the whole round trip.

    The clock starts before the process is spawned and stops once
    communicate() returns, so process start-up is part of the measurement.

    Returns a ``(stdout, seconds)`` tuple, where ``stdout`` is the captured
    bytes (or None when *capture_output* is false). Returns None when the
    executable cannot be started or exits with a non-zero status.
    """
    started = time()
    try:
        proc = Popen(
            argv,
            stdin=PIPE,
            stdout=PIPE if capture_output else DEVNULL,
            stderr=DEVNULL,
        )
    except OSError:
        # Tool not installed / not executable: report failure, don't abort.
        return None
    output = proc.communicate(input=payload)[0]
    elapsed = time() - started
    if proc.returncode != 0:
        return None
    return output, elapsed


def benchmark_method(method, payload):
    """Compress and then decompress *payload* (bytes) with *method*.

    Returns a dict with one entry per name in ``cols`` (ratio is
    original/compressed size, speeds are original bytes per second), or None
    when the tool is missing, fails, or produces no output.
    """
    compressed = _run_timed([method, '-kc', '-'], payload, True)
    if compressed is None or not compressed[0]:
        return None
    comp_bytes, comp_seconds = compressed

    decompressed = _run_timed([method, '-dc', '-'], comp_bytes, False)
    if decompressed is None:
        return None
    decomp_seconds = decompressed[1]

    source_s = len(payload)
    # Well, they are supposed to be compressors, if they make the file bigger,
    # sucks to be 'em
    return {
        "ratio": source_s / len(comp_bytes),
        "comp_speed": source_s / comp_seconds,
        "decomp_speed": source_s / decomp_seconds,
    }


def benchmark_tree(in_dir='in', tools=None):
    """Benchmark every tool on every file found under *in_dir*.

    Each file is read fully into memory once, *before* any timing starts, so
    disk speed does not leak into the first tool's measurement. Empty files
    and failed runs are reported on stdout and skipped.

    Returns ``{tool: {file_path: {col: value}}}``. Results are keyed by the
    joined path so same-named files in different subfolders do not overwrite
    each other.
    """
    tools = methods if tools is None else tools
    results = {tool: {} for tool in tools}
    for root, _dirs, files in walk(in_dir):
        for fname in files:
            curfile = path.join(root, fname)
            with open(curfile, 'rb') as f:
                payload = f.read()
            if not payload:
                print("Skipping {f}: empty file.".format(f=curfile))
                continue
            for method in tools:
                stats = benchmark_method(method, payload)
                if stats is None:
                    print("Skipping {m} on {f}: compressor missing, failed, or produced no output.".format(
                        m=method, f=fname,
                    ))
                    continue
                results[method][curfile] = stats
                print("Ran {m} on {f}. Compressed with ratio {r:.2f}, consumes {c:.2f} MiB/s, decompression generates {d:.2f} MiB/s.".format(
                    m=method, f=fname, r=stats["ratio"], c=stats["comp_speed"]/(1024**2), d=stats["decomp_speed"]/(1024**2),
                ))
    return results


def average_results(results, columns):
    """Average each column over all files, per method.

    Returns ``{method: {'method': method, col: mean, ...}}``. Methods with no
    successful run are left out, since there is nothing to average.
    """
    averages = {}
    for method, per_file in results.items():
        if not per_file:
            continue
        row = {'method': method}
        for col in columns:
            row[col] = sum(stats[col] for stats in per_file.values()) / len(per_file)
        averages[method] = row
    return averages


def write_log(averages, log_path='log.csv', columns=None):
    """Write the per-method averages to *log_path* as CSV with a header row."""
    columns = cols if columns is None else columns
    # newline='' lets the csv module control line endings itself.
    with open(log_path, 'w', newline='') as f:
        writer = DictWriter(f, fieldnames=['method'] + list(columns))
        writer.writeheader()
        writer.writerows(averages.values())


if __name__ == "__main__":
    data = benchmark_tree('in', methods)
    avgs = average_results(data, cols)
    write_log(avgs, 'log.csv')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment