# capezotte/compression-benchmark.py

## compression-benchmark.py
#! /usr/bin/env python3
# Finds files in the "in" subfolder of the current working directory,
#
# loads them into memory,
# dumps them into a compressor program's stdin,
# loads the compressor's stdout into memory,
# then dumps that again into a decompressor's stdin.
#
# Logs ratio, compression and decompression speeds
# to stdout (per file, in MiB/s) and to a log.csv (per-method averages, in B/s) in the current working directory
# (if this sounds like a NIH version of lzbench, it likely is)

from os import stat, walk, path # File functions
def fsize(f):
	"""Return the size, in bytes, that os.stat() reports for f."""
	info = stat(f)
	return info.st_size

from mmap import mmap, PROT_READ # load into memory so disk speed doesn't affect comparison

# Open and time processes
from subprocess import Popen, PIPE, DEVNULL
from time import time

# Write CSV with averages
from csv import DictWriter

from time import perf_counter # monotonic clock; time() can jump if the system clock is adjusted mid-run

methods = [ 'gzip', 'compress', 'zstd', 'bzip2', 'pigz',
	# pixz, - Version in my repos seem to not like writing to stdout.
	'lz4', 'lzop', 'xz' ]

#methods = [ 'catz' ] debug program, merely #! /bin/sh shift; cat

# data[method][input file path] = {"ratio": ..., "comp_speed": ..., "decomp_speed": ...}
# Keyed by the full path so equally named files in different subfolders don't overwrite each other.
data = {m: {} for m in methods}
cols = ["ratio", "comp_speed", "decomp_speed"]

def _timed_pipe(argv, payload, capture):
	"""Run argv, write payload to its stdin and time the whole run.

	The clock covers spawning the process, feeding it and waiting for it to exit.
	Returns (stdout, returncode, seconds). stdout is the program's output as bytes
	when capture is true; otherwise the output is discarded and stdout is None.
	Raises OSError if the program cannot be started (e.g. it is not installed).
	"""
	start_time = perf_counter()
	proc = Popen(
		argv,
		stdin=PIPE,
		stdout=PIPE if capture else DEVNULL,
		stderr=DEVNULL,
	)
	out = proc.communicate(input=payload)[0]
	return out, proc.returncode, perf_counter() - start_time

for root, _dirs, files in walk('in'):
	for fname in files:
		curfile = path.join(root, fname)
		source_s = fsize(curfile)
		if source_s == 0:
			# mmap() rejects empty files, and a ratio or speed over 0 bytes means nothing.
			print("Skipping {f}: file is empty.".format(f=curfile))
			continue

		# Pull the whole file into memory once, before any clock starts, so that
		# no method pays for disk reads and every method gets the same input.
		with open(curfile, 'rb') as f, mmap(f.fileno(), 0, prot=PROT_READ) as mem_f:
			payload = mem_f.read()

		# Well, they are supposed to be compressors, if they make the file bigger,
		# sucks to be 'em
		for method in methods:
			# Compression
			try:
				comp_bytes, status, time_taken = _timed_pipe([ method, '-kc', '-' ], payload, True)
			except OSError as err:
				# Most likely the program is not installed; keep benchmarking the others.
				print("Skipping {m} on {f}: could not run it ({e}).".format(m=method, f=fname, e=err))
				continue
			if not comp_bytes:
				# No output means the compressor failed; the ratio would divide by zero.
				print("Skipping {m} on {f}: no compressed output (exit status {s}).".format(m=method, f=fname, s=status))
				continue
			dmf = {
				"ratio": source_s/len(comp_bytes),
				"comp_speed": source_s/time_taken, # input bytes consumed per second
			}

			# Decompression (output is discarded, only the time matters)
			_, _, time_taken = _timed_pipe([ method, '-dc', '-' ], comp_bytes, False)
			dmf["decomp_speed"] = source_s/time_taken # original bytes regenerated per second

			data[method][curfile] = dmf

			print("Ran {m} on {f}. Compressed with ratio {r:.2f}, consumes {c:.2f} MiB/s, decompression generates {d:.2f} MiB/s.".format(
				m=method, f=fname, r=dmf["ratio"], c=dmf["comp_speed"]/(1024**2), d=dmf["decomp_speed"]/(1024**2),
			))

# Per-method arithmetic mean of every column over all files that were benchmarked.
avgs = {}

for m, results in data.items():
	if not results:
		# Nothing was measured for this method (no input files, or it never ran successfully).
		print("No results for {m}; leaving it out of log.csv.".format(m=m))
		continue
	avg = {'method': m}
	for col in cols:
		avg[col] = sum(r[col] for r in results.values()) / len(results)
	avgs[m] = avg

# newline='' lets the csv module control line endings itself.
with open('log.csv', 'w', newline='') as f:
	writer = DictWriter(f, fieldnames=['method'] + cols)
	writer.writeheader()
	writer.writerows(avgs.values())
	# --- compression-benchmark.py: second, self-contained copy of the script ---
#! /usr/bin/env python3
# Finds files in the "in" subfolder of the current working directory,
#
# loads them into memory,
# dumps them into a compressor program's stdin,
# loads the compressor's stdout into memory,
# then dumps that again into a decompressor's stdin.
#
# Logs ratio, compression and decompression speeds
# to stdout (per file, in MiB/s) and to a log.csv (per-method averages, in B/s) in the current working directory
# (if this sounds like a NIH version of lzbench, it likely is)

from os import stat, walk, path # File functions
def fsize(f):
	"""Return the size, in bytes, that os.stat() reports for f."""
	return stat(f).st_size

from mmap import mmap, PROT_READ # load into memory so disk speed doesn't affect comparison

# Open and time processes
from subprocess import Popen, PIPE, DEVNULL
from time import time, perf_counter # perf_counter: monotonic clock, unaffected by system clock adjustments

# Write CSV with averages
from csv import DictWriter

methods = [ 'gzip', 'compress', 'zstd', 'bzip2', 'pigz',
	# pixz, - Version in my repos seem to not like writing to stdout.
	'lz4', 'lzop', 'xz' ]

#methods = [ 'catz' ] debug program, merely #! /bin/sh shift; cat

# data[method][input file path] = {"ratio": ..., "comp_speed": ..., "decomp_speed": ...}
# Keyed by the full path so equally named files in different subfolders don't overwrite each other.
data = {m: {} for m in methods}
cols = ["ratio", "comp_speed", "decomp_speed"]

def _timed_pipe(argv, payload, capture):
	"""Run argv, write payload to its stdin and time the whole run.

	The clock covers spawning the process, feeding it and waiting for it to exit.
	Returns (stdout, returncode, seconds). stdout is the program's output as bytes
	when capture is true; otherwise the output is discarded and stdout is None.
	Raises OSError if the program cannot be started (e.g. it is not installed).
	"""
	start_time = perf_counter()
	proc = Popen(
		argv,
		stdin=PIPE,
		stdout=PIPE if capture else DEVNULL,
		stderr=DEVNULL,
	)
	out = proc.communicate(input=payload)[0]
	return out, proc.returncode, perf_counter() - start_time

for root, _dirs, files in walk('in'):
	for fname in files:
		curfile = path.join(root, fname)
		source_s = fsize(curfile)
		if source_s == 0:
			# mmap() rejects empty files, and a ratio or speed over 0 bytes means nothing.
			print("Skipping {f}: file is empty.".format(f=curfile))
			continue

		# Pull the whole file into memory once, before any clock starts, so that
		# no method pays for disk reads and every method gets the same input.
		with open(curfile, 'rb') as f, mmap(f.fileno(), 0, prot=PROT_READ) as mem_f:
			payload = mem_f.read()

		# Well, they are supposed to be compressors, if they make the file bigger,
		# sucks to be 'em
		for method in methods:
			# Compression
			try:
				comp_bytes, status, time_taken = _timed_pipe([ method, '-kc', '-' ], payload, True)
			except OSError as err:
				# Most likely the program is not installed; keep benchmarking the others.
				print("Skipping {m} on {f}: could not run it ({e}).".format(m=method, f=fname, e=err))
				continue
			if not comp_bytes:
				# No output means the compressor failed; the ratio would divide by zero.
				print("Skipping {m} on {f}: no compressed output (exit status {s}).".format(m=method, f=fname, s=status))
				continue
			dmf = {
				"ratio": source_s/len(comp_bytes),
				"comp_speed": source_s/time_taken, # input bytes consumed per second
			}

			# Decompression (output is discarded, only the time matters)
			_, _, time_taken = _timed_pipe([ method, '-dc', '-' ], comp_bytes, False)
			dmf["decomp_speed"] = source_s/time_taken # original bytes regenerated per second

			data[method][curfile] = dmf

			# Speeds are stored in B/s; dividing by 1024**2 prints them in MiB/s.
			print("Ran {m} on {f}. Compressed with ratio {r:.2f}, consumes {c:.2f} MiB/s, decompression generates {d:.2f} MiB/s.".format(
				m=method, f=fname, r=dmf["ratio"], c=dmf["comp_speed"]/(1024**2), d=dmf["decomp_speed"]/(1024**2),
			))

# Per-method arithmetic mean of every column over all files that were benchmarked.
avgs = {}

for m, results in data.items():
	if not results:
		# Nothing was measured for this method (no input files, or it never ran successfully).
		print("No results for {m}; leaving it out of log.csv.".format(m=m))
		continue
	avg = {'method': m}
	for col in cols:
		avg[col] = sum(r[col] for r in results.values()) / len(results)
	avgs[m] = avg

# newline='' lets the csv module control line endings itself.
with open('log.csv', 'w', newline='') as f:
	writer = DictWriter(f, fieldnames=['method'] + cols)
	writer.writeheader()
	writer.writerows(avgs.values())