zlib-testing

Sorry - I really thought this would be a throw-away for the one blog post.

Test Procedure

Time is measured in nanoseconds using:

#include <chrono>   // includes added so the snippet stands alone
#include <cstdint>

uint64_t nano() {
    return std::chrono::duration_cast<std::chrono::nanoseconds>(
               std::chrono::steady_clock::now().time_since_epoch())
        .count();
}

Huge global buffers are used:

char input_buffer[64 * 1024 * 1024];
char output_buffer[64 * 1024 * 1024];

The output size passed to the decompressors is that huge-buffer size, whereas the input size is the actual file size, so that end-of-input (slow-path) behaviour is included in the timing. Some benchmark code for some compressors originally got this mixed up, but slow-path behaviour only makes a measurable difference on smaller files; retiming with this corrected did not change the results.

zlib-like decompressors are called via the int uncompress(Bytef *dest, uLongf *destLen, const Bytef *source, uLong sourceLen); API.
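
A minimal sketch of such a call, using the global buffers above (compressed_size being the file size read from disk; the helper name is made up for illustration):

#include <zlib.h>

// destLen is in/out: the capacity of dest on entry, the decompressed size on return.
int zlib_uncompress_sketch(size_t compressed_size) {
	uLongf destLen = sizeof output_buffer;  // huge-buffer size as the output limit
	int status = uncompress((Bytef *)output_buffer, &destLen,
	                        (const Bytef *)input_buffer, compressed_size);
	return status == Z_OK ? 0 : 1;  // Z_OK (0) on success
}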

isa-l was called via:

#include <stdio.h>
#include <igzip_lib.h>  // isa-l's inflate API

int isal_uncompress(void *out, size_t *size, void *in, size_t insize) {
	struct inflate_state state;
	isal_inflate_init(&state);
	state.next_out = (uint8_t *)out;
	state.avail_out = *size;
	state.next_in = (uint8_t *)in;
	state.avail_in = insize;
	state.crc_flag = ISAL_ZLIB;  // parse the zlib wrapper and verify the Adler-32
	int status = isal_inflate_stateless(&state);
	if (status != ISAL_DECOMP_OK) {
		printf("isal_inflate_stateless failed %d\n", status);
		return 1;
	}
	*size = state.total_out;
	return 0;
}
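
isal_inflate_stateless decompresses the whole input in a single call, matching the one-shot uncompress interface above.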

The minimum of some number of runs is taken, with the run count based on the reciprocal of the file size (the exact rule from the driver script is sketched after the table):

File Runs
xml 100
reymont 53
ooffice 32
nci 31
mr 27
osdb 27
dickens 25
sao 18
samba 18
x-ray 16
webster 8
mozilla 5
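
Rewriting the driver's rule in C++ for illustration: aim for roughly 100 (decimal) megabytes of input processed per file, clamped to between 1 and 100 runs (the helper name is made up):

#include <algorithm>
#include <cmath>

// Run count = round(100 MB / file size), clamped to [1, 100].
int runs_for_size(double file_size_bytes) {
	int runs = (int)std::lround(1000.0 * 1000 * 100 / file_size_bytes);
	return std::min(100, std::max(1, runs));
}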

Taking the minimum gives consistent numbers on Firestorm, but for Icestorm testing it seems the process can sometimes still be scheduled onto Firestorm cores, so the median (well, sorted_times[num_runs/2]) is used instead.
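
In terms of the harness below, the selection looks like this (a sketch; the function name is made up):

#include <algorithm>

// Pick the reported time from one file's runs.
double pick_time(double *times, int runs, bool icestorm) {
	std::sort(&times[0], &times[runs]);
	return icestorm ? times[runs / 2]  // median: robust to stray Firestorm scheduling
	                : times[0];        // minimum: consistent on Firestorm
}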

Megabytes are decimal, not mebibytes.

Speed is measured in compressed/input megabytes per second, not uncompressed/output megabytes per second. Only decompression speed (not compression speed) is measured.
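
Concretely, that is the figure the harness prints (a sketch; time_ns stands for the selected minimum or median run time):

// bytes / (ns / 1000) = bytes per microsecond = decimal MB/s of compressed input.
double speed_mb_s(double compressed_bytes, int iterations, double time_ns) {
	return compressed_bytes * iterations / (time_ns / 1000.0);
}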

Reported overall differences use the geometric mean of the speedup for each individual file in the corpus.
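
For illustration, one way to compute that (hypothetical helper, not from the gist):

#include <cmath>
#include <vector>

// Geometric mean: exp of the arithmetic mean of the logs.
double geomean(const std::vector<double> &speedups) {
	double log_sum = 0.0;
	for (double s : speedups)
		log_sum += std::log(s);
	return std::exp(log_sum / (double)speedups.size());
}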

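The driver script (compiles harness.cc once per target and prints a tab-separated speed table):
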
import subprocess
import os
import glob

CC = 'clang++'

TARGETS = [
    #{
    #    'name': 'libdeflate-old',
    #    'library': 'libdeflate-old/libdeflate.a',
    #    'cflags': ['-DUSE_LIBDEFLATE'],
    #},
    {
        'name': 'libdeflate',
        'library': 'libdeflate/libdeflate.a',
        'cflags': ['-DUSE_LIBDEFLATE'],
    },
]

TESTS = glob.glob('corpus/silesia.*.zlib6')
TESTS.sort(key=os.path.getsize)

for target in TARGETS:
    subprocess.run([CC] + target.get('cflags', []) +
                   ['harness.cc', target['library'], '-o', 'harness-' + target['name']],
                   check=True)

print('\t'.join(['test'] + [t['name'] for t in TARGETS]))
for test in TESTS:
    last = None
    runs = min(100, max(1, round((1000 * 1000 * 100) / os.path.getsize(test))))
    results = []
    for target in TARGETS:
        r = float(subprocess.check_output(['./harness-' + target['name'], test, '1', str(runs)]))
        results.append('%.1f' % (r,))
        if last is not None:
            results[-1] += (' (%.3fx)' % (r / last,))
        last = r
    print('\t'.join([test.split('/')[-1]] + results))
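
And the harness itself (harness.cc):
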
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <chrono>
#include <algorithm>

#ifdef USE_LIBDEFLATE
#include "libdeflate/libdeflate.h"
#else
#include <zlib.h>
#endif

char input_buffer[64 * 1024 * 1024];
char output_buffer[64 * 1024 * 1024];

uint64_t nano() {
	return std::chrono::duration_cast<std::chrono::nanoseconds>(
	           std::chrono::steady_clock::now().time_since_epoch())
	    .count();
}

#ifdef USE_LIBDEFLATE
struct libdeflate_decompressor *decompressor;

// Shim so libdeflate is called through the same signature as zlib's uncompress.
int uncompress(void *out, size_t *size, void *in, size_t insize) {
	size_t outbytes = 0;
	if (libdeflate_zlib_decompress(decompressor, in, insize, out, *size,
	                               &outbytes) != LIBDEFLATE_SUCCESS) {
		abort();
	}
	*size = outbytes;
	return 0;
}
#endif

int main(int argc, char **argv) {
#ifdef USE_LIBDEFLATE
	decompressor = libdeflate_alloc_decompressor();
#endif
	FILE *f = fopen(argv[1], "rb");
	int iterations_per_run = atoi(argv[2]);
	int runs = atoi(argv[3]);
	int compressed_size = fread(input_buffer, 1, sizeof input_buffer, f);

	// Warm-up / sanity-check run (not timed).
	size_t outlen = sizeof output_buffer;
	int status = uncompress((uint8_t *)output_buffer, &outlen,
	                        (uint8_t *)input_buffer, compressed_size);
	if (status != 0)
		return 1;

	double times[100];  // the driver caps runs at 100
	for (int j = 0; j < runs; j++) {
		uint64_t start = nano();
		for (int i = 0; i < iterations_per_run; i++) {
			outlen = sizeof output_buffer;
			uncompress((uint8_t *)output_buffer, &outlen,
			           (uint8_t *)input_buffer, compressed_size);
		}
		uint64_t end = nano();
		times[j] = (double)(end - start);
	}
	std::sort(&times[0], &times[runs]);
	// times[0] is the minimum (used on Firestorm); for Icestorm the median
	// times[runs / 2] was used instead.
	// bytes / (ns / 1000) = bytes per microsecond = decimal MB/s.
	printf("%lf\n", compressed_size * iterations_per_run / (times[0] / 1000.0));
	return 0;
}
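
Usage matches the driver's invocation: ./harness-<name> <file> <iterations-per-run> <runs>, printing a single decimal-MB/s figure on stdout.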