Skip to content

Instantly share code, notes, and snippets.

@GaelVaroquaux
Created August 6, 2012 13:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save GaelVaroquaux/3274477 to your computer and use it in GitHub Desktop.
Save GaelVaroquaux/3274477 to your computer and use it in GitHub Desktop.
Benchmarking neuroimaging I/O
"""
Benching I/O with joblib and other libraries. Comment and
un-comment what you are interested in.
Warning: this is slow, and the benchs are easily offset by other disk
activity.
"""
import os
import time
import shutil
import numpy as np
import joblib
from joblib.disk import disk_used
def clear_out():
    """Reset the scratch directory: delete 'out' if present, recreate it empty."""
    out_dir = 'out'
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.mkdir(out_dir)
def kill_disk_cache():
    """Drop the OS disk cache so each run measures real disk access.

    On Linux this writes '3' to /proc/sys/vm/drop_caches, which requires
    root; if permission is denied (errno 13) a message is printed instead.
    On other systems it falls back to writing ~160 MB of random data to a
    file named 'tmp' to displace the cache.
    """
    if os.name == 'posix' and os.uname()[0] == 'Linux':
        try:
            # 'with' guarantees the handle is closed (original leaked it).
            with open('/proc/sys/vm/drop_caches', 'w') as f:
                f.write('3\n')
        except IOError as e:  # 'as e' works on Python 2.6+ and 3
            if e.errno == 13:
                print('Please run me as root')
            else:
                # Bare raise preserves the original traceback.
                raise
    else:
        # Write ~160 MB to the disk to displace the cache.
        # int() fixes the float argument (2e7) that newer numpy rejects;
        # 'wb' + tobytes() writes the same raw buffer portably.
        with open('tmp', 'wb') as f:
            f.write(np.random.random(int(2e7)).tobytes())
def timeit(func, *args, **kwargs):
    """Time ``func(*args, **kwargs)`` over 7 runs and return a trimmed mean.

    The disk cache is dropped before each run. The fastest and slowest
    runs are discarded and the mean of the remaining 5 is returned.
    """
    measurements = []
    for _ in range(7):
        kill_disk_cache()
        start = time.time()
        result = func(*args, **kwargs)
        if 1:
            # Just time the function call itself.
            measurements.append(time.time() - start)
        else:
            # Hash the output to estimate the time needed to access the
            # elements: a better estimate of load time with memmapping.
            # Hash twice and subtract the second hash's duration so only
            # the first (page-faulting) access is counted.
            joblib.hash(result)
            t_hashed = time.time()
            joblib.hash(result)
            t_end = time.time()
            measurements.append(t_end - start - 2 * (t_end - t_hashed))
    measurements.sort()
    # Trimmed mean: drop the extremes of the 7 sorted runs.
    return np.mean(measurements[1:-1])
def print_line(dataset, library, strategy, write_time, read_time, disk_used):
    """Print one aligned CSV-style row of benchmark results.

    Columns: dataset, library, strategy, write time (s), read time (s),
    disk used (KB). Note: the ``disk_used`` parameter shadows the
    module-level ``disk_used`` helper inside this function.
    """
    # print(...) form runs identically under Python 2 and Python 3
    # (the original py2-only print statement fails to parse on py3).
    print('% 15s, %10s, %12s, % 6.3f, % 7.4f, % 5.1f' % (
        dataset, library, strategy, write_time, read_time, disk_used))
def bench_dump(dataset, name='', compress_levels=(0, 1, 3, 6, 9)):
    """Benchmark joblib.dump/load on ``dataset``.

    One row is printed per zlib compression level in ``compress_levels``,
    plus one row for uncompressed dump read back with memmapping.
    (The original initialized time_write/time_read/du to empty lists that
    were immediately overwritten by scalars -- dead code, removed.)
    """
    for compress in compress_levels:
        clear_out()
        time_write = timeit(joblib.dump, dataset, 'out/test.pkl',
                            compress=compress)
        du = disk_used('out') / 1024.
        time_read = timeit(joblib.load, 'out/test.pkl')
        print_line(name, 'joblib', 'zlib%i' % compress, time_write,
                   time_read, du)
    # Uncompressed dump, loaded back memmapped.
    clear_out()
    time_write = timeit(joblib.dump, dataset, 'out/test.pkl')
    time_read = timeit(joblib.load, 'out/test.pkl', mmap_mode='r')
    du = disk_used('out') / 1024.
    print_line(name, 'joblib', 'mmap', time_write, time_read, du)
# Print the header row for the benchmark table (same column layout as
# print_line).  print(...) form works under Python 2 and Python 3.
print('% 15s, %10s, %12s, % 6s, % 6s, % 6s' % (
    'Dataset', 'library', 'strategy', 'write', 'read', 'disk'))
# Neuroimaging specific I/O
import nibabel
import tables
#tables.setBloscMaxThreads(1)
def load_nii(filename, sum=True):
    """Load a NIfTI image and return ``(data_or_sum, affine)``.

    With ``sum=True`` (the default) the data array is reduced to its sum,
    which forces every element to actually be read from disk.
    """
    img = nibabel.load(filename)
    data = img.get_data()
    affine = img.get_affine()
    if sum:
        return data.sum(), affine
    return data, affine
def write_nii(d, filename):
    """Save ``d``, a ``(data, affine)`` pair, to ``filename`` as NIfTI."""
    data, affine = d
    nibabel.save(nibabel.Nifti1Image(data, affine), filename)
def write_hdf(arrays):
    """Write each array in ``arrays`` to out/test.h5, uncompressed.

    Arrays are stored as nodes named 'array0', 'array1', ...
    """
    h5file = tables.openFile("out/test.h5",
                             mode="w", title="Test file")
    # try/finally ensures the file is closed even if a write fails
    # (the original leaked the handle on error).
    try:
        for index, array in enumerate(arrays):
            h5file.createArray(h5file.root, 'array%i' % index,
                               array)
    finally:
        h5file.close()
def write_chdf(arrays, compress=0, complib='zlib'):
    """Write ``arrays`` to out/test.h5 with pytables compression.

    ``compress`` is the compression level (0 delegates to write_hdf,
    i.e. no compression); ``complib`` names the compression library.
    """
    if compress == 0:
        return write_hdf(arrays)
    h5file = tables.openFile("out/test.h5",
                             mode="w", title="Test file")
    # try/finally ensures the file is closed even if a write fails
    # (the original leaked the handle on error).
    try:
        filters = tables.Filters(complevel=compress,
                                 complib=complib)
        for index, array in enumerate(arrays):
            atom = tables.Atom.from_dtype(array.dtype)
            ca = h5file.createCArray(h5file.root, 'array%i' % index,
                                     atom, array.shape, filters=filters)
            ca[...] = array
    finally:
        h5file.close()
def load_hdf(sum=True):
    """Read every node of out/test.h5; return a list of sums (or arrays).

    Bug fix: the ``sum`` parameter was accepted but ignored -- the data
    was always reduced.  It is now honored; the default (True) preserves
    the original behavior, matching load_nii's interface.
    """
    h5file = tables.openFile("out/test.h5", "r")
    try:
        out = list()
        for node in h5file.iterNodes(h5file.root):
            data = node.read()
            out.append(data.sum() if sum else data)
    finally:
        # Close even if a read fails.
        h5file.close()
    return out
def bench_hdf(d, name):
    """Benchmark pytables write/read of ``d`` over selected settings.

    Only a useful subset of (complib, level) combinations is timed;
    one result row is printed per combination.
    """
    clear_out()
    for complib in ("zlib", "lzo"):
        for compress in (0, 1, 3, 6, 9):
            # Skip redundant or useless combinations:
            if compress == 0 and complib != 'zlib':
                # level 0 is the same whatever the library
                continue
            if compress == 9 and complib == 'zlib':
                # way too slow to be useful
                continue
            if complib == 'lzo' and compress != 1:
                continue
            clear_out()
            write_time = timeit(write_chdf, d, complib=complib,
                                compress=compress)
            size_kb = disk_used('out') / 1024.
            read_time = timeit(load_hdf)
            print_line(name, 'pytables', '%s %i' % (complib, compress),
                       write_time, read_time, size_kb)
# Main benchmark driver: for each dataset, in both the layout nibabel
# loads and C-contiguous order, time nibabel (.nii.gz and .nii),
# pytables, numpy's .npy, and joblib.
# NOTE(review): the paths below are machine-specific (FSL atlas files,
# nipy test data); the script only runs where those files exist.
# NOTE(review): indentation was lost in this capture and has been
# reconstructed -- verify the loop nesting against the original gist.
for c_order in (True, False):
    for name, nifti_file in (
            ('MNI', '/usr/share/fsl/data/atlases/MNI/MNI-prob-1mm.nii.gz'),
            ('Juelich',
             '/usr/share/fsl/data/atlases/Juelich/Juelich-prob-2mm.nii.gz'),
            ('localiser',
             '~/.nipy/data/s12069_swaloc1_corr.nii.gz'),
            ):
        # Tag the dataset label with the memory layout being benched.
        name = '% 5s(%s)' % (name, 'C' if c_order else 'F')
        nifti_file = os.path.expanduser(nifti_file)
        d = load_nii(nifti_file, sum=False)
        if c_order:
            # Copy the data array to C order; the affine is kept as-is.
            d = (np.ascontiguousarray(d[0]), d[1])

        # nibabel, compressed (.nii.gz): read from the original file,
        # write to the scratch directory.
        compress_load_time = timeit(load_nii, nifti_file)
        clear_out()
        compress_save_time = timeit(write_nii, d, 'out/test.nii.gz')
        compress_nii_du = disk_used('out')/1024.
        strategy = '.nii.gz'
        library = 'Nifti'
        print_line(name, library, strategy,
                   compress_save_time, compress_load_time,
                   compress_nii_du)

        # nibabel, uncompressed (.nii).
        clear_out()
        save_time = timeit(write_nii, d, 'out/test.nii')
        nii_du = disk_used('out')/1024.
        load_time = timeit(load_nii, 'out/test.nii')
        strategy = ' .nii'
        print_line(name, library, strategy,
                   save_time, load_time, nii_du)

        # pytables (all selected complib/level combinations).
        clear_out()
        bench_hdf(d, name=name)

        # Bench numpy's savez -- only the data array d[0] is saved,
        # the affine is dropped.
        clear_out()
        np_save_time = timeit(np.save, 'out/test.npy',
                              d[0])
        np_du = disk_used('out')/1024.

        def load_np(filename, sum=True):
            # Load a .npy file; summing forces every element to be read.
            data = np.load(filename)
            if sum:
                return data.sum()
            return data

        np_load_time = timeit(load_np, 'out/test.npy')
        library = 'numpy'
        strategy = ''
        print_line(name, library, strategy,
                   np_save_time, np_load_time, np_du)

        # joblib (levels 0, 1, 6 plus the memmap row).
        clear_out()
        bench_dump(d, name, compress_levels=(0, 1, 6))
""" Joblib comparitive results.
"""
import pylab as pl
import numpy as np
def autolabel(rects, y_max):
    """Write the height of bars taller than ``y_max`` inside the plot.

    Labels are drawn rotated at the top of the axes, in black on light
    bars and white on dark ones, so clipped bars remain readable.
    """
    ax = pl.gca()
    for rect in rects:
        height = rect.get_height()
        # Pick a text color contrasting with the bar's face color.
        face_rgb = rect.get_facecolor()[:3]
        text_color = 'k' if np.mean(face_rgb) > .5 else 'w'
        if height > y_max:
            # Integer label for big values, one decimal otherwise.
            label = '%i' % height if height > 20 else '%.1f' % height
            ax.text(rect.get_x() + 1.35 * rect.get_width() / 2.,
                    .99 * y_max, label,
                    rotation=90, size=11, color=text_color,
                    ha='center', va='top')
# Load the benchmark results CSV written by the benching script and
# group the rows into ``data`` keyed by 'library, strategy'.
res = np.recfromcsv('results_nii.csv')
data = dict()
for library in np.unique(res['library']):
    this_res = res[res['library'] == library]
    for strategy in np.unique(this_res['strategy']):
        # e.g. 'joblib, zlib1' -- CSV padding is stripped from both parts.
        name = '%s, %s' % (library.strip(), strategy.strip())
        data[name] = this_res[this_res['strategy'] == strategy]
n_libraries = len(data)
# One bar-slot per series; colors from two colormaps plus hand tuning.
width = .92/n_libraries
colors = pl.cm.spectral(np.linspace(0, .92, n_libraries-2))
colors = np.r_[colors, ((0, 0, 0, 0),)]
colors[-4:, :] = pl.cm.autumn(np.linspace(0, 1.1, 4))
# Hand-tuned shuffle so related series get related hues.
# NOTE(review): ``colors[1:5] = colors[2:6]`` assigns between
# overlapping slices of the same array -- confirm numpy buffers this
# copy as intended.
tmp = colors[1].copy()
colors[1:5] = colors[2:6]
colors[5] = tmp
colors[-5] = (1., .4, .7, 1.)
###############################################################################
# Plot a summary: one cluster of points per metric (read, write, disk),
# all normalized to the compressed-nifti (.nii.gz) baseline, on a shared
# log-scale axis.
# NOTE(review): indentation was lost in this capture and has been
# reconstructed -- verify the loop nesting against the original gist.
legend_items = dict()
pl.figure(-1, figsize=(8, 4))
pl.clf()
for index, metric in enumerate(('read', 'write', 'disk')):
    ax = pl.axes([.1, .06, .83, .87])
    i = 0
    for library, stats in sorted(data.iteritems()):
        if library in ('Nifti, .nii.gz', ):
            # The baseline series itself is not plotted.
            continue
        stats = np.sort(stats)
        stats = stats[metric]
        # Normalize to the .nii.gz reference.
        stats = stats / data['Nifti, .nii.gz'][metric]
        # Horizontal position: metric cluster offset plus per-series offset.
        indices = (i-.5*n_libraries) * .07 * width + index * width
        # Mean as a circle...
        bars = pl.plot([indices ], [stats.mean(), ], 'o',
                       color=colors[i][:3],
                       label=library, linewidth=4,)
        # ...individual measurements as '+' marks at the same abscissa.
        pl.plot([indices, ]*len(stats), stats, '+', color=colors[i][:3])
        if metric == 'write':
            # Collect one artist per series for the legend (only once).
            legend_items[library] = bars[0]
        i += 1

# Build the legend column layout; invisible proxy rectangles pad the
# columns so each library group lines up.
proxy_artist = pl.Rectangle((0, 0), .1, .1, fc="w", ec="w", alpha=0)
legend_artist = list()
for lib in ('joblib', 'pytables'):
    these_artists = []
    if lib == 'joblib':
        # Padding entries, then the Nifti series.
        for _ in range(4):
            these_artists.extend([('', proxy_artist), ])
        these_artists.extend([(n.ljust(24), a)
                              for n, a in sorted(legend_items.iteritems())
                              if n.startswith('Nifti')])
        these_artists.sort()
    these_artists.extend([('', proxy_artist), ])
    these_artists.extend([(n.ljust(32), a)
                          for n, a in sorted(legend_items.iteritems())
                          if n.startswith(lib)])
    if lib == 'joblib':
        # More padding, then the numpy series.
        for _ in range(4):
            these_artists.extend([('', proxy_artist), ])
        these_artists.extend([(n.strip(', ').ljust(20), a)
                              for n, a in legend_items.iteritems()
                              if n.startswith('numpy')])
    legend_artist.extend(these_artists)

pl.ylim(ymin=.0003)
# Horizontal reference line at 1 (the .nii.gz baseline).
pl.plot([-0.04, 0.18], [1, 1], color='.8', zorder=0)
pl.xlim(-0.04, 0.18)
pl.gca().set_yscale('log')
# Column headers for the three metric clusters.
pl.text(-.02, 9, 'Read speed', size=14)
pl.text(.06, 9, 'Write speed', size=14)
pl.text(.13, 9, 'Disk used', size=14)
pl.xticks(())
pl.ylabel('Reference: nii.gz')
# zip(*pairs) splits (label, artist) pairs back into two sequences.
pl.legend(zip(*legend_artist)[1],
          zip(*legend_artist)[0],
          loc='lower right', ncol=4, prop=dict(size=13),
          handlelength=1, columnspacing=0,
          borderaxespad=0.01,
          handletextpad=.3, labelspacing=.4,
          frameon=False, numpoints=1)
pl.savefig('summary.png')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment