GaelVaroquaux/bench.py

## bench.py
"""
Benching I/O with joblib and other libraries. Comment and
un-comment what you are interested in.

Warning: this is slow, and the benchmarks are easily skewed by other disk
activity.
"""
import os
import time
import shutil
import cPickle

import numpy as np
from sklearn import datasets

import joblib
from joblib.disk import disk_used

def clear_out():
    """Reset the scratch directory ``out``.

    An existing ``out`` tree is deleted, then a fresh empty directory of
    the same name is created (relative to the current working directory).
    """
    scratch_dir = 'out'
    if os.path.exists(scratch_dir):
        shutil.rmtree(scratch_dir)
    os.mkdir(scratch_dir)


def kill_disk_cache():
    """Try to get cached file data out of the way before a timing run.

    On Linux, ``3`` is written to ``/proc/sys/vm/drop_caches``.  A
    permission error (errno 13) only prints a hint to run as root; any
    other ``IOError`` is re-raised.  On every other platform a large
    random array is written to the file ``tmp`` in the current directory.
    """
    if os.name == 'posix' and os.uname()[0] == 'Linux':
        try:
            # The ``with`` block closes (and therefore flushes) the handle
            # inside the ``try``, so a failing write is reported here.
            with open('/proc/sys/vm/drop_caches', 'w') as drop_caches:
                drop_caches.write('3\n')
        except IOError as e:
            if e.errno == 13:  # EACCES: permission denied
                print('Please run me as root')
            else:
                # Bare raise keeps the original traceback.
                raise
    else:
        # Write 2e7 float64 values (~160M) to the disk.  The size must be
        # an int, and the raw buffer needs a binary-mode file.
        with open('tmp', 'wb') as filler:
            filler.write(np.random.random(int(2e7)))


def timeit(func, *args, **kwargs):
    """Time ``func(*args, **kwargs)`` over 7 runs and return a trimmed mean.

    ``kill_disk_cache()`` is called before every run.  The collected
    durations are sorted, the smallest and largest are dropped, and the
    mean of the remaining ones is returned.
    """
    # Manual switch: True times only the call itself.  Set it to False to
    # also hash the output, which estimates the time needed to access the
    # elements (a better estimate of load time with memmapping).
    call_only = True
    durations = []
    for _ in range(7):
        kill_disk_cache()
        start = time.time()
        out = func(*args, **kwargs)
        if call_only:
            durations.append(time.time() - start)
            continue
        joblib.hash(out)
        first_hash_done = time.time()
        joblib.hash(out)
        second_hash_done = time.time()
        # (second_hash_done - first_hash_done) is the cost of one hash;
        # subtracting it twice from the total span removes the second
        # hash and leaves call + one access pass.
        durations.append(second_hash_done - start
                         - 2*(second_hash_done - first_hash_done))
    durations.sort()
    # Alternative statistic: durations[0] (best run only).
    return np.mean(durations[1:-1])


def print_line(dataset, library, strategy, write_time, read_time, disk_used):
    """Print one comma-separated result row.

    ``dataset``, ``library`` and ``strategy`` are formatted as
    right-aligned strings; ``write_time``, ``read_time`` and ``disk_used``
    as fixed-point numbers with 3, 4 and 1 decimals respectively.

    Note: the ``disk_used`` parameter shadows the module-level
    ``disk_used`` function inside this body; the name is kept because it
    is part of the signature.
    """
    # A parenthesised single argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print('% 15s, %10s, %12s, % 6.3f, % 7.4f, % 5.1f' % (
            dataset, library, strategy, write_time, read_time, disk_used))


def bench_dump(dataset, name='', compress_levels=(0, 1, 3, 6, 9)):
    """Benchmark ``joblib.dump`` / ``joblib.load`` on ``dataset``.

    For every level in ``compress_levels`` the dataset is dumped to
    ``out/test.pkl`` with that ``compress`` value and loaded back; one
    row labelled ``zlib<level>`` is printed with the write time, the read
    time and ``disk_used('out')/1024.``.  A final row labelled ``mmap``
    times a default dump followed by a load with ``mmap_mode='r'``.

    Parameters
    ----------
    dataset : object handed to ``joblib.dump``
    name : label printed in the dataset column
    compress_levels : iterable of values passed as ``compress=``
    """
    for compress in compress_levels:
        clear_out()
        time_write = timeit(joblib.dump, dataset, 'out/test.pkl',
                            compress=compress)
        # Measure the size before the read benchmark touches the directory.
        du = disk_used('out')/1024.
        time_read = timeit(joblib.load, 'out/test.pkl')
        print_line(name, 'joblib', 'zlib%i' % compress, time_write,
                   time_read, du)
    clear_out()
    time_write = timeit(joblib.dump, dataset, 'out/test.pkl')
    time_read = timeit(joblib.load, 'out/test.pkl', mmap_mode='r')
    du = disk_used('out')/1024.
    print_line(name, 'joblib', 'mmap', time_write, time_read, du)

# Column header for the result rows emitted by print_line().  The
# parenthesised form is valid under both Python 2 and Python 3.
print('% 15s, %10s, %12s, % 6s, % 6s, % 6s' % (
        'Dataset', 'library', 'strategy', 'write', 'read', 'disk'))

#d = datasets.fetch_olivetti_faces()
#bench_dump(d, 'Olivetti')
#print 80*'-'
#d = datasets.fetch_20newsgroups()
#bench_dump(d, '20news')
#print 80*'-'
#d = datasets.fetch_lfw_pairs()
#bench_dump(d, 'lfw_pairs')
#print 80*'-'
#d = datasets.fetch_species_distributions()
#bench_dump(d, 'Species')
#d = datasets.fetch_lfw_people()
#print 80*'-'
#bench_dump(d, 'people')
#d.data = np.r_[d.data, d.data, d.data ]
#print 80*'-'
#bench_dump(d, 'big people')
#
try:
    # Neuroimaging specific I/O
    import nibabel
    import tables
    #tables.setBloscMaxThreads(1)

    def load_nii(filename):
        """Load ``filename`` with nibabel and return ``(data, affine)``.

        NOTE(review): recent nibabel releases deprecate ``get_data`` and
        ``get_affine`` -- confirm against the installed version.
        """
        image = nibabel.load(filename)
        data = image.get_data()
        affine = image.get_affine()
        return data, affine

    def write_nii(d, filename):
        """Save ``d`` as a NIfTI-1 image at ``filename``.

        ``d[0]`` and ``d[1]`` are passed, in that order, to
        ``nibabel.Nifti1Image`` (the pair produced by ``load_nii``).
        """
        data, affine = d[0], d[1]
        nibabel.save(nibabel.Nifti1Image(data, affine), filename)


    def write_hdf(arrays):
        """Write every array of ``arrays`` to ``out/test.h5`` with PyTables.

        The arrays are stored under the file root as nodes ``array0``,
        ``array1``, ... in iteration order.  The file is closed even when
        a write raises, so no handle is left open on ``out``.
        """
        h5file = tables.openFile("out/test.h5",
                                 mode="w", title="Test file")
        try:
            for index, array in enumerate(arrays):
                h5file.createArray(h5file.root, 'array%i' % index, array)
        finally:
            h5file.close()


    def write_chdf(arrays, compress=0, complib='zlib'):
        """Write ``arrays`` to ``out/test.h5`` as compressed CArray nodes.

        Parameters
        ----------
        arrays : iterable of arrays exposing ``shape`` and ``dtype``
        compress : passed as ``complevel`` to ``tables.Filters``; ``0``
            delegates to ``write_hdf`` instead
        complib : passed as ``complib`` to ``tables.Filters``

        The file is closed even when a write raises.
        """
        if compress == 0:
            return write_hdf(arrays)
        h5file = tables.openFile("out/test.h5",
                                 mode="w", title="Test file")
        try:
            filters = tables.Filters(complevel=compress,
                                     complib=complib)
            for index, array in enumerate(arrays):
                atom = tables.Atom.from_dtype(array.dtype)
                ca = h5file.createCArray(h5file.root, 'array%i' % index,
                                         atom, array.shape, filters=filters)
                # Copy the whole array into the on-disk node.
                ca[...] = array
        finally:
            h5file.close()

    def write_vhdf(arrays, compress=0, complib='zlib'):
        """Pickle each array into one row of a VLArray in ``out/test.h5``.

        Parameters
        ----------
        arrays : iterable of picklable objects
        compress : ``complevel`` for ``tables.Filters``; ``0`` means no
            filters are passed at all
        complib : ``complib`` for ``tables.Filters``

        The file is closed even when a write raises.
        """
        filters = None
        if compress != 0:
            filters = tables.Filters(complevel=compress,
                                     complib=complib)
        h5file = tables.openFile("out/test.h5", "w")
        try:
            vlarray = h5file.createVLArray(h5file.root, "vlarray",
                                           tables.VLStringAtom(),
                                           filters=filters)
            for array in arrays:
                vlarray.append(
                    cPickle.dumps(array, cPickle.HIGHEST_PROTOCOL))
        finally:
            h5file.close()

    def load_vhdf():
        """Read ``out/test.h5`` and unpickle every row of its ``vlarray`` node.

        Returns the unpickled objects as a list, in row order.  The file
        is closed even when reading or unpickling raises.
        """
        h5file = tables.openFile("out/test.h5", "r")
        try:
            return [cPickle.loads(row) for row in h5file.root.vlarray]
        finally:
            h5file.close()

    def load_hdf():
        """Read every node under the root of ``out/test.h5``.

        Returns a list with the result of ``node.read()`` for each node
        yielded by ``iterNodes``.  The file is closed even when a read
        raises.
        """
        h5file = tables.openFile("out/test.h5", "r")
        try:
            return [node.read() for node in h5file.iterNodes(h5file.root)]
        finally:
            h5file.close()

    def bench_hdf(d, name):
        """Benchmark PyTables write/read of ``d`` and print one row per setting.

        Each (library, level) pair below is timed with ``write_chdf`` and
        ``load_hdf`` on a freshly cleared ``out`` directory.
        """
        # Levels actually benchmarked per compression library:
        #  - level 0 (no compression) is only run once, under zlib;
        #  - zlib level 9 is skipped: way too slow to be useful;
        #  - lzo is only run at level 1.
        plan = (
            ('zlib', (0, 1, 3, 6)),
            ('lzo', (1,)),
            ('blosc', (1, 3, 6, 9)),
        )
        clear_out()
        for complib, levels in plan:
            for compress in levels:
                clear_out()
                h5_save_time = timeit(write_chdf, d, complib=complib,
                                      compress=compress)
                h5_du = disk_used('out')/1024.
                h5_load_time = timeit(load_hdf)
                strategy = '%s %i' % (complib, compress)
                print_line(name, 'pytables', strategy,
                           h5_save_time, h5_load_time, h5_du)


    def load_np(filename):
        """Return the two positionally-saved arrays of an ``.npz`` archive.

        The archive object is closed once both arrays have been read.
        """
        data = np.load(filename)
        try:
            return data['arr_0'], data['arr_1']
        finally:
            data.close()

    # Run every benchmark twice per atlas: once with the image data made
    # C-contiguous, once with the data exactly as returned by load_nii.
    for c_order in (True, False):
        for name, nifti_file in (
                ('MNI', '/usr/share/fsl/data/atlases/MNI/MNI-prob-1mm.nii.gz'),
                ('Juelich',
                 '/usr/share/fsl/data/atlases/Juelich/Juelich-prob-2mm.nii.gz'),
                ):

            name = '% 5s(%s)' % (name, 'C' if c_order else 'F')

            d = load_nii(nifti_file)
            if c_order:
                d = (np.ascontiguousarray(d[0]), d[1])

            # Compressed NIfTI: the load is timed on the source file,
            # the save on a copy written to the scratch directory.
            compress_load_time = timeit(load_nii, nifti_file)
            clear_out()
            compress_save_time = timeit(write_nii, d, 'out/test.nii.gz')
            compress_nii_du = disk_used('out')/1024.
            strategy = '.nii.gz'
            library = 'Nifti'
            print_line(name, library, strategy,
                       compress_save_time, compress_load_time,
                       compress_nii_du)

            # Uncompressed NIfTI
            clear_out()
            save_time = timeit(write_nii, d, 'out/test.nii')
            nii_du = disk_used('out')/1024.
            load_time = timeit(load_nii, 'out/test.nii')
            strategy = ' .nii'
            print_line(name, library, strategy,
                       save_time, load_time, nii_du)

            clear_out()
            bench_hdf(d, name=name)

            # Bench numpy's savez
            clear_out()
            np_save_time = timeit(np.savez_compressed, 'out/test.npz',
                                  d[0], d[1])
            np_du = disk_used('out')/1024.
            np_load_time = timeit(load_np, 'out/test.npz')
            library = 'numpy'
            strategy = 'compressed'
            print_line(name, library, strategy,
                       np_save_time, np_load_time, np_du)

            clear_out()
            bench_dump(d, name, compress_levels=(0, 1, 6))

except ImportError:
    "No nibabel"
	"""
	Benching I/O with joblib and other libraries. Comment and
	un-comment what you are interested in.

	Warning: this is slow, and the benchmarks are easily skewed by other disk
	activity.
	"""
	import os
	import time
	import shutil
	import cPickle

	import numpy as np
	from sklearn import datasets

	import joblib
	from joblib.disk import disk_used

	def clear_out():
	    """Reset the scratch directory ``out``.

	    An existing ``out`` tree is deleted, then a fresh empty directory
	    of the same name is created (relative to the current directory).
	    """
	    if os.path.exists('out'):
	        shutil.rmtree('out')
	    os.mkdir('out')


	def kill_disk_cache():
	    """Try to get cached file data out of the way before a timing run.

	    On Linux, ``3`` is written to ``/proc/sys/vm/drop_caches``.  A
	    permission error (errno 13) only prints a hint to run as root; any
	    other ``IOError`` is re-raised.  On every other platform a large
	    random array is written to the file ``tmp``.
	    """
	    if os.name == 'posix' and os.uname()[0] == 'Linux':
	        try:
	            # Closing inside the ``try`` makes a failing write visible.
	            with open('/proc/sys/vm/drop_caches', 'w') as drop_caches:
	                drop_caches.write('3\n')
	        except IOError as e:
	            if e.errno == 13:  # EACCES: permission denied
	                print('Please run me as root')
	            else:
	                # Bare raise keeps the original traceback.
	                raise
	    else:
	        # Write 2e7 float64 values (~160M) to the disk.  The size must
	        # be an int, and the raw buffer needs a binary-mode file.
	        with open('tmp', 'wb') as filler:
	            filler.write(np.random.random(int(2e7)))


	def timeit(func, *args, **kwargs):
	    """Time ``func(*args, **kwargs)`` over 7 runs and return a trimmed mean.

	    ``kill_disk_cache()`` is called before every run.  The collected
	    durations are sorted, the smallest and largest are dropped, and
	    the mean of the remaining ones is returned.

	    The signature takes ``*args, **kwargs`` because callers pass
	    keyword arguments such as ``compress=`` meant for ``func``.
	    """
	    times = list()
	    for _ in range(7):
	        kill_disk_cache()
	        t0 = time.time()
	        out = func(*args, **kwargs)
	        if 1:
	            # Just time the function
	            t1 = time.time()
	            times.append(t1 - t0)
	        else:
	            # Compute a hash of the output, to estimate the time
	            # necessary to access the elements: this is a better
	            # estimate of the time to load with memmapping.
	            joblib.hash(out)
	            t1 = time.time()
	            joblib.hash(out)
	            t2 = time.time()
	            times.append(t2 - t0 - 2*(t2 - t1))
	    times.sort()
	    #return times[0]
	    return np.mean(times[1:-1])


	def print_line(dataset, library, strategy, write_time, read_time, disk_used):
	    """Print one comma-separated result row.

	    ``dataset``, ``library`` and ``strategy`` are formatted as
	    right-aligned strings; ``write_time``, ``read_time`` and
	    ``disk_used`` as fixed-point numbers with 3, 4 and 1 decimals.
	    """
	    # A parenthesised single argument prints identically under the
	    # Python 2 print statement and the Python 3 print function.
	    print('% 15s, %10s, %12s, % 6.3f, % 7.4f, % 5.1f' % (
	            dataset, library, strategy, write_time, read_time, disk_used))


	def bench_dump(dataset, name='', compress_levels=(0, 1, 3, 6, 9)):
	    """Benchmark ``joblib.dump`` / ``joblib.load`` on ``dataset``.

	    For every level in ``compress_levels`` the dataset is dumped to
	    ``out/test.pkl`` with that ``compress`` value and loaded back; one
	    row labelled ``zlib<level>`` is printed.  A final row labelled
	    ``mmap`` times a default dump followed by a load with
	    ``mmap_mode='r'``.
	    """
	    for compress in compress_levels:
	        clear_out()
	        time_write = timeit(joblib.dump, dataset, 'out/test.pkl',
	                            compress=compress)
	        du = disk_used('out')/1024.
	        time_read = timeit(joblib.load, 'out/test.pkl')
	        print_line(name, 'joblib', 'zlib%i' % compress, time_write,
	                   time_read, du)
	    clear_out()
	    time_write = timeit(joblib.dump, dataset, 'out/test.pkl')
	    time_read = timeit(joblib.load, 'out/test.pkl', mmap_mode='r')
	    du = disk_used('out')/1024.
	    print_line(name, 'joblib', 'mmap', time_write, time_read, du)

	# Column header for the result rows emitted by print_line().
	print('% 15s, %10s, %12s, % 6s, % 6s, % 6s' % (
	        'Dataset', 'library', 'strategy', 'write', 'read', 'disk'))

	#d = datasets.fetch_olivetti_faces()
	#bench_dump(d, 'Olivetti')
	#print 80*'-'
	#d = datasets.fetch_20newsgroups()
	#bench_dump(d, '20news')
	#print 80*'-'
	#d = datasets.fetch_lfw_pairs()
	#bench_dump(d, 'lfw_pairs')
	#print 80*'-'
	#d = datasets.fetch_species_distributions()
	#bench_dump(d, 'Species')
	#d = datasets.fetch_lfw_people()
	#print 80*'-'
	#bench_dump(d, 'people')
	#d.data = np.r_[d.data, d.data, d.data ]
	#print 80*'-'
	#bench_dump(d, 'big people')
	#
	try:
	# Neuroimaging specific I/O
	import nibabel
	import tables
	#tables.setBloscMaxThreads(1)

	def load_nii(filename):
	    """Load ``filename`` with nibabel and return ``(data, affine)``.

	    NOTE(review): recent nibabel releases deprecate ``get_data`` and
	    ``get_affine`` -- confirm against the installed version.
	    """
	    img = nibabel.load(filename)
	    return img.get_data(), img.get_affine()

	def write_nii(d, filename):
	    """Save ``d`` as a NIfTI-1 image at ``filename``.

	    ``d[0]`` and ``d[1]`` are passed, in that order, to
	    ``nibabel.Nifti1Image``.
	    """
	    img = nibabel.Nifti1Image(d[0], d[1])
	    nibabel.save(img, filename)


	def write_hdf(arrays):
	    """Write every array of ``arrays`` to ``out/test.h5`` with PyTables.

	    The arrays are stored under the file root as nodes ``array0``,
	    ``array1``, ... in iteration order.  The file is closed even when
	    a write raises.
	    """
	    h5file = tables.openFile("out/test.h5",
	                             mode="w", title="Test file")
	    try:
	        for index, array in enumerate(arrays):
	            h5file.createArray(h5file.root, 'array%i' % index, array)
	    finally:
	        h5file.close()


	def write_chdf(arrays, compress=0, complib='zlib'):
	    """Write ``arrays`` to ``out/test.h5`` as compressed CArray nodes.

	    ``compress`` and ``complib`` are passed to ``tables.Filters`` as
	    ``complevel`` and ``complib``; ``compress == 0`` delegates to
	    ``write_hdf`` instead.  The file is closed even when a write
	    raises.
	    """
	    if compress == 0:
	        return write_hdf(arrays)
	    h5file = tables.openFile("out/test.h5",
	                             mode="w", title="Test file")
	    try:
	        filters = tables.Filters(complevel=compress,
	                                 complib=complib)
	        for index, array in enumerate(arrays):
	            shape = array.shape
	            atom = tables.Atom.from_dtype(array.dtype)
	            ca = h5file.createCArray(h5file.root, 'array%i' % index,
	                                     atom, shape, filters=filters)
	            # Copy the whole array into the on-disk node.
	            ca[...] = array
	    finally:
	        h5file.close()

	def write_vhdf(arrays, compress=0, complib='zlib'):
	    """Pickle each array into one row of a VLArray in ``out/test.h5``.

	    ``compress`` and ``complib`` configure ``tables.Filters``;
	    ``compress == 0`` means no filters are passed.  The file is closed
	    even when a write raises.
	    """
	    filters = None
	    if compress != 0:
	        filters = tables.Filters(complevel=compress,
	                                 complib=complib)
	    h5file = tables.openFile("out/test.h5", "w")
	    try:
	        vlarray = h5file.createVLArray(h5file.root, "vlarray",
	                                       tables.VLStringAtom(),
	                                       filters=filters)
	        for array in arrays:
	            parray = cPickle.dumps(array, cPickle.HIGHEST_PROTOCOL)
	            vlarray.append(parray)
	    finally:
	        h5file.close()

	def load_vhdf():
	    """Read ``out/test.h5`` and unpickle every row of its ``vlarray`` node.

	    Returns the unpickled objects as a list, in row order.  The file
	    is closed even when reading or unpickling raises.
	    """
	    h5file = tables.openFile("out/test.h5", "r")
	    try:
	        return [cPickle.loads(row) for row in h5file.root.vlarray]
	    finally:
	        h5file.close()

	def load_hdf():
	    """Read every node under the root of ``out/test.h5``.

	    Returns a list with the result of ``node.read()`` for each node
	    yielded by ``iterNodes``.  The file is closed even when a read
	    raises.
	    """
	    h5file = tables.openFile("out/test.h5", "r")
	    try:
	        return [node.read() for node in h5file.iterNodes(h5file.root)]
	    finally:
	        h5file.close()

	def bench_hdf(d, name):
	    """Benchmark PyTables write/read of ``d`` and print one row per setting.

	    Every combination of compression library and level that is not
	    skipped below is timed with ``write_chdf`` and ``load_hdf`` on a
	    freshly cleared ``out`` directory.
	    """
	    clear_out()
	    for complib in "zlib", "lzo", "blosc":
	        for compress in (0, 1, 3, 6, 9):
	            if compress == 0 and complib != 'zlib':
	                # Level 0 is library-independent: run it only once.
	                continue
	            if compress == 9 and complib == 'zlib':
	                # Way too slow to be useful
	                continue
	            if compress != 1 and complib == 'lzo':
	                continue
	            clear_out()
	            h5_save_time = timeit(write_chdf, d, complib=complib,
	                                  compress=compress)
	            h5_du = disk_used('out')/1024.
	            h5_load_time = timeit(load_hdf)
	            print_line(name, 'pytables', '%s %i' % (complib, compress),
	                       h5_save_time, h5_load_time, h5_du)


	def load_np(filename):
	    """Return the two positionally-saved arrays of an ``.npz`` archive.

	    The archive object is closed once both arrays have been read.
	    """
	    data = np.load(filename)
	    try:
	        return data['arr_0'], data['arr_1']
	    finally:
	        data.close()

	# Run every benchmark twice per atlas: once with the image data made
	# C-contiguous, once with the data exactly as returned by load_nii.
	for c_order in (True, False):
	    for name, nifti_file in (
	            ('MNI', '/usr/share/fsl/data/atlases/MNI/MNI-prob-1mm.nii.gz'),
	            ('Juelich',
	             '/usr/share/fsl/data/atlases/Juelich/Juelich-prob-2mm.nii.gz'),
	            ):

	        name = '% 5s(%s)' % (name, 'C' if c_order else 'F')

	        d = load_nii(nifti_file)
	        if c_order:
	            d = (np.ascontiguousarray(d[0]), d[1])

	        # Compressed NIfTI: the load is timed on the source file,
	        # the save on a copy written to the scratch directory.
	        compress_load_time = timeit(load_nii, nifti_file)
	        clear_out()
	        compress_save_time = timeit(write_nii, d, 'out/test.nii.gz')
	        compress_nii_du = disk_used('out')/1024.
	        strategy = '.nii.gz'
	        library = 'Nifti'
	        print_line(name, library, strategy,
	                   compress_save_time, compress_load_time,
	                   compress_nii_du)

	        # Uncompressed NIfTI
	        clear_out()
	        save_time = timeit(write_nii, d, 'out/test.nii')
	        nii_du = disk_used('out')/1024.
	        load_time = timeit(load_nii, 'out/test.nii')
	        strategy = ' .nii'
	        print_line(name, library, strategy,
	                   save_time, load_time, nii_du)

	        clear_out()
	        bench_hdf(d, name=name)

	        # Bench numpy's savez
	        clear_out()
	        np_save_time = timeit(np.savez_compressed, 'out/test.npz',
	                              d[0], d[1])
	        np_du = disk_used('out')/1024.
	        np_load_time = timeit(load_np, 'out/test.npz')
	        library = 'numpy'
	        strategy = 'compressed'
	        print_line(name, library, strategy,
	                   np_save_time, np_load_time, np_du)

	        clear_out()
	        bench_dump(d, name, compress_levels=(0, 1, 6))

	except ImportError:
	"No nibabel"