Skip to content

Instantly share code, notes, and snippets.

@GaelVaroquaux
Created August 6, 2012 13:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save GaelVaroquaux/3274477 to your computer and use it in GitHub Desktop.
Save GaelVaroquaux/3274477 to your computer and use it in GitHub Desktop.
Benchmarking neuroimaging I/O
"""
Benching I/O with joblib and other libraries. Comment and
un-comment what you are interested in.
Warning: this is slow, and the benchs are easily offset by other disk
activity.
"""
import os
import time
import shutil
import numpy as np
import joblib
from joblib.disk import disk_used
def clear_out():
    """Reset the scratch directory: delete 'out' if present, recreate it empty."""
    out_dir = 'out'
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.mkdir(out_dir)
def kill_disk_cache():
    """Drop the OS disk cache so each run measures real disk access.

    On Linux this writes '3' to /proc/sys/vm/drop_caches, which requires
    root; if permission is denied (errno 13) a message is printed instead.
    On other systems it falls back to writing ~160 MB of random data to a
    file named 'tmp' to displace the cache.
    """
    if os.name == 'posix' and os.uname()[0] == 'Linux':
        try:
            # 'with' guarantees the handle is closed (original leaked it).
            with open('/proc/sys/vm/drop_caches', 'w') as f:
                f.write('3\n')
        except IOError as e:  # 'as e' works on Python 2.6+ and 3
            if e.errno == 13:
                print('Please run me as root')
            else:
                # Bare raise preserves the original traceback.
                raise
    else:
        # Write ~160 MB to the disk to displace the cache.
        # int() fixes the float argument (2e7) that newer numpy rejects;
        # 'wb' + tobytes() writes the same raw buffer portably.
        with open('tmp', 'wb') as f:
            f.write(np.random.random(int(2e7)).tobytes())
def timeit(func, *args, **kwargs):
    """Time ``func(*args, **kwargs)`` over 7 runs and return a trimmed mean.

    The disk cache is dropped before each run. The fastest and slowest
    runs are discarded and the mean of the remaining 5 is returned.
    """
    measurements = []
    for _ in range(7):
        kill_disk_cache()
        start = time.time()
        result = func(*args, **kwargs)
        if 1:
            # Just time the function call itself.
            measurements.append(time.time() - start)
        else:
            # Hash the output to estimate the time needed to access the
            # elements: a better estimate of load time with memmapping.
            # Hash twice and subtract the second hash's duration so only
            # the first (page-faulting) access is counted.
            joblib.hash(result)
            t_hashed = time.time()
            joblib.hash(result)
            t_end = time.time()
            measurements.append(t_end - start - 2 * (t_end - t_hashed))
    measurements.sort()
    # Trimmed mean: drop the extremes of the 7 sorted runs.
    return np.mean(measurements[1:-1])
def print_line(dataset, library, strategy, write_time, read_time, disk_used):
    """Print one aligned CSV-style row of benchmark results.

    Columns: dataset, library, strategy, write time (s), read time (s),
    disk used (KB). Note: the ``disk_used`` parameter shadows the
    module-level ``disk_used`` helper inside this function.
    """
    # print(...) form runs identically under Python 2 and Python 3
    # (the original py2-only print statement fails to parse on py3).
    print('% 15s, %10s, %12s, % 6.3f, % 7.4f, % 5.1f' % (
        dataset, library, strategy, write_time, read_time, disk_used))
def bench_dump(dataset, name='', compress_levels=(0, 1, 3, 6, 9)):
    """Benchmark joblib.dump/load on ``dataset``.

    One row is printed per zlib compression level in ``compress_levels``,
    plus one row for uncompressed dump read back with memmapping.
    (The original initialized time_write/time_read/du to empty lists that
    were immediately overwritten by scalars -- dead code, removed.)
    """
    for compress in compress_levels:
        clear_out()
        time_write = timeit(joblib.dump, dataset, 'out/test.pkl',
                            compress=compress)
        du = disk_used('out') / 1024.
        time_read = timeit(joblib.load, 'out/test.pkl')
        print_line(name, 'joblib', 'zlib%i' % compress, time_write,
                   time_read, du)
    # Uncompressed dump, loaded back memmapped.
    clear_out()
    time_write = timeit(joblib.dump, dataset, 'out/test.pkl')
    time_read = timeit(joblib.load, 'out/test.pkl', mmap_mode='r')
    du = disk_used('out') / 1024.
    print_line(name, 'joblib', 'mmap', time_write, time_read, du)
# Print the header row for the benchmark table (same column layout as
# print_line).  print(...) form works under Python 2 and Python 3.
print('% 15s, %10s, %12s, % 6s, % 6s, % 6s' % (
    'Dataset', 'library', 'strategy', 'write', 'read', 'disk'))
# Neuroimaging specific I/O
import nibabel
import tables
#tables.setBloscMaxThreads(1)
def load_nii(filename, sum=True):
    """Load a NIfTI image and return ``(data_or_sum, affine)``.

    With ``sum=True`` (the default) the data array is reduced to its sum,
    which forces every element to actually be read from disk.
    """
    img = nibabel.load(filename)
    data = img.get_data()
    affine = img.get_affine()
    if sum:
        return data.sum(), affine
    return data, affine
def write_nii(d, filename):
    """Save ``d``, a ``(data, affine)`` pair, to ``filename`` as NIfTI."""
    data, affine = d
    nibabel.save(nibabel.Nifti1Image(data, affine), filename)
def write_hdf(arrays):
    """Write each array in ``arrays`` to out/test.h5, uncompressed.

    Arrays are stored as nodes named 'array0', 'array1', ...
    """
    h5file = tables.openFile("out/test.h5",
                             mode="w", title="Test file")
    # try/finally ensures the file is closed even if a write fails
    # (the original leaked the handle on error).
    try:
        for index, array in enumerate(arrays):
            h5file.createArray(h5file.root, 'array%i' % index,
                               array)
    finally:
        h5file.close()
def write_chdf(arrays, compress=0, complib='zlib'):
    """Write ``arrays`` to out/test.h5 with pytables compression.

    ``compress`` is the compression level (0 delegates to write_hdf,
    i.e. no compression); ``complib`` names the compression library.
    """
    if compress == 0:
        return write_hdf(arrays)
    h5file = tables.openFile("out/test.h5",
                             mode="w", title="Test file")
    # try/finally ensures the file is closed even if a write fails
    # (the original leaked the handle on error).
    try:
        filters = tables.Filters(complevel=compress,
                                 complib=complib)
        for index, array in enumerate(arrays):
            atom = tables.Atom.from_dtype(array.dtype)
            ca = h5file.createCArray(h5file.root, 'array%i' % index,
                                     atom, array.shape, filters=filters)
            ca[...] = array
    finally:
        h5file.close()
def load_hdf(sum=True):
    """Read every node of out/test.h5; return a list of sums (or arrays).

    Bug fix: the ``sum`` parameter was accepted but ignored -- the data
    was always reduced.  It is now honored; the default (True) preserves
    the original behavior, matching load_nii's interface.
    """
    h5file = tables.openFile("out/test.h5", "r")
    try:
        out = list()
        for node in h5file.iterNodes(h5file.root):
            data = node.read()
            out.append(data.sum() if sum else data)
    finally:
        # Close even if a read fails.
        h5file.close()
    return out
def bench_hdf(d, name):
    """Benchmark pytables write/read of ``d`` over selected settings.

    Only a useful subset of (complib, level) combinations is timed;
    one result row is printed per combination.
    """
    clear_out()
    for complib in ("zlib", "lzo"):
        for compress in (0, 1, 3, 6, 9):
            # Skip redundant or useless combinations:
            if compress == 0 and complib != 'zlib':
                # level 0 is the same whatever the library
                continue
            if compress == 9 and complib == 'zlib':
                # way too slow to be useful
                continue
            if complib == 'lzo' and compress != 1:
                continue
            clear_out()
            write_time = timeit(write_chdf, d, complib=complib,
                                compress=compress)
            size_kb = disk_used('out') / 1024.
            read_time = timeit(load_hdf)
            print_line(name, 'pytables', '%s %i' % (complib, compress),
                       write_time, read_time, size_kb)
# Main benchmark driver: for each dataset, in both the layout nibabel
# loads and C-contiguous order, time nibabel (.nii.gz and .nii),
# pytables, numpy's .npy, and joblib.
# NOTE(review): the paths below are machine-specific (FSL atlas files,
# nipy test data); the script only runs where those files exist.
# NOTE(review): indentation was lost in this capture and has been
# reconstructed -- verify the loop nesting against the original gist.
for c_order in (True, False):
    for name, nifti_file in (
            ('MNI', '/usr/share/fsl/data/atlases/MNI/MNI-prob-1mm.nii.gz'),
            ('Juelich',
             '/usr/share/fsl/data/atlases/Juelich/Juelich-prob-2mm.nii.gz'),
            ('localiser',
             '~/.nipy/data/s12069_swaloc1_corr.nii.gz'),
            ):
        # Tag the dataset label with the memory layout being benched.
        name = '% 5s(%s)' % (name, 'C' if c_order else 'F')
        nifti_file = os.path.expanduser(nifti_file)
        d = load_nii(nifti_file, sum=False)
        if c_order:
            # Copy the data array to C order; the affine is kept as-is.
            d = (np.ascontiguousarray(d[0]), d[1])

        # nibabel, compressed (.nii.gz): read from the original file,
        # write to the scratch directory.
        compress_load_time = timeit(load_nii, nifti_file)
        clear_out()
        compress_save_time = timeit(write_nii, d, 'out/test.nii.gz')
        compress_nii_du = disk_used('out')/1024.
        strategy = '.nii.gz'
        library = 'Nifti'
        print_line(name, library, strategy,
                   compress_save_time, compress_load_time,
                   compress_nii_du)

        # nibabel, uncompressed (.nii).
        clear_out()
        save_time = timeit(write_nii, d, 'out/test.nii')
        nii_du = disk_used('out')/1024.
        load_time = timeit(load_nii, 'out/test.nii')
        strategy = ' .nii'
        print_line(name, library, strategy,
                   save_time, load_time, nii_du)

        # pytables (all selected complib/level combinations).
        clear_out()
        bench_hdf(d, name=name)

        # Bench numpy's savez -- only the data array d[0] is saved,
        # the affine is dropped.
        clear_out()
        np_save_time = timeit(np.save, 'out/test.npy',
                              d[0])
        np_du = disk_used('out')/1024.

        def load_np(filename, sum=True):
            # Load a .npy file; summing forces every element to be read.
            data = np.load(filename)
            if sum:
                return data.sum()
            return data

        np_load_time = timeit(load_np, 'out/test.npy')
        library = 'numpy'
        strategy = ''
        print_line(name, library, strategy,
                   np_save_time, np_load_time, np_du)

        # joblib (levels 0, 1, 6 plus the memmap row).
        clear_out()
        bench_dump(d, name, compress_levels=(0, 1, 6))
""" Joblib comparitive results.
"""
import pylab as pl
import numpy as np
def autolabel(rects, y_max):
    """Write the height of bars taller than ``y_max`` inside the plot.

    Labels are drawn rotated at the top of the axes, in black on light
    bars and white on dark ones, so clipped bars remain readable.
    """
    ax = pl.gca()
    for rect in rects:
        height = rect.get_height()
        # Pick a text color contrasting with the bar's face color.
        face_rgb = rect.get_facecolor()[:3]
        text_color = 'k' if np.mean(face_rgb) > .5 else 'w'
        if height > y_max:
            # Integer label for big values, one decimal otherwise.
            label = '%i' % height if height > 20 else '%.1f' % height
            ax.text(rect.get_x() + 1.35 * rect.get_width() / 2.,
                    .99 * y_max, label,
                    rotation=90, size=11, color=text_color,
                    ha='center', va='top')
# Load the benchmark results CSV written by the benching script and
# group the rows into ``data`` keyed by 'library, strategy'.
res = np.recfromcsv('results_nii.csv')
data = dict()
for library in np.unique(res['library']):
    this_res = res[res['library'] == library]
    for strategy in np.unique(this_res['strategy']):
        # e.g. 'joblib, zlib1' -- CSV padding is stripped from both parts.
        name = '%s, %s' % (library.strip(), strategy.strip())
        data[name] = this_res[this_res['strategy'] == strategy]
n_libraries = len(data)
# One bar-slot per series; colors from two colormaps plus hand tuning.
width = .92/n_libraries
colors = pl.cm.spectral(np.linspace(0, .92, n_libraries-2))
colors = np.r_[colors, ((0, 0, 0, 0),)]
colors[-4:, :] = pl.cm.autumn(np.linspace(0, 1.1, 4))
# Hand-tuned shuffle so related series get related hues.
# NOTE(review): ``colors[1:5] = colors[2:6]`` assigns between
# overlapping slices of the same array -- confirm numpy buffers this
# copy as intended.
tmp = colors[1].copy()
colors[1:5] = colors[2:6]
colors[5] = tmp
colors[-5] = (1., .4, .7, 1.)
###############################################################################
# Plot a summary: one cluster of points per metric (read, write, disk),
# all normalized to the compressed-nifti (.nii.gz) baseline, on a shared
# log-scale axis.
# NOTE(review): indentation was lost in this capture and has been
# reconstructed -- verify the loop nesting against the original gist.
legend_items = dict()
pl.figure(-1, figsize=(8, 4))
pl.clf()
for index, metric in enumerate(('read', 'write', 'disk')):
    ax = pl.axes([.1, .06, .83, .87])
    i = 0
    for library, stats in sorted(data.iteritems()):
        if library in ('Nifti, .nii.gz', ):
            # The baseline series itself is not plotted.
            continue
        stats = np.sort(stats)
        stats = stats[metric]
        # Normalize to the .nii.gz reference.
        stats = stats / data['Nifti, .nii.gz'][metric]
        # Horizontal position: metric cluster offset plus per-series offset.
        indices = (i-.5*n_libraries) * .07 * width + index * width
        # Mean as a circle...
        bars = pl.plot([indices ], [stats.mean(), ], 'o',
                       color=colors[i][:3],
                       label=library, linewidth=4,)
        # ...individual measurements as '+' marks at the same abscissa.
        pl.plot([indices, ]*len(stats), stats, '+', color=colors[i][:3])
        if metric == 'write':
            # Collect one artist per series for the legend (only once).
            legend_items[library] = bars[0]
        i += 1

# Build the legend column layout; invisible proxy rectangles pad the
# columns so each library group lines up.
proxy_artist = pl.Rectangle((0, 0), .1, .1, fc="w", ec="w", alpha=0)
legend_artist = list()
for lib in ('joblib', 'pytables'):
    these_artists = []
    if lib == 'joblib':
        # Padding entries, then the Nifti series.
        for _ in range(4):
            these_artists.extend([('', proxy_artist), ])
        these_artists.extend([(n.ljust(24), a)
                              for n, a in sorted(legend_items.iteritems())
                              if n.startswith('Nifti')])
        these_artists.sort()
    these_artists.extend([('', proxy_artist), ])
    these_artists.extend([(n.ljust(32), a)
                          for n, a in sorted(legend_items.iteritems())
                          if n.startswith(lib)])
    if lib == 'joblib':
        # More padding, then the numpy series.
        for _ in range(4):
            these_artists.extend([('', proxy_artist), ])
        these_artists.extend([(n.strip(', ').ljust(20), a)
                              for n, a in legend_items.iteritems()
                              if n.startswith('numpy')])
    legend_artist.extend(these_artists)

pl.ylim(ymin=.0003)
# Horizontal reference line at 1 (the .nii.gz baseline).
pl.plot([-0.04, 0.18], [1, 1], color='.8', zorder=0)
pl.xlim(-0.04, 0.18)
pl.gca().set_yscale('log')
# Column headers for the three metric clusters.
pl.text(-.02, 9, 'Read speed', size=14)
pl.text(.06, 9, 'Write speed', size=14)
pl.text(.13, 9, 'Disk used', size=14)
pl.xticks(())
pl.ylabel('Reference: nii.gz')
# zip(*pairs) splits (label, artist) pairs back into two sequences.
pl.legend(zip(*legend_artist)[1],
          zip(*legend_artist)[0],
          loc='lower right', ncol=4, prop=dict(size=13),
          handlelength=1, columnspacing=0,
          borderaxespad=0.01,
          handletextpad=.3, labelspacing=.4,
          frameon=False, numpoints=1)
pl.savefig('summary.png')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment