Last active
January 8, 2016 17:23
-
-
Save aabadie/7ca337aea0d6b9a5e7ac to your computer and use it in GitHub Desktop.
Test joblib compression
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Script comparing different pickling strategies.""" | |
from joblib.numpy_pickle import NumpyPickler, NumpyUnpickler | |
from joblib.numpy_pickle_utils import JoblibZFile | |
from joblib.numpy_pickle_utils import BinaryZlibFile, BinaryGzipFile | |
from pickle import _Pickler, _Unpickler, Pickler, Unpickler | |
import numpy as np | |
import bz2 | |
import lzma | |
import time | |
import io | |
import sys | |
import os | |
# Objects serialized by the benchmark.
DICT_SIZE = int(1e6)
ARRAY_SIZE = int(1e7)

# Half random, half constant: every other element is overwritten with 1.
arr = np.random.normal(size=ARRAY_SIZE)
arr[::2] = 1

# obj_names and objs are zipped together below, so both sequences must list
# their entries in the same order.
obj_names = (
    "dict",
    "list",
    "array semi-random",
    "array random",
    "array ones",
)
objs = (
    {key: str(key) for key in range(DICT_SIZE)},  # dict
    list(range(DICT_SIZE)),  # list
    arr,  # semi random array
    np.random.normal(size=ARRAY_SIZE),  # random array
    np.ones(ARRAY_SIZE),  # array full of ones
)
list_obj = dict(zip(obj_names, objs))
# The three pickler/unpickler pairs being compared, keyed by display name.
# Each value is a (pickler class, unpickler class) tuple.
picklers = dict(
    # Python implementation of Pickler/Unpickler
    Pickle=(_Pickler, _Unpickler),
    # C implementation of Pickler/Unpickler
    cPickle=(Pickler, Unpickler),
    # Joblib Pickler/Unpickler designed for numpy arrays
    Joblib=(NumpyPickler, NumpyUnpickler),
)
# compressors and fobjs are zipped together below, so both sequences must
# list their entries in the same order.
compressors = ("No", "Zlib", "Gzip", "Joblib", "Bz2", "Xz", "Lzma")

# Each entry is (file factory, path, mode used for writing, keyword arguments
# passed to the factory when the file is opened for writing).
fobjs = (
    (open, '/tmp/test_raw', 'wb', dict()),
    (BinaryZlibFile, '/tmp/test_zlib', 'wb', dict(compresslevel=3)),
    (BinaryGzipFile, '/tmp/test_gzip', 'wb', dict(compresslevel=3)),
    (JoblibZFile, '/tmp/test_joblib', 'wb', dict(compresslevel=3)),
    (bz2.BZ2File, '/tmp/test_bz2', 'wb', dict(compresslevel=3)),
    (
        lzma.LZMAFile,
        '/tmp/test_xz',
        'wb',
        dict(preset=3, check=lzma.CHECK_NONE),
    ),
    (
        lzma.LZMAFile,
        '/tmp/test_lzma',
        'wb',
        dict(preset=3, format=lzma.FORMAT_ALONE),
    ),
)
file_objs = {label: spec for label, spec in zip(compressors, fobjs)}
def fileobj(obj, fname, mode, kwargs):
    """Open ``fname`` with the file factory ``obj``.

    ``obj`` is called as ``obj(fname, mode, **kwargs)`` and whatever it
    returns is handed back unchanged.
    """
    opened = obj(fname, mode, **kwargs)
    return opened
def bufferize(f, buf):
    """Wrap the file object ``f`` with ``buf``.

    When ``buf`` is None no wrapping is done and ``f`` itself is returned;
    otherwise the result of calling ``buf(f)`` is returned.
    """
    return f if buf is None else buf(f)
def print_line(obj, strategy, buffer, pickler, dump, load, disk_used):
    """Print one ' | '-separated row of the results table.

    ``dump`` and ``load`` are formatted as floats with one decimal; every
    other column is formatted as a string.
    """
    row_format = '% 20s | %6s | % 14s | % 7s | % 5.1f | % 5.1f | % 5s'
    cells = (obj, strategy, buffer, pickler, dump, load, disk_used)
    print(row_format % cells)
class PickleBufferedWriter():
    """Protect the underlying fileobj against numerous calls to write.

    This is achieved by internally keeping a list of small chunks and
    only flushing to the backing fileobj if passed a large chunk or
    after a threshold on the number of small chunks.

    Parameters
    ----------
    fileobj : object with ``write`` and ``close`` methods
        The backing file object that receives the joined chunks.
    chunk_max_size : int
        A single chunk longer than this triggers an immediate flush.
    max_chunks : int
        A flush is triggered once more than this many chunks are pending.

    Instances can be used as context managers; leaving the ``with`` block
    closes the writer.
    """

    def __init__(self, fileobj, chunk_max_size=1024, max_chunks=1024):
        self._fileobj = fileobj
        self._chunks = chunks = []

        # As the `write` method is called many times by the pickler,
        # attribute look ups on the self's __dict__ are too expensive
        # hence we define a closure here with all the regularly
        # accessed parameters
        def _write(data):
            chunks.append(data)
            size = len(data)
            if size > chunk_max_size or len(chunks) > max_chunks:
                self.flush()
            # Report the number of bytes accepted, as file-like ``write``
            # methods do.
            return size

        self.write = _write

    def flush(self):
        """Write all pending chunks to the backing fileobj in one call.

        Nothing is written when no chunk is pending, so flushing (and thus
        closing) an idle writer never touches the backing fileobj.
        """
        chunks = self._chunks
        if not chunks:
            return
        self._fileobj.write(b''.join(chunks))
        del chunks[:]

    def close(self):
        """Flush pending chunks, then close the backing fileobj.

        The backing fileobj is closed even if the final flush raises, so a
        failing write cannot leak the underlying file.
        """
        try:
            self.flush()
        finally:
            self._fileobj.close()

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        self.close()
        # Never swallow an exception raised inside the ``with`` block.
        return False
# The three buffering strategies under test, keyed by display name. Each
# value is a (wrapper used when writing, wrapper used when reading) pair;
# None means the file object is used without any wrapper.
bufs = {}
bufs["None"] = (None, None)
bufs["io.Buffered"] = (io.BufferedWriter, io.BufferedReader)
bufs["PickleBuffered"] = (PickleBufferedWriter, io.BufferedReader)
# Results are printed as a ' | '-separated table: a header row, a separator
# row, then one row per (object, compressor, buffer, pickler) combination.
print('% 20s | %10s | % 12s | % 8s | % 9s | % 9s | % 5s' % (
    'Object', 'Compression', 'Buffer', 'Pickler/Unpickler',
    'dump time (s)', 'load time (s)', 'Disk used (MB)'))
print("--- | --- | --- | --- | --- | --- | ---")

for name in sorted(obj_names):
    # Looping over the objects (array, dict, etc)
    obj = list_obj[name]
    if isinstance(obj, np.ndarray):
        size = obj.nbytes / 1e6
    else:
        # NOTE: sys.getsizeof is shallow, it does not count the objects
        # referenced by the container.
        size = sys.getsizeof(obj) / 1e6

    for c in compressors:
        # Looping over the defined compressors
        opener, path, write_mode, write_kwargs = file_objs[c]

        for bname, buf in sorted(bufs.items()):
            # Looping over the buffering strategies
            write_buffer, read_buffer = buf

            for pname, pl in sorted(picklers.items()):
                # Looping over the picklers
                pickler_cls, unpickler_cls = pl

                # Now pickling the object in the file. perf_counter is a
                # monotonic high-resolution clock, unlike time.time which
                # can jump when the system clock is adjusted. The timed
                # span includes opening and closing the file.
                t0 = time.perf_counter()
                with bufferize(
                        fileobj(opener, path, write_mode, write_kwargs),
                        write_buffer) as f:
                    p = pickler_cls(f)
                    p.dump(obj)
                dtime = time.perf_counter() - t0

                # Now loading the object from the file
                t0 = time.perf_counter()
                with bufferize(fileobj(opener, path, 'rb', {}),
                               read_buffer) as f:
                    if unpickler_cls is NumpyUnpickler:
                        # The joblib unpickler is constructed with the file
                        # name in addition to the file handle.
                        p = unpickler_cls(path, f)
                    else:
                        p = unpickler_cls(f)
                    p.load()
                ltime = time.perf_counter() - t0

                print_line("{} ({}MB)".format(name, size),
                           c,
                           bname,
                           pname,
                           dtime,
                           ltime,
                           "{:.2f}".format(os.path.getsize(path) / 1e6))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment