Skip to content

Instantly share code, notes, and snippets.

@aabadie
Last active January 8, 2016 17:23
Show Gist options
  • Save aabadie/7ca337aea0d6b9a5e7ac to your computer and use it in GitHub Desktop.
Save aabadie/7ca337aea0d6b9a5e7ac to your computer and use it in GitHub Desktop.
Test joblib compression
"""Script comparing different pickling strategies."""
from joblib.numpy_pickle import NumpyPickler, NumpyUnpickler
from joblib.numpy_pickle_utils import JoblibZFile
from joblib.numpy_pickle_utils import BinaryZlibFile, BinaryGzipFile
from pickle import _Pickler, _Unpickler, Pickler, Unpickler
import numpy as np
import bz2
import lzma
import time
import io
import sys
import os
# Payloads pickled by this benchmark; the two sizes below drive the
# containers and the arrays respectively.
DICT_SIZE = int(1e6)
ARRAY_SIZE = int(1e7)

# Semi-random array: normally distributed samples where every element at an
# even index is then overwritten with 1.
arr = np.random.normal(size=(ARRAY_SIZE))
arr[::2] = 1

# Names and objects are paired positionally by zip() below, so both tuples
# must be kept in the same order.
obj_names = (
    "dict",
    "list",
    "array semi-random",
    "array random",
    "array ones",
)
objs = (
    {i: str(i) for i in range(DICT_SIZE)},   # dict: int -> its decimal string
    list(range(DICT_SIZE)),                  # list of consecutive ints
    arr,                                     # semi random array
    np.random.normal(size=(ARRAY_SIZE)),     # random array
    np.ones((ARRAY_SIZE)),                   # array full of ones
)
# Despite its name this is a dict mapping each object name to its payload.
list_obj = dict(zip(obj_names, objs))
# Three pickler flavours are benchmarked. Each entry maps a display name to
# a (pickler class, unpickler class) pair.
picklers = dict(
    # Python implementation of Pickler/Unpickler
    Pickle=(_Pickler, _Unpickler),
    # C implementation of Pickler/Unpickler
    cPickle=(Pickler, Unpickler),
    # Joblib Pickler/Unpickler designed for numpy arrays.
    Joblib=(NumpyPickler, NumpyUnpickler),
)
# Compressor labels and file-object specs are paired positionally by zip()
# below, so the two tuples must be kept in the same order.
compressors = ("No", "Zlib", "Gzip", "Joblib", "Bz2", "Xz", "Lzma")
# Each spec is (file factory, path, write mode, factory keyword arguments).
fobjs = (
    (open, '/tmp/test_raw', 'wb', dict()),
    (BinaryZlibFile, '/tmp/test_zlib', 'wb', dict(compresslevel=3)),
    (BinaryGzipFile, '/tmp/test_gzip', 'wb', dict(compresslevel=3)),
    (JoblibZFile, '/tmp/test_joblib', 'wb', dict(compresslevel=3)),
    (bz2.BZ2File, '/tmp/test_bz2', 'wb', dict(compresslevel=3)),
    (lzma.LZMAFile, '/tmp/test_xz', 'wb',
     dict(preset=3, check=lzma.CHECK_NONE)),
    (lzma.LZMAFile, '/tmp/test_lzma', 'wb',
     dict(preset=3, format=lzma.FORMAT_ALONE)),
)
file_objs = dict(zip(compressors, fobjs))
def fileobj(obj, fname, mode, kwargs):
    """Create a file object.

    Parameters
    ----------
    obj : callable
        Factory invoked as ``obj(fname, mode, **kwargs)``.
    fname
        Passed to ``obj`` as its first positional argument.
    mode
        Passed to ``obj`` as its second positional argument.
    kwargs : dict
        Extra keyword arguments expanded into the call.

    Returns
    -------
    Whatever ``obj`` returns.
    """
    return obj(fname, mode, **kwargs)
def bufferize(f, buf):
    """Bufferize a fileobject using buf.

    Parameters
    ----------
    f
        The file object to wrap.
    buf : callable or None
        Wrapper called with ``f`` as its only argument. When ``None``, no
        wrapping takes place.

    Returns
    -------
    ``f`` itself when ``buf`` is ``None``, otherwise ``buf(f)``.
    """
    if buf is None:
        return f
    return buf(f)
def print_line(obj, strategy, buffer, pickler, dump, load, disk_used):
    """Nice printing function.

    Prints one ``|``-separated result row to stdout.

    Parameters
    ----------
    obj, strategy, buffer, pickler
        Values rendered with ``%s`` in right-aligned columns.
    dump, load : float
        Values rendered with one decimal place.
    disk_used
        Value rendered with ``%s`` in the last column.
    """
    print('% 20s | %6s | % 14s | % 7s | % 5.1f | % 5.1f | % 5s' % (
        obj, strategy, buffer, pickler, dump, load, disk_used))
class PickleBufferedWriter():
    """Protect the underlying fileobj against numerous calls to write.

    This is achieved by internally keeping a list of small chunks and
    only flushing to the backing fileobj if passed a large chunk or
    after a threshold on the number of small chunks.

    Parameters
    ----------
    fileobj
        Backing object; its ``write`` and ``close`` methods are used.
    chunk_max_size : int
        A single chunk longer than this triggers an immediate flush.
    max_chunks : int
        Holding more than this many pending chunks triggers a flush.

    The instance exposes ``write(data)`` (installed in ``__init__``),
    ``flush()``, ``close()`` and the context-manager protocol.
    """

    def __init__(self, fileobj, chunk_max_size=1024, max_chunks=1024):
        self._fileobj = fileobj
        self._chunks = chunks = []

        # As the `write` method is called many times by the pickler,
        # attribute look ups on the self's __dict__ are too expensive
        # hence we define a closure here with all the regularly
        # accessed parameters
        def _write(data):
            chunks.append(data)
            if len(data) > chunk_max_size or len(chunks) > max_chunks:
                self.flush()

        self.write = _write

    def flush(self):
        """Write all pending chunks to the backing fileobj in one call."""
        if self._chunks:
            self._fileobj.write(b''.join(self._chunks))
            # Clear in place: the `_write` closure holds a reference to
            # this very list, so it must not be rebound.
            del self._chunks[:]

    def close(self):
        """Flush pending chunks, then close the backing fileobj.

        The backing fileobj is closed even if the final flush raises, so
        the underlying resource is never leaked.
        """
        try:
            self.flush()
        finally:
            self._fileobj.close()

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        self.close()
        # Never swallow an exception raised inside the `with` body.
        return False
# Three buffering strategies are compared. Each entry maps a display name to
# a (writer wrapper, reader wrapper) pair; None stands for "no wrapper".
bufs = dict([
    ("None", (None, None)),
    ("io.Buffered", (io.BufferedWriter, io.BufferedReader)),
    ("PickleBuffered", (PickleBufferedWriter, io.BufferedReader)),
])
# Header of the result table, followed by a markdown-style separator row.
print('% 20s | %10s | % 12s | % 8s | % 9s | % 9s | % 5s' % (
    'Object', 'Compression', 'Buffer', 'Pickler/Unpickler',
    'dump time (s)', 'load time (s)', 'Disk used (MB)'))
print("--- | --- | --- | --- | --- | --- | ---")

for name in sorted(obj_names):
    # Looping over the objects (array, dict, etc)
    obj = list_obj[name]
    if isinstance(obj, np.ndarray):
        size = obj.nbytes / 1e6
    else:
        # NOTE: sys.getsizeof is shallow, so for containers this is the size
        # of the container itself, not of the elements it references.
        size = sys.getsizeof(obj) / 1e6
    label = "{} ({}MB)".format(name, size)

    for c in compressors:
        # Looping over the defined compressors
        opener, path, write_mode, write_kwargs = file_objs[c]

        for bname, (write_buf, read_buf) in sorted(bufs.items()):
            # Looping over the buffering strategies

            for pname, (pickler_cls, unpickler_cls) in sorted(
                    picklers.items()):
                # Looping over the picklers

                # Now pickling the object in the file. perf_counter is a
                # monotonic, high-resolution clock meant for measuring
                # durations. The measured time includes leaving the `with`
                # block, i.e. the final flush/close of the file object.
                t0 = time.perf_counter()
                with bufferize(
                        fileobj(opener, path, write_mode, write_kwargs),
                        write_buf) as f:
                    pickler_cls(f).dump(obj)
                dtime = time.perf_counter() - t0

                # Now loading the object from the file
                t0 = time.perf_counter()
                with bufferize(fileobj(opener, path, 'rb', {}),
                               read_buf) as f:
                    if unpickler_cls is NumpyUnpickler:
                        # The joblib unpickler takes the file name as well
                        # as the open file handle.
                        unpickler = unpickler_cls(path, f)
                    else:
                        unpickler = unpickler_cls(f)
                    unpickler.load()
                ltime = time.perf_counter() - t0

                print_line(label,
                           c,
                           bname,
                           pname,
                           dtime,
                           ltime,
                           "{:.2f}".format(os.path.getsize(path) / 1e6))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment