aabadie/bench_joblib_compression.py

## bench_joblib_compression.py
"""Script comparing different pickling strategies."""

from joblib.numpy_pickle import NumpyPickler, NumpyUnpickler
from joblib.numpy_pickle_utils import JoblibZFile
from joblib.numpy_pickle_utils import BinaryZlibFile, BinaryGzipFile
from pickle import _Pickler, _Unpickler, Pickler, Unpickler
import numpy as np
import bz2
import lzma
import time
import io
import sys
import os

# Sizes of the benchmarked objects: number of entries in the dict and the
# list, and number of elements in each array.
DICT_SIZE = 10 ** 6
ARRAY_SIZE = 10 ** 7

# Semi-random array: normally distributed values in which every element
# at an even index is overwritten with 1.
arr = np.random.normal(size=ARRAY_SIZE)
arr[::2] = 1

# `obj_names` and `objs` are paired positionally by zip() below, so both
# tuples must list the objects in the same order.
obj_names = (
    "dict",
    "list",
    "array semi-random",
    "array random",
    "array ones",
)
objs = (
    {i: str(i) for i in range(DICT_SIZE)},  # dict of int -> str
    list(range(DICT_SIZE)),                 # list of ints
    arr,                                    # semi-random array
    np.random.normal(size=ARRAY_SIZE),      # random array
    np.ones(ARRAY_SIZE),                    # array full of ones
)
list_obj = dict(zip(obj_names, objs))

# The three (pickler class, unpickler class) pairs being compared:
#   Pickle  -- Python implementation from the pickle module
#   cPickle -- C implementation from the pickle module
#   Joblib  -- joblib pickler/unpickler designed for numpy arrays
picklers = dict(
    Pickle=(_Pickler, _Unpickler),
    cPickle=(Pickler, Unpickler),
    Joblib=(NumpyPickler, NumpyUnpickler),
)

# `compressors` and `fobjs` are paired positionally by zip() below as well.
# Each `fobjs` entry is (file-object factory, path, write mode, keyword
# arguments used when opening the file for writing).
compressors = ("No", "Zlib", "Gzip", "Joblib", "Bz2", "Xz", "Lzma")
fobjs = (
    (open, '/tmp/test_raw', 'wb', {}),
    (BinaryZlibFile, '/tmp/test_zlib', 'wb', {'compresslevel': 3}),
    (BinaryGzipFile, '/tmp/test_gzip', 'wb', {'compresslevel': 3}),
    (JoblibZFile, '/tmp/test_joblib', 'wb', {'compresslevel': 3}),
    (bz2.BZ2File, '/tmp/test_bz2', 'wb', {'compresslevel': 3}),
    (
        lzma.LZMAFile, '/tmp/test_xz', 'wb',
        {'preset': 3, 'check': lzma.CHECK_NONE},
    ),
    (
        lzma.LZMAFile, '/tmp/test_lzma', 'wb',
        {'preset': 3, 'format': lzma.FORMAT_ALONE},
    ),
)
file_objs = dict(zip(compressors, fobjs))


def fileobj(obj, fname, mode, kwargs=None):
    """Create a file object.

    `obj` is the file-object factory (e.g. ``open`` or a compressed-file
    class); it is called as ``obj(fname, mode, **kwargs)`` and its result
    is returned unchanged. `kwargs` is a dict of extra keyword arguments
    for the factory; ``None`` (the default) or an empty dict forwards no
    extra argument.
    """
    return obj(fname, mode, **(kwargs or {}))


def bufferize(f, buf):
    """Return `f` wrapped by `buf`, or `f` itself when `buf` is None."""
    return f if buf is None else buf(f)


def print_line(obj, strategy, buffer, pickler, dump, load, disk_used):
    """Print one row of the results table.

    The seven values are printed as pipe-separated, right-aligned
    columns; `dump` and `load` are formatted as floats with one decimal.
    """
    row_template = '% 20s | %6s | % 14s | % 7s | % 5.1f | % 5.1f | % 5s'
    row_values = (obj, strategy, buffer, pickler, dump, load, disk_used)
    print(row_template % row_values)


class PickleBufferedWriter:
    """Write-coalescing wrapper around a file object.

    Data passed to `write` is accumulated in an in-memory list and only
    forwarded to the wrapped file object, joined into a single bytes
    object, when a chunk longer than `chunk_max_size` arrives or when
    more than `max_chunks` chunks are pending. This protects the wrapped
    file object against numerous small calls to its own `write`.

    Instances are context managers: leaving the `with` block flushes the
    pending chunks and closes the wrapped file object.
    """

    def __init__(self, fileobj, chunk_max_size=1024, max_chunks=1024):
        pending = []
        self._fileobj = fileobj
        self._chunks = pending

        # `write` is called many times by the pickler, so it is built as
        # a closure over the values it needs instead of a regular method:
        # this avoids repeated attribute look-ups on the instance.
        def buffered_write(data):
            pending.append(data)
            if len(data) <= chunk_max_size and len(pending) <= max_chunks:
                return
            self.flush()

        self.write = buffered_write

    def flush(self):
        """Send all pending chunks to the wrapped file object in one write."""
        payload = b''.join(self._chunks)
        self._fileobj.write(payload)
        # Empty the list in place: the `write` closure shares this object.
        self._chunks.clear()

    def close(self):
        """Flush the pending chunks, then close the wrapped file object."""
        self.flush()
        self._fileobj.close()

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        self.close()
        # Never swallow an exception raised inside the `with` block.
        return False


# The three buffering strategies under test, as (writer wrapper, reader
# wrapper) pairs. None means the file object is used directly.
bufs = {
    "None": (None, None),
    "io.Buffered": (io.BufferedWriter, io.BufferedReader),
    "PickleBuffered": (PickleBufferedWriter, io.BufferedReader)
    }

# Header of the results table, followed by the separator row.
print('% 20s | %10s | % 12s | % 8s | % 9s | % 9s | % 5s' % (
      'Object', 'Compression', 'Buffer', 'Pickler/Unpickler',
      'dump time (s)', 'load time (s)', 'Disk used (MB)'))
print("--- | --- | --- | --- | --- | --- | ---")

for name in sorted(obj_names):
    # Looping over the objects (array, dict, etc)
    obj = list_obj[name]
    # In-memory size in MB. NOTE: sys.getsizeof is shallow, so for the
    # dict and the list it does not count the contained objects.
    if isinstance(obj, np.ndarray):
        size = obj.nbytes / 1e6
    else:
        size = sys.getsizeof(obj) / 1e6
    label = "{} ({}MB)".format(name, size)

    for c in compressors:
        # Looping over the defined compressors
        opener, path, mode, kwargs = file_objs[c]
        for bname, (wrap_writer, wrap_reader) in sorted(bufs.items()):
            # Looping over the buffering strategies
            for pname, (pickler_cls, unpickler_cls) in sorted(
                    picklers.items()):
                # Looping over the picklers.
                # Dump the object to the file. perf_counter() is used
                # rather than time() because it is monotonic and has the
                # highest available resolution.
                t0 = time.perf_counter()
                with bufferize(fileobj(opener, path, mode, kwargs),
                               wrap_writer) as f:
                    pickler_cls(f).dump(obj)
                dtime = time.perf_counter() - t0

                # Load the object back from the file.
                t0 = time.perf_counter()
                with bufferize(fileobj(opener, path, 'rb', {}),
                               wrap_reader) as f:
                    if unpickler_cls is NumpyUnpickler:
                        # NumpyUnpickler takes the file name first.
                        unpickler = unpickler_cls(path, f)
                    else:
                        unpickler = unpickler_cls(f)
                    unpickler.load()
                ltime = time.perf_counter() - t0

                print_line(label,
                           c,
                           bname,
                           pname,
                           dtime,
                           ltime,
                           "{:.2f}".format(os.path.getsize(path) / 1e6))
	"""Script comparing different pickling strategies."""

	from joblib.numpy_pickle import NumpyPickler, NumpyUnpickler
	from joblib.numpy_pickle_utils import JoblibZFile
	from joblib.numpy_pickle_utils import BinaryZlibFile, BinaryGzipFile
	from pickle import _Pickler, _Unpickler, Pickler, Unpickler
	import numpy as np
	import bz2
	import lzma
	import time
	import io
	import sys
	import os

	# Sizes of the benchmarked objects: number of entries in the dict and the
	# list, and number of elements in each array.
	DICT_SIZE = 10 ** 6
	ARRAY_SIZE = 10 ** 7

	# Semi-random array: normally distributed values in which every element
	# at an even index is overwritten with 1.
	arr = np.random.normal(size=ARRAY_SIZE)
	arr[::2] = 1

	# `obj_names` and `objs` are paired positionally by zip() below, so both
	# tuples must list the objects in the same order.
	obj_names = (
	    "dict",
	    "list",
	    "array semi-random",
	    "array random",
	    "array ones",
	)
	objs = (
	    {i: str(i) for i in range(DICT_SIZE)},  # dict of int -> str
	    list(range(DICT_SIZE)),                 # list of ints
	    arr,                                    # semi-random array
	    np.random.normal(size=ARRAY_SIZE),      # random array
	    np.ones(ARRAY_SIZE),                    # array full of ones
	)
	list_obj = dict(zip(obj_names, objs))

	# The three (pickler class, unpickler class) pairs being compared:
	#   Pickle  -- Python implementation from the pickle module
	#   cPickle -- C implementation from the pickle module
	#   Joblib  -- joblib pickler/unpickler designed for numpy arrays
	picklers = dict(
	    Pickle=(_Pickler, _Unpickler),
	    cPickle=(Pickler, Unpickler),
	    Joblib=(NumpyPickler, NumpyUnpickler),
	)

	# `compressors` and `fobjs` are paired positionally by zip() below as well.
	# Each `fobjs` entry is (file-object factory, path, write mode, keyword
	# arguments used when opening the file for writing).
	compressors = ("No", "Zlib", "Gzip", "Joblib", "Bz2", "Xz", "Lzma")
	fobjs = (
	    (open, '/tmp/test_raw', 'wb', {}),
	    (BinaryZlibFile, '/tmp/test_zlib', 'wb', {'compresslevel': 3}),
	    (BinaryGzipFile, '/tmp/test_gzip', 'wb', {'compresslevel': 3}),
	    (JoblibZFile, '/tmp/test_joblib', 'wb', {'compresslevel': 3}),
	    (bz2.BZ2File, '/tmp/test_bz2', 'wb', {'compresslevel': 3}),
	    (
	        lzma.LZMAFile, '/tmp/test_xz', 'wb',
	        {'preset': 3, 'check': lzma.CHECK_NONE},
	    ),
	    (
	        lzma.LZMAFile, '/tmp/test_lzma', 'wb',
	        {'preset': 3, 'format': lzma.FORMAT_ALONE},
	    ),
	)
	file_objs = dict(zip(compressors, fobjs))


	def fileobj(obj, fname, mode, kwargs=None):
	    """Create a file object.

	    `obj` is the file-object factory (e.g. ``open`` or a compressed-file
	    class); it is called as ``obj(fname, mode, **kwargs)`` and its result
	    is returned unchanged. `kwargs` is a dict of extra keyword arguments
	    for the factory; ``None`` (the default) or an empty dict forwards no
	    extra argument.
	    """
	    return obj(fname, mode, **(kwargs or {}))


	def bufferize(f, buf):
	    """Return `f` wrapped by `buf`, or `f` itself when `buf` is None."""
	    return f if buf is None else buf(f)


	def print_line(obj, strategy, buffer, pickler, dump, load, disk_used):
	    """Print one row of the results table.

	    The seven values are printed as pipe-separated, right-aligned
	    columns; `dump` and `load` are formatted as floats with one decimal.
	    """
	    # Plain '|' separators: the former '\|' was an invalid escape
	    # sequence that printed a literal backslash before every pipe.
	    row_template = '% 20s | %6s | % 14s | % 7s | % 5.1f | % 5.1f | % 5s'
	    row_values = (obj, strategy, buffer, pickler, dump, load, disk_used)
	    print(row_template % row_values)


	class PickleBufferedWriter:
	    """Write-coalescing wrapper around a file object.

	    Data passed to `write` is accumulated in an in-memory list and only
	    forwarded to the wrapped file object, joined into a single bytes
	    object, when a chunk longer than `chunk_max_size` arrives or when
	    more than `max_chunks` chunks are pending. This protects the wrapped
	    file object against numerous small calls to its own `write`.

	    Instances are context managers: leaving the `with` block flushes the
	    pending chunks and closes the wrapped file object.
	    """

	    def __init__(self, fileobj, chunk_max_size=1024, max_chunks=1024):
	        pending = []
	        self._fileobj = fileobj
	        self._chunks = pending

	        # `write` is called many times by the pickler, so it is built as
	        # a closure over the values it needs instead of a regular method:
	        # this avoids repeated attribute look-ups on the instance.
	        def buffered_write(data):
	            pending.append(data)
	            if len(data) <= chunk_max_size and len(pending) <= max_chunks:
	                return
	            self.flush()

	        self.write = buffered_write

	    def flush(self):
	        """Send all pending chunks to the wrapped file object in one write."""
	        payload = b''.join(self._chunks)
	        self._fileobj.write(payload)
	        # Empty the list in place: the `write` closure shares this object.
	        self._chunks.clear()

	    def close(self):
	        """Flush the pending chunks, then close the wrapped file object."""
	        self.flush()
	        self._fileobj.close()

	    def __enter__(self):
	        return self

	    def __exit__(self, *exc):
	        self.close()
	        # Never swallow an exception raised inside the `with` block.
	        return False


	# The three buffering strategies under test, as (writer wrapper, reader
	# wrapper) pairs. None means the file object is used directly.
	bufs = {
	    "None": (None, None),
	    "io.Buffered": (io.BufferedWriter, io.BufferedReader),
	    "PickleBuffered": (PickleBufferedWriter, io.BufferedReader)
	    }

	# Header of the results table, followed by the separator row. Plain '|'
	# separators are used: '\|' is an invalid escape sequence that would
	# print a literal backslash before every pipe.
	print('% 20s | %10s | % 12s | % 8s | % 9s | % 9s | % 5s' % (
	      'Object', 'Compression', 'Buffer', 'Pickler/Unpickler',
	      'dump time (s)', 'load time (s)', 'Disk used (MB)'))
	print("--- | --- | --- | --- | --- | --- | ---")

	for name in sorted(obj_names):
	    # Looping over the objects (array, dict, etc)
	    obj = list_obj[name]
	    # In-memory size in MB. NOTE: sys.getsizeof is shallow, so for the
	    # dict and the list it does not count the contained objects.
	    if isinstance(obj, np.ndarray):
	        size = obj.nbytes / 1e6
	    else:
	        size = sys.getsizeof(obj) / 1e6
	    label = "{} ({}MB)".format(name, size)

	    for c in compressors:
	        # Looping over the defined compressors
	        opener, path, mode, kwargs = file_objs[c]
	        for bname, (wrap_writer, wrap_reader) in sorted(bufs.items()):
	            # Looping over the buffering strategies
	            for pname, (pickler_cls, unpickler_cls) in sorted(
	                    picklers.items()):
	                # Looping over the picklers.
	                # Dump the object to the file. perf_counter() is used
	                # rather than time() because it is monotonic and has the
	                # highest available resolution.
	                t0 = time.perf_counter()
	                with bufferize(fileobj(opener, path, mode, kwargs),
	                               wrap_writer) as f:
	                    pickler_cls(f).dump(obj)
	                dtime = time.perf_counter() - t0

	                # Load the object back from the file.
	                t0 = time.perf_counter()
	                with bufferize(fileobj(opener, path, 'rb', {}),
	                               wrap_reader) as f:
	                    if unpickler_cls is NumpyUnpickler:
	                        # NumpyUnpickler takes the file name first.
	                        unpickler = unpickler_cls(path, f)
	                    else:
	                        unpickler = unpickler_cls(f)
	                    unpickler.load()
	                ltime = time.perf_counter() - t0

	                print_line(label,
	                           c,
	                           bname,
	                           pname,
	                           dtime,
	                           ltime,
	                           "{:.2f}".format(os.path.getsize(path) / 1e6))