@JoshRosen
Created August 20, 2012 05:44
Bulk de-pickling of Python objects
"""
Given a Python list containing pickled objects, joining the pickles into
a single pickled list and deserializing it can be faster than deserializing
each pickle individually, despite the extra string processing that this
requires.
The performance difference is much more pronounced if pickle is used instead of
cPickle.
NOTE: this code is specific to Pickle protocol 2; I'm not sure if these results
hold for the other protocols.
"""
import cPickle as pickle
from pickle import EMPTY_LIST, MARK, APPENDS, STOP
from timeit import timeit
from itertools import chain


def strip_pickle(pickled):
    """
    >>> pickle.dumps(1, 2)
    '\\x80\\x02K\\x01.'
    >>> strip_pickle(pickle.dumps(1, 2))
    'K\\x01'
    """
    # Strip out the PROTO from the start of the pickle and the STOP from the
    # end.
    return pickled[2:-1]


def pickle_as_list_of_pickles(objs):
    return [pickle.dumps(x, 2) for x in objs]


def as_chunks(l, n):
    for i in xrange(0, len(l), n):
        yield l[i:i+n]


def bulk_depickle(list_of_pickles, group_size=None):
    if group_size is None:
        group_size = len(list_of_pickles)

    def _do_bulk_depickle(list_of_pickles):
        combined = EMPTY_LIST + MARK + \
            ''.join(strip_pickle(x) for x in list_of_pickles) + APPENDS + STOP
        return pickle.loads(combined)

    chunks = as_chunks(list_of_pickles, group_size)
    return list(chain.from_iterable(_do_bulk_depickle(x) for x in chunks))


def individual_depickle(list_of_pickles):
    return [pickle.loads(x) for x in list_of_pickles]


def compare_performance(list_of_pickles, chunk_sizes=[2, 10, 100, 1000],
                        reps=10):
    for chunk_size in chunk_sizes:
        print "Bulk depickle (chunk size = %i):" % chunk_size,
        print timeit(lambda: bulk_depickle(list_of_pickles, chunk_size),
                     number=reps)
    print "Individual depickle:",
    print timeit(lambda: individual_depickle(list_of_pickles), number=reps)


if __name__ == "__main__":
    integers = pickle_as_list_of_pickles(range(10000))
    # Simple check for correctness
    assert bulk_depickle(integers, 200) == individual_depickle(integers)
    print "10000 integers:"
    compare_performance(integers)
    print
    print "10000 dicts (dict([ (str(n), n) for n in range(100) ])):"
    d = dict([ (str(n), n) for n in range(100) ])
    dicts = pickle_as_list_of_pickles(d for _ in range(10000))
    compare_performance(dicts)
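The script above is Python 2. As a rough sketch, the same splice carries over to Python 3 (where pickles are `bytes`), still relying on the opcode constants defined in `pickle` — note these are module attributes rather than documented public API:

```python
import pickle
from pickle import APPENDS, EMPTY_LIST, MARK, STOP  # opcode bytes; not public API


def strip_pickle(pickled):
    # Drop the 2-byte PROTO prefix and the trailing STOP opcode.
    return pickled[2:-1]


def bulk_depickle(list_of_pickles):
    # EMPTY_LIST pushes [], MARK opens the run of items, APPENDS extends the
    # list with everything pushed since the mark, and STOP ends the stream.
    combined = (EMPTY_LIST + MARK
                + b"".join(strip_pickle(p) for p in list_of_pickles)
                + APPENDS + STOP)
    return pickle.loads(combined)


pickles = [pickle.dumps(n, 2) for n in range(5)]
print(bulk_depickle(pickles))  # -> [0, 1, 2, 3, 4]
```

Memo PUT opcodes inside each sub-pickle are left in place; their indices restart at zero in every sub-pickle, but since the unpickler's memo is simply overwritten and each pickle's GETs only reference its own PUTs, the spliced stream still resolves correctly.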
@JoshRosen (Author)
Some benchmark numbers:

10000 integers:
Bulk depickle (chunk size = 2): 0.266709804535
Bulk depickle (chunk size = 10): 0.0797798633575
Bulk depickle (chunk size = 100): 0.0388460159302
Bulk depickle (chunk size = 1000): 0.0333180427551
Individual depickle: 0.0540158748627

10000 dicts (dict([ (str(n), n) for n in range(100) ])):
Bulk depickle (chunk size = 2): 2.70617198944
Bulk depickle (chunk size = 10): 2.30310201645
Bulk depickle (chunk size = 100): 2.22087192535
Bulk depickle (chunk size = 1000): 2.22118020058
Individual depickle: 2.44124102592

I'm running

Python 2.7.1 (r271:86832, Aug  5 2011, 03:30:24) 
[GCC 4.2.1 (Based on Apple Inc. build 5658) (LLVM build 2335.15.00)] on darwin

on a 2.3 GHz Retina MacBook Pro. Note that depickling items in very small batches (2 or 10) is more expensive than depickling them individually.
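One way to see where the small-batch overhead comes from: each spliced chunk adds only four framing opcode bytes, but every chunk costs a separate `pickle.loads` call. `pickletools.dis` makes the framing visible (a Python 3 illustration, built by hand exactly the way `_do_bulk_depickle` splices its stream):

```python
import pickle
import pickletools

# Strip PROTO (2 bytes) and STOP (1 byte) from each pickle, then splice:
# EMPTY_LIST, MARK, <payloads...>, APPENDS, STOP.
payloads = b"".join(pickle.dumps(n, 2)[2:-1] for n in (1, 2))
combined = (pickle.EMPTY_LIST + pickle.MARK + payloads
            + pickle.APPENDS + pickle.STOP)

pickletools.dis(combined)  # shows EMPTY_LIST, MARK, the two ints, APPENDS, STOP
assert pickle.loads(combined) == [1, 2]
```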
