Gist JoshRosen/3401373 — "Bulk de-pickling of Python objects" (created August 20, 2012).
""" | |
Given a Python list containing pickled objects, joining the pickles into | |
a single pickled list and deserializing it can be faster than deserializing | |
each pickle individually, despite the extra string processing that this | |
requires. | |
The performance difference is much more pronounced if pickle is used instead of | |
cPickle. | |
NOTE: this code is specific to Pickle protocol 2; I'm not sure if these results | |
hold for the other protocols. | |
""" | |
import cPickle as pickle | |
from pickle import EMPTY_LIST, MARK, APPENDS, STOP | |
from timeit import timeit | |
from itertools import chain | |
def strip_pickle(pickled):
    """
    Return only the payload opcodes of a protocol-2 pickle, with the
    leading PROTO frame and trailing STOP opcode removed.

    >>> pickle.dumps(1, 2)
    '\\x80\\x02K\\x01.'
    >>> strip_pickle(pickle.dumps(1, 2))
    'K\\x01'
    """
    # A protocol-2 pickle is: PROTO opcode + version byte (2 chars),
    # then the payload, then exactly one STOP opcode at the end.
    proto_header_len = 2
    stop_opcode_len = 1
    return pickled[proto_header_len:-stop_opcode_len]
def pickle_as_list_of_pickles(objs):
    """Pickle each object independently (protocol 2) and return the
    resulting pickles as a list, one per input object."""
    pickles = []
    for obj in objs:
        pickles.append(pickle.dumps(obj, 2))
    return pickles
def as_chunks(l, n):
    """
    Lazily yield successive slices of ``l`` of length ``n``; the final
    chunk is shorter when ``len(l)`` is not a multiple of ``n``.

    Uses plain index arithmetic instead of ``xrange`` so the helper is
    behavior-identical under Python 2 but no longer Python-2-only.
    """
    start = 0
    end = len(l)
    while start < end:
        yield l[start:start + n]
        start += n
def bulk_depickle(list_of_pickles, group_size=None):
    """
    Unpickle a list of individually-pickled objects by splicing their
    payload opcodes into synthetic pickled lists and loading each with a
    single ``pickle.loads`` call, ``group_size`` pickles at a time.
    When ``group_size`` is None, everything is loaded in one group.
    """
    if group_size is None:
        group_size = len(list_of_pickles)
    def _depickle_group(group):
        # EMPTY_LIST pushes a list, MARK delimits the spliced payloads,
        # and APPENDS extends the list with everything after the mark.
        body = ''.join(strip_pickle(p) for p in group)
        return pickle.loads(EMPTY_LIST + MARK + body + APPENDS + STOP)
    results = []
    for group in as_chunks(list_of_pickles, group_size):
        results.extend(_depickle_group(group))
    return results
def individual_depickle(list_of_pickles):
    """Unpickle every element with its own ``pickle.loads`` call — the
    baseline that ``bulk_depickle`` is benchmarked against."""
    results = []
    for pickled in list_of_pickles:
        results.append(pickle.loads(pickled))
    return results
def compare_performance(list_of_pickles, chunk_sizes=[2, 10, 100, 1000], | |
reps=10): | |
for chunk_size in chunk_sizes: | |
print "Bulk depickle (chunk size = %i):" % chunk_size, | |
print timeit(lambda: bulk_depickle(list_of_pickles, chunk_size), | |
number=reps) | |
print "Individual depickle:", | |
print timeit(lambda: individual_depickle(list_of_pickles), number=reps) | |
if __name__ == "__main__": | |
integers = pickle_as_list_of_pickles(range(10000)) | |
# Simple check for correctness | |
assert bulk_depickle(integers, 200) == individual_depickle(integers) | |
print "10000 integers:" | |
compare_performance(integers) | |
print "10000 dicts (dict([ (str(n), n) for n in range(100) ])):" | |
d = dict([ (str(n), n) for n in range(100) ]) | |
dicts = pickle_as_list_of_pickles(d for _ in range(10000)) | |
compare_performance(dicts) |
Comment: Some benchmark numbers — running on a 2.3 GHz Retina MacBook Pro. Note that depickling items in very small batches (2 or 10) is more expensive than depickling them individually.