Skip to content

Instantly share code, notes, and snippets.

@alexhsamuel
Created September 20, 2015 21:49
Show Gist options
  • Save alexhsamuel/0c5cf9a3cb3d43ded8f5 to your computer and use it in GitHub Desktop.
Save alexhsamuel/0c5cf9a3cb3d43ded8f5 to your computer and use it in GitHub Desktop.
pickle size efficiency for Pandas dataframes
import numpy as np
import pickle
def pickle_size(obj, protocol=None):
    """Return the number of bytes in the pickled representation of `obj`.

    `protocol` is forwarded to `pickle.dumps`.  The default of `None` lets
    the pickle module pick its default protocol, which is what a plain
    `pickle.dumps(obj)` call does.
    """
    return len(pickle.dumps(obj, protocol))
def arr_size(arr):
    """Estimate the amount of payload data held by an array-like.

    Three cases are distinguished:

    * Objects exposing both `categories` and `codes` (categoricals): the
      estimates for the two parts are computed recursively and added.
    * Object-dtype arrays: the `len()` of every element is summed, so a
      string counts one unit per character.  Elements that have no length
      (for example `None` or NaN used as missing-value placeholders) are
      skipped and contribute nothing.
    * Anything else: the number of elements times the dtype's item size.
    """
    try:
        categories, codes = arr.categories, arr.codes
    except AttributeError:
        # Not a categorical; fall through to the dtype-based estimate.
        pass
    else:
        return arr_size(categories) + arr_size(codes)

    if arr.dtype == np.dtype(object):
        # Guard on __len__ so missing values do not raise TypeError.
        return sum(len(x) for x in arr if hasattr(x, "__len__"))

    # Take the item size from the dtype rather than from the container:
    # NumPy arrays expose `itemsize` directly, but a pandas Index (which is
    # what `categories` is) does not.
    return arr.size * arr.dtype.itemsize
def data_size(df):
    """Estimate the total payload size of a DataFrame's columns and index.

    Each column and the index are measured with `arr_size` on their
    `.values`, and the results are summed.
    """
    # Select columns by position rather than by label: with duplicated
    # column labels, `df[label]` returns a whole sub-DataFrame instead of a
    # single column, which would be measured incorrectly and repeatedly.
    columns = [df.iloc[:, i] for i in range(df.shape[1])]
    return sum(arr_size(part.values) for part in columns + [df.index])
def pickle_eff(df):
    """Return the ratio of the pickled size of `df` to its estimated data size.

    The numerator is `pickle_size(df)` and the denominator is
    `data_size(df)`; larger values mean more pickle bytes per unit of
    estimated payload.

    Raises:
        ZeroDivisionError: if `data_size(df)` is zero.
    """
    # float() forces true division even where `/` on two ints would floor
    # (Python 2 without `from __future__ import division`).
    return float(pickle_size(df)) / data_size(df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment