Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Profiling sparse DataFrame creation
import pandas as pd
from collections import defaultdict
from random import choice
# Alphabet that the random row/column labels are drawn from.
alph = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
# Function to make some test data
def make_dat(nrows=1000, ncols=1000, nvals=1000):
    """Generate random long-format records for the benchmarks.

    Args:
        nrows: Number of candidate row labels to generate.
        ncols: Number of candidate column labels to generate.
        nvals: Number of records to draw.

    Returns:
        A list of ``nvals`` dicts with the keys ``'row'``, ``'col'`` and
        ``'val'``. Row labels are 10 lowercase letters from ``alph``,
        column labels are the same letters upper-cased, and values are
        integers in ``[0, 100000000)``.

    Raises:
        IndexError: From ``random.choice`` when ``nvals > 0`` but ``nrows``
            or ``ncols`` is 0, because there is no label to choose from.
    """
    label_len = 10
    # Labels are drawn independently, so the pools may contain duplicates and
    # not every generated label necessarily ends up in a record.
    row_names = [''.join(choice(alph) for _ in range(label_len))
                 for _ in range(nrows)]
    col_names = [''.join(choice(alph).upper() for _ in range(label_len))
                 for _ in range(ncols)]
    cell_vals = range(100000000)
    # The dict literal evaluates row, col, val in that order, so the sequence
    # of random draws per record is row -> col -> val.
    return [{'row': choice(row_names),
             'col': choice(col_names),
             'val': choice(cell_vals)}
            for _ in range(nvals)]
# Function to make a dict of dicts from the test data
def make_dict_of_dicts(dat):
    """Convert long-format records into a column-major dict of dicts.

    Args:
        dat: Iterable of dicts with the keys ``'row'``, ``'col'`` and
            ``'val'``. It is consumed in a single pass, so a one-shot
            iterator is acceptable.

    Returns:
        A tuple ``(d_of_d, nrows, ncols, prop_filled)`` where

        * ``d_of_d`` is a ``defaultdict`` mapping
          ``col -> {row -> val}``; when a (row, col) pair occurs more than
          once the last value wins,
        * ``nrows`` / ``ncols`` are the numbers of distinct row and column
          labels that actually occur in ``dat``,
        * ``prop_filled`` is the fraction of the ``nrows * ncols`` cells
          that hold a value (``0.0`` for empty input).
    """
    d_of_d = defaultdict(dict)
    row_labels = set()
    for record in dat:
        d_of_d[record['col']][record['row']] = record['val']
        row_labels.add(record['row'])
    nrows = len(row_labels)
    # Every record creates or updates its column entry, so the outer keys are
    # exactly the distinct column labels.
    ncols = len(d_of_d)
    nvals = sum(len(column) for column in d_of_d.values())
    n_cells = nrows * ncols
    # Guard the empty-input case instead of dividing by zero; float() keeps
    # true division on interpreters where ``/`` on ints truncates.
    prop_filled = nvals / float(n_cells) if n_cells else 0.0
    return d_of_d, nrows, ncols, prop_filled
# Make a SparseDataFrame by sequentially adding columns
def make_sparse_df_columnwise(d_of_d):
    """Build a sparse frame by assigning one column at a time.

    Args:
        d_of_d: Mapping ``{column label: {row label: value}}``.

    Returns:
        A ``pd.SparseDataFrame`` when the installed pandas still provides
        one. Otherwise a ``pd.DataFrame`` whose columns use a float
        ``pd.SparseDtype`` with NaN as the fill value. Either way the index
        is the union of all row labels, in first-seen order.
    """
    # Seed the frame with the full row index. An empty frame would take its
    # index from the first column assigned, and every later column would be
    # aligned to it, silently losing rows the first column does not have.
    row_labels = list(dict.fromkeys(
        row for column in d_of_d.values() for row in column))

    # DataFrame.to_sparse and SparseDataFrame were removed together in
    # pandas 1.0, so the method's presence identifies the legacy API. (Some
    # 1.x releases still expose a placeholder ``pd.SparseDataFrame`` name,
    # which makes checking that attribute directly unreliable.)
    legacy = hasattr(pd.DataFrame, "to_sparse")
    if legacy:
        df = pd.SparseDataFrame(index=row_labels)
    else:
        df = pd.DataFrame(index=row_labels)
        sparse_dtype = pd.SparseDtype("float", float("nan"))

    for colname, vals in d_of_d.items():
        column = pd.Series(vals).reindex(row_labels)
        if not legacy:
            column = column.astype(sparse_dtype)
        df[colname] = column
    return df
# Make dense DataFrame, pivot it, and then make it sparse
def make_sparse_df_from_dense(dat):
    """Pivot long-format records densely, then convert the result to sparse.

    Args:
        dat: List of dicts with the keys ``'row'``, ``'col'`` and ``'val'``.

    Returns:
        The row-by-column pivot of ``dat`` in sparse form: the result of
        ``DataFrame.to_sparse()`` when the installed pandas still has that
        method, otherwise a ``pd.DataFrame`` whose columns use a float
        ``pd.SparseDtype`` with NaN as the fill value.
    """
    # pivot_table aggregates with its default function (the mean), so
    # duplicate (row, col) pairs are averaged rather than overwritten.
    dense = pd.DataFrame(dat).pivot_table(index="row", columns="col", values="val")
    if hasattr(dense, "to_sparse"):
        return dense.to_sparse()
    # to_sparse() was removed in pandas 1.0; casting to a sparse dtype is the
    # documented replacement.
    return dense.astype(pd.SparseDtype("float", float("nan")))
# Make sparse DataFrame from data in longitudinal format and then pivot it
def make_sparse_df_then_pivot(dat):
    """Convert long-format records to sparse storage first, then pivot.

    Args:
        dat: List of dicts with the keys ``'row'``, ``'col'`` and ``'val'``.

    Returns:
        The row-by-column pivot table of ``dat``, computed from a sparse
        long-format frame. The storage type of the result is whatever
        ``pivot_table`` yields for sparse input on the installed pandas.
    """
    frame = pd.DataFrame(dat)
    if hasattr(frame, "to_sparse"):
        sparse_frame = frame.to_sparse()
    else:
        # to_sparse() was removed in pandas 1.0. Only the numeric ``val``
        # column is made sparse here: ``row`` and ``col`` are the grouping
        # keys of the pivot and are left as ordinary columns.
        sparse_frame = frame.astype(
            {"val": pd.SparseDtype("float", float("nan"))})
    # pivot_table aggregates with its default function (the mean), so
    # duplicate (row, col) pairs are averaged.
    return sparse_frame.pivot_table(index="row", columns="col", values="val")
for nrows, ncols, nvals in [(100, 100, 100),
(100, 100, 1000),
(100, 1000, 1000),
(100, 1000, 10000)]:
dat = make_dat(nrows=nrows, ncols=ncols, nvals=nvals)
test_data, nrows, ncols, prop_filled = make_dict_of_dicts(dat)
print("Matrix: {} rows, {} cols, {}% full".format(nrows, ncols, round(prop_filled * 100, 3)))
print("Testing sparse DataFrame from nested dictionaries")
% timeit pd.SparseDataFrame(test_data)
print("Testing columnwise addition to sparse DataFrame")
% timeit make_sparse_df_columnwise(test_data)
print("Testing make dense DataFrame from longitudinal format, pivot it, then convert to sparse")
% timeit make_sparse_df_from_dense(dat)
print("Testing make sparse DataFrame from longitudinal format, then pivot it")
% timeit make_sparse_df_then_pivot(dat)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.