Skip to content

Instantly share code, notes, and snippets.

@sminot
Created October 26, 2017 17:52
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sminot/7ce6285f7f0c369f11f7353da24f55b7 to your computer and use it in GitHub Desktop.
Save sminot/7ce6285f7f0c369f11f7353da24f55b7 to your computer and use it in GitHub Desktop.
Profiling sparse DataFrame creation
#!/usr/local/bin/ipython
import pandas as pd
from collections import defaultdict
from random import choice
# Alphabet that random row/column labels are drawn from
alph = list('abcdefghij')


# Function to make some test data
def make_dat(nrows=1000, ncols=1000, nvals=1000):
    """Generate random test records in long (row, col, val) form.

    Args:
        nrows: Number of candidate row labels to generate.
        ncols: Number of candidate column labels to generate.
        nvals: Number of records to sample.

    Returns:
        A list of ``nvals`` dicts with keys ``'row'``, ``'col'`` and ``'val'``.
        Row labels are 10 lowercase letters, column labels are 10 uppercase
        letters, and values are integers in ``[0, 100000000)``. Labels are
        sampled with replacement, so a (row, col) pair may repeat and some
        candidate labels may never be used.
    """
    def random_label():
        # Ten independent draws from the module-level alphabet.
        return ''.join(choice(alph) for _ in range(10))

    row_names = [random_label() for _ in range(nrows)]
    col_names = [random_label().upper() for _ in range(ncols)]
    cell_vals = range(100000000)
    return [
        {'row': choice(row_names),
         'col': choice(col_names),
         'val': choice(cell_vals)}
        for _ in range(nvals)
    ]
# Function to make a dict of dicts from the test data
def make_dict_of_dicts(dat):
    """Convert long-format records into a nested ``{col: {row: val}}`` mapping.

    Args:
        dat: Iterable of dicts with keys ``'row'``, ``'col'`` and ``'val'``.
            It is traversed exactly once, so a generator is acceptable.

    Returns:
        A tuple ``(d_of_d, nrows, ncols, prop_filled)`` where ``d_of_d`` is a
        ``defaultdict(dict)`` keyed by column then row, ``nrows`` / ``ncols``
        are the numbers of distinct row / column labels seen, and
        ``prop_filled`` is the fraction of the ``nrows * ncols`` grid that
        holds a value (``0.0`` when there is no data at all).

    When the same (row, col) pair occurs more than once, the last record wins.
    """
    d_of_d = defaultdict(dict)
    row_labels = set()
    for record in dat:
        d_of_d[record['col']][record['row']] = record['val']
        row_labels.add(record['row'])

    nrows = len(row_labels)
    # Every record creates (or reuses) its column's entry, so the outer keys
    # are exactly the distinct column labels.
    ncols = len(d_of_d)
    nvals = sum(len(rows) for rows in d_of_d.values())

    n_cells = nrows * ncols
    # Guard the division: empty input has zero cells and would otherwise
    # raise ZeroDivisionError.
    prop_filled = nvals / float(n_cells) if n_cells else 0.0
    return d_of_d, nrows, ncols, prop_filled
# Make a sparse DataFrame by sequentially adding columns
def make_sparse_df_columnwise(d_of_d):
    """Build a sparse frame one column at a time from ``{col: {row: val}}``.

    Args:
        d_of_d: Mapping of column label -> mapping of row label -> value.

    Returns:
        A ``pd.SparseDataFrame`` when the installed pandas still provides that
        class (it was removed in pandas 1.0); otherwise a regular
        ``pd.DataFrame`` whose columns are float64 sparse arrays with NaN as
        the fill value. Either way the index is the union of all row labels,
        in first-seen order.
    """
    # Fix the index up front. Assigning a Series to an initially empty frame
    # adopts that Series' index, and every later assignment is aligned to it,
    # which would silently drop rows that only appear in later columns.
    row_labels = pd.Index(
        [row for rows in d_of_d.values() for row in rows]
    ).unique()

    sparse_frame_cls = getattr(pd, "SparseDataFrame", None)
    if sparse_frame_cls is not None:
        df = sparse_frame_cls(index=row_labels)
        for colname, vals in d_of_d.items():
            df[colname] = pd.Series(vals)
        return df

    # pandas >= 1.0: sparse storage is a column dtype on a regular DataFrame.
    df = pd.DataFrame(index=row_labels)
    for colname, vals in d_of_d.items():
        column = pd.Series(vals, dtype="float64").reindex(row_labels)
        df[colname] = column.astype(pd.SparseDtype("float64"))
    return df
# Make dense DataFrame, pivot it, and then make it sparse
def make_sparse_df_from_dense(dat):
    """Pivot long-format records densely, then convert the result to sparse.

    Args:
        dat: List of dicts with keys ``'row'``, ``'col'`` and ``'val'``.

    Returns:
        A row-by-column table of values in sparse form. With pandas < 1.0 this
        is whatever ``DataFrame.to_sparse()`` returns; on newer pandas, where
        that method no longer exists, it is a ``pd.DataFrame`` whose columns
        use a float64 sparse dtype with NaN as the fill value.

    ``pivot_table`` aggregates with its default function (the mean), so
    repeated (row, col) pairs are averaged rather than overwritten.
    """
    dense = pd.DataFrame(dat).pivot_table(index="row", columns="col", values="val")
    if hasattr(dense, "to_sparse"):
        return dense.to_sparse()
    # DataFrame.to_sparse() was removed in pandas 1.0; a sparse column dtype
    # is its replacement.
    return dense.astype(pd.SparseDtype("float64"))
# Make sparse DataFrame from data in longitudinal format and then pivot it
def make_sparse_df_then_pivot(dat):
    """Convert long-format records to a sparse frame first, then pivot it.

    Args:
        dat: List of dicts with keys ``'row'``, ``'col'`` and ``'val'``.

    Returns:
        The result of ``pivot_table`` on the sparse long-format frame, with
        rows indexed by ``'row'`` and columns by ``'col'``.

    NOTE: this relies on ``DataFrame.to_sparse()``, which was removed in
    pandas 1.0, so on newer pandas the call raises ``AttributeError``.
    """
    long_frame = pd.DataFrame(dat)
    sparse_long_frame = long_frame.to_sparse()
    pivoted = sparse_long_frame.pivot_table(
        index="row",
        columns="col",
        values="val",
    )
    return pivoted
# Benchmark every construction strategy over a few matrix sizes. Each tuple is
# (candidate row labels, candidate column labels, number of sampled cells).
# The `% timeit` lines are IPython line magics, so this section is meant to be
# run under IPython (see the shebang) rather than plain `python`; the magics
# evaluate their expression against the module-level names `test_data` / `dat`.
for nrows, ncols, nvals in [(100, 100, 100),
                            (100, 100, 1000),
                            (100, 1000, 1000),
                            (100, 1000, 10000)]:
    dat = make_dat(nrows=nrows, ncols=ncols, nvals=nvals)
    # nrows / ncols are rebound here to the counts reported by
    # make_dict_of_dicts (distinct labels actually present in dat), so the
    # printed matrix size can be smaller than the size requested above.
    test_data, nrows, ncols, prop_filled = make_dict_of_dicts(dat)
    print("Matrix: {} rows, {} cols, {}% full".format(nrows, ncols, round(prop_filled * 100, 3)))
    print("")
    print("Testing sparse DataFrame from nested dictionaries")
    # NOTE(review): pd.SparseDataFrame was removed in pandas 1.0, so this
    # timing needs an older pandas; confirm the target version.
    % timeit pd.SparseDataFrame(test_data)
    print("")
    print("Testing columnwise addition to sparse DataFrame")
    % timeit make_sparse_df_columnwise(test_data)
    print("")
    print("Testing make dense DataFrame from longitudinal format, pivot it, then convert to sparse")
    % timeit make_sparse_df_from_dense(dat)
    print("")
    print("Testing make sparse DataFrame from longitudinal format, then pivot it")
    % timeit make_sparse_df_then_pivot(dat)
    print("")
    print("")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment