Created
October 26, 2017 17:52
-
-
Save sminot/7ce6285f7f0c369f11f7353da24f55b7 to your computer and use it in GitHub Desktop.
Profiling sparse DataFrame creation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/ipython | |
import pandas as pd | |
from collections import defaultdict | |
from random import choice | |
# Ten-letter alphabet from which the random row/column labels are drawn.
alph = list('abcdefghij')
# Function to make some test data | |
def make_dat(nrows=1000, ncols=1000, nvals=1000):
    """Generate random test records in long ("one value per record") format.

    First draws ``nrows`` row labels (10 random letters from ``alph``) and
    ``ncols`` column labels (10 random letters from ``alph``, upper-cased).
    Then builds ``nvals`` records, each pairing a randomly chosen row label
    and column label with an integer drawn from ``range(100000000)``.

    Labels and (row, col) pairs are drawn independently, so repeats are
    possible and not every label necessarily appears in the output.

    Returns a list of dicts with the keys 'row', 'col' and 'val'.
    """
    label_length = 10

    row_names = []
    for _ in range(nrows):
        letters = [choice(alph) for _ in range(label_length)]
        row_names.append(''.join(letters))

    col_names = []
    for _ in range(ncols):
        letters = [choice(alph).upper() for _ in range(label_length)]
        col_names.append(''.join(letters))

    cell_vals = range(100000000)

    return [
        {'row': choice(row_names),
         'col': choice(col_names),
         'val': choice(cell_vals)}
        for _ in range(nvals)
    ]
# Function to make a dict of dicts from the test data | |
def make_dict_of_dicts(dat):
    """Convert long-format records into a column -> {row: value} mapping.

    ``dat`` is any iterable of dicts with the keys 'row', 'col' and 'val'.
    It is consumed in a single pass, so one-shot iterators work too. When
    the same (row, col) pair occurs more than once, the last value wins.

    Returns a tuple ``(d_of_d, nrows, ncols, prop_filled)``:

    * ``d_of_d`` -- ``defaultdict(dict)`` keyed by column label, then row label
    * ``nrows`` -- number of distinct row labels seen
    * ``ncols`` -- number of distinct column labels seen
    * ``prop_filled`` -- filled cells divided by ``nrows * ncols``;
      ``0.0`` for empty input (instead of raising ZeroDivisionError)
    """
    d_of_d = defaultdict(dict)
    row_labels = set()
    for d in dat:
        d_of_d[d['col']][d['row']] = d['val']
        row_labels.add(d['row'])

    nrows = len(row_labels)
    # Every record creates its column key, so the outer dict size is the
    # number of distinct columns.
    ncols = len(d_of_d)
    nvals = sum(len(cells) for cells in d_of_d.values())

    n_cells = nrows * ncols
    # float() keeps true division under Python 2 as well.
    prop_filled = nvals / float(n_cells) if n_cells else 0.0
    return d_of_d, nrows, ncols, prop_filled
# Make a SparseDataFrame by sequentially adding columns | |
def make_sparse_df_columnwise(d_of_d):
    """Build a sparse frame from a dict of dicts, one column at a time.

    ``d_of_d`` maps column label -> {row label: value}. The result has one
    row per distinct row label (sorted) and one column per key of
    ``d_of_d``; cells with no value are missing (NaN).

    On pandas releases that still provide ``pd.SparseDataFrame`` the result
    is a SparseDataFrame. Where that class is unavailable, the result is a
    regular DataFrame whose columns use ``pd.SparseDtype("float64")``.
    """
    # Column assignment conforms each Series to the frame's existing index.
    # Starting from an empty frame would therefore keep only the rows of the
    # first column, so the frame is created with every row label up front.
    all_rows = sorted({row for vals in d_of_d.values() for row in vals})

    if hasattr(pd, "SparseDataFrame"):
        df = pd.SparseDataFrame(index=all_rows)
        for colname, vals in d_of_d.items():
            df[colname] = pd.Series(vals)
        return df

    # Fallback for pandas without SparseDataFrame: sparse-typed columns in a
    # regular DataFrame, with NaN as the fill value.
    sparse_float = pd.SparseDtype("float64")
    df = pd.DataFrame(index=all_rows)
    for colname, vals in d_of_d.items():
        dense = pd.Series(vals, dtype="float64").reindex(all_rows)
        df[colname] = dense.astype(sparse_float)
    return df
# Make dense DataFrame, pivot it, and then make it sparse | |
def make_sparse_df_from_dense(dat):
    """Pivot long-format records into a dense matrix, then make it sparse.

    ``dat`` is a list of dicts with the keys 'row', 'col' and 'val'. The
    records are loaded into a DataFrame and pivoted with ``pivot_table``
    (rows indexed by 'row', columns by 'col'). Repeated (row, col) pairs are
    combined by ``pivot_table``'s default aggregation, and absent pairs
    become NaN.

    The dense result is converted with ``DataFrame.to_sparse()`` where that
    method exists. On pandas releases without it, the frame is cast to
    ``pd.SparseDtype("float64")`` instead.
    """
    dense = pd.DataFrame(dat).pivot_table(index="row", columns="col", values="val")
    if hasattr(dense, "to_sparse"):
        return dense.to_sparse()
    # Fallback for pandas without to_sparse(): sparse columns with NaN fill.
    return dense.astype(pd.SparseDtype("float64"))
# Make sparse DataFrame from data in longitudinal format and then pivot it | |
def make_sparse_df_then_pivot(dat):
    """Make the long-format records sparse first, then pivot them.

    ``dat`` is a list of dicts with the keys 'row', 'col' and 'val'. The
    records are loaded into a DataFrame, converted with ``to_sparse()``, and
    the sparse frame is pivoted with ``pivot_table`` (rows indexed by 'row',
    columns by 'col', cells taken from 'val').

    NOTE(review): relies on ``DataFrame.to_sparse``, which is not available
    in every pandas release -- confirm the pandas version this benchmark
    targets.
    """
    long_sparse = pd.DataFrame(dat).to_sparse()
    return long_sparse.pivot_table(index="row", columns="col", values="val")
for nrows, ncols, nvals in [(100, 100, 100), | |
(100, 100, 1000), | |
(100, 1000, 1000), | |
(100, 1000, 10000)]: | |
dat = make_dat(nrows=nrows, ncols=ncols, nvals=nvals) | |
test_data, nrows, ncols, prop_filled = make_dict_of_dicts(dat) | |
print("Matrix: {} rows, {} cols, {}% full".format(nrows, ncols, round(prop_filled * 100, 3))) | |
print("") | |
print("Testing sparse DataFrame from nested dictionaries") | |
% timeit pd.SparseDataFrame(test_data) | |
print("") | |
print("Testing columnwise addition to sparse DataFrame") | |
% timeit make_sparse_df_columnwise(test_data) | |
print("") | |
print("Testing make dense DataFrame from longitudinal format, pivot it, then convert to sparse") | |
% timeit make_sparse_df_from_dense(dat) | |
print("") | |
print("Testing make sparse DataFrame from longitudinal format, then pivot it") | |
% timeit make_sparse_df_then_pivot(dat) | |
print("") | |
print("") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment