sminot/test_sparse_dataframe_creation.ipy

## test_sparse_dataframe_creation.ipy
#!/usr/local/bin/ipython

import pandas as pd
from collections import defaultdict
from random import choice

alph = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

# Function to make some test data
def make_dat(nrows=1000, ncols=1000, nvals=1000):
    """Generate random long-format records for the sparse-frame benchmarks.

    Args:
        nrows: Number of candidate row names to generate.
        ncols: Number of candidate column names to generate.
        nvals: Number of records to draw.

    Returns:
        A list of ``nvals`` dicts with keys ``'row'``, ``'col'`` and ``'val'``.
        Row names are 10 letters drawn from ``alph``; column names are the
        upper-cased equivalent; values are ints in ``[0, 100000000)``.
        Names are sampled with replacement, so the same (row, col) pair may
        appear more than once and not every candidate name need be used.

    Raises:
        IndexError: If ``nvals`` > 0 while ``nrows`` or ``ncols`` is 0
            (``choice`` cannot pick from an empty list).
    """
    # Imported locally so the block does not depend on the file's import list.
    from random import randrange

    row_names = [''.join(choice(alph) for _ in range(10)) for _ in range(nrows)]
    col_names = [''.join(choice(alph).upper() for _ in range(10)) for _ in range(ncols)]

    # randrange draws from the same interval as choice(range(100000000)) but
    # never materialises the range (a 100M-element list on Python 2).
    return [{'row': choice(row_names),
             'col': choice(col_names),
             'val': randrange(100000000)}
            for _ in range(nvals)]

# Function to make a dict of dicts from the test data
def make_dict_of_dicts(dat):
    """Regroup long-format records into a column -> {row: value} mapping.

    Args:
        dat: Iterable of dicts, each with ``'row'``, ``'col'`` and ``'val'``
            keys. It is consumed in a single pass, so a one-shot iterator
            works as well as a list.

    Returns:
        A tuple ``(d_of_d, nrows, ncols, prop_filled)`` where ``d_of_d`` is a
        ``defaultdict(dict)`` keyed by column name then row name, ``nrows`` and
        ``ncols`` are the numbers of distinct row and column names seen, and
        ``prop_filled`` is the fraction of the ``nrows * ncols`` cells that
        hold a value (``0.0`` when ``dat`` is empty).
    """
    d_of_d = defaultdict(dict)
    row_names = set()
    for d in dat:
        # A repeated (row, col) pair overwrites the earlier value.
        d_of_d[d['col']][d['row']] = d['val']
        row_names.add(d['row'])

    nrows = len(row_names)
    ncols = len(d_of_d)
    nvals = sum(len(v) for v in d_of_d.values())

    # Guard the empty case: there are no cells, so nothing is filled.
    n_cells = nrows * ncols
    prop_filled = nvals / float(n_cells) if n_cells else 0.0
    return d_of_d, nrows, ncols, prop_filled


# Make a SparseDataFrame by sequentially adding columns
def make_sparse_df_columnwise(d_of_d):
    """Build a SparseDataFrame by assigning one column at a time.

    Args:
        d_of_d: Mapping of column name -> {row name: value}.

    Returns:
        The ``pd.SparseDataFrame`` after every column has been assigned.

    Note:
        ``pd.SparseDataFrame`` only exists in pandas releases before 1.0.
    """
    sparse_frame = pd.SparseDataFrame()
    for column_label in d_of_d:
        sparse_frame[column_label] = pd.Series(d_of_d[column_label])
    return sparse_frame


# Make dense DataFrame, pivot it, and then make it sparse
def make_sparse_df_from_dense(dat):
    """Pivot long-format records into a dense frame, then convert it to sparse.

    Args:
        dat: List of dicts with ``'row'``, ``'col'`` and ``'val'`` keys.

    Returns:
        The pivoted table (rows x columns) in sparse form. On pandas < 1.0 this
        is the result of ``DataFrame.to_sparse()``; on newer releases, where
        that method no longer exists, it is a ``DataFrame`` whose columns use
        ``SparseDtype("float", nan)``.
    """
    # pivot_table's default aggregation (mean) collapses repeated (row, col)
    # pairs and leaves NaN in cells that have no record.
    dense = pd.DataFrame(dat).pivot_table(index="row", columns="col", values="val")

    # Check the class, not the instance, so a column that happens to be named
    # "to_sparse" cannot be mistaken for the method.
    if hasattr(pd.DataFrame, "to_sparse"):
        return dense.to_sparse()
    return dense.astype(pd.SparseDtype("float", float("nan")))


# Make sparse DataFrame from data in longitudinal format and then pivot it
def make_sparse_df_then_pivot(dat):
    """Convert the long-format records to sparse first, then pivot.

    Args:
        dat: List of dicts with ``'row'``, ``'col'`` and ``'val'`` keys.

    Returns:
        Whatever ``pivot_table`` yields when called on the sparse long-format
        frame with rows as the index and columns as the columns.

    Note:
        ``DataFrame.to_sparse()`` only exists in pandas releases before 1.0.
    """
    long_frame = pd.DataFrame(dat)
    sparse_long = long_frame.to_sparse()
    return sparse_long.pivot_table(index="row", columns="col", values="val")

# Benchmark every construction strategy on a series of input sizes.
# Each tuple is (candidate row names, candidate column names, records drawn).
# The `% timeit` lines are IPython line magics, so this file has to be run
# with IPython rather than the plain Python interpreter.
for nrows, ncols, nvals in [(100, 100, 100),
                            (100, 100, 1000),
                            (100, 1000, 1000),
                            (100, 1000, 10000)]:
    dat = make_dat(nrows=nrows, ncols=ncols, nvals=nvals)
    # nrows and ncols are rebound here to the counts of distinct names that
    # actually occur in dat, so the report describes the realised matrix.
    test_data, nrows, ncols, prop_filled = make_dict_of_dicts(dat)
    print("Matrix: {} rows, {} cols, {}% full".format(nrows, ncols, round(prop_filled * 100, 3)))
    print("")

    # Strategy 1: hand the nested dict straight to the constructor.
    print("Testing sparse DataFrame from nested dictionaries")
    % timeit pd.SparseDataFrame(test_data)
    print("")

    # Strategy 2: start empty and assign one column at a time.
    print("Testing columnwise addition to sparse DataFrame")
    % timeit make_sparse_df_columnwise(test_data)
    print("")

    # Strategy 3: dense long-format frame -> pivot -> to_sparse.
    print("Testing make dense DataFrame from longitudinal format, pivot it, then convert to sparse")
    % timeit make_sparse_df_from_dense(dat)
    print("")

    # Strategy 4: long-format frame -> to_sparse -> pivot.
    print("Testing make sparse DataFrame from longitudinal format, then pivot it")
    % timeit make_sparse_df_then_pivot(dat)
    print("")

    print("")
	#!/usr/local/bin/ipython

	import pandas as pd
	from collections import defaultdict
	from random import choice

	alph = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

	# Function to make some test data
	def make_dat(nrows=1000, ncols=1000, nvals=1000):
	    """Generate random long-format records for the sparse-frame benchmarks.

	    Args:
	        nrows: Number of candidate row names to generate.
	        ncols: Number of candidate column names to generate.
	        nvals: Number of records to draw.

	    Returns:
	        A list of ``nvals`` dicts with keys ``'row'``, ``'col'`` and ``'val'``.
	        Row names are 10 letters drawn from ``alph``; column names are the
	        upper-cased equivalent; values are ints in ``[0, 100000000)``.
	        Names are sampled with replacement, so the same (row, col) pair may
	        appear more than once and not every candidate name need be used.

	    Raises:
	        IndexError: If ``nvals`` > 0 while ``nrows`` or ``ncols`` is 0
	            (``choice`` cannot pick from an empty list).
	    """
	    # Imported locally so the block does not depend on the file's import list.
	    from random import randrange

	    row_names = [''.join(choice(alph) for _ in range(10)) for _ in range(nrows)]
	    col_names = [''.join(choice(alph).upper() for _ in range(10)) for _ in range(ncols)]

	    # randrange draws from the same interval as choice(range(100000000)) but
	    # never materialises the range (a 100M-element list on Python 2).
	    return [{'row': choice(row_names),
	             'col': choice(col_names),
	             'val': randrange(100000000)}
	            for _ in range(nvals)]

	# Function to make a dict of dicts from the test data
	def make_dict_of_dicts(dat):
	    """Regroup long-format records into a column -> {row: value} mapping.

	    Args:
	        dat: Iterable of dicts, each with ``'row'``, ``'col'`` and ``'val'``
	            keys. It is consumed in a single pass, so a one-shot iterator
	            works as well as a list.

	    Returns:
	        A tuple ``(d_of_d, nrows, ncols, prop_filled)`` where ``d_of_d`` is a
	        ``defaultdict(dict)`` keyed by column name then row name, ``nrows`` and
	        ``ncols`` are the numbers of distinct row and column names seen, and
	        ``prop_filled`` is the fraction of the ``nrows * ncols`` cells that
	        hold a value (``0.0`` when ``dat`` is empty).
	    """
	    d_of_d = defaultdict(dict)
	    row_names = set()
	    for d in dat:
	        # A repeated (row, col) pair overwrites the earlier value.
	        d_of_d[d['col']][d['row']] = d['val']
	        row_names.add(d['row'])

	    nrows = len(row_names)
	    ncols = len(d_of_d)
	    nvals = sum(len(v) for v in d_of_d.values())

	    # Guard the empty case: there are no cells, so nothing is filled.
	    n_cells = nrows * ncols
	    prop_filled = nvals / float(n_cells) if n_cells else 0.0
	    return d_of_d, nrows, ncols, prop_filled


	# Make a SparseDataFrame by sequentially adding columns
	def make_sparse_df_columnwise(d_of_d):
	    """Build a SparseDataFrame by assigning one column at a time.

	    Args:
	        d_of_d: Mapping of column name -> {row name: value}.

	    Returns:
	        The ``pd.SparseDataFrame`` after every column has been assigned.

	    Note:
	        ``pd.SparseDataFrame`` only exists in pandas releases before 1.0.
	    """
	    df = pd.SparseDataFrame()
	    for colname, vals in d_of_d.items():
	        df[colname] = pd.Series(vals)
	    return df


	# Make dense DataFrame, pivot it, and then make it sparse
	def make_sparse_df_from_dense(dat):
	    """Pivot long-format records into a dense frame, then convert it to sparse.

	    Args:
	        dat: List of dicts with ``'row'``, ``'col'`` and ``'val'`` keys.

	    Returns:
	        The pivoted table (rows x columns) in sparse form. On pandas < 1.0 this
	        is the result of ``DataFrame.to_sparse()``; on newer releases, where
	        that method no longer exists, it is a ``DataFrame`` whose columns use
	        ``SparseDtype("float", nan)``.
	    """
	    # pivot_table's default aggregation (mean) collapses repeated (row, col)
	    # pairs and leaves NaN in cells that have no record.
	    dense = pd.DataFrame(dat).pivot_table(index="row", columns="col", values="val")

	    # Check the class, not the instance, so a column that happens to be named
	    # "to_sparse" cannot be mistaken for the method.
	    if hasattr(pd.DataFrame, "to_sparse"):
	        return dense.to_sparse()
	    return dense.astype(pd.SparseDtype("float", float("nan")))


	# Make sparse DataFrame from data in longitudinal format and then pivot it
	def make_sparse_df_then_pivot(dat):
	    """Convert the long-format records to sparse first, then pivot.

	    Args:
	        dat: List of dicts with ``'row'``, ``'col'`` and ``'val'`` keys.

	    Returns:
	        Whatever ``pivot_table`` yields when called on the sparse long-format
	        frame with rows as the index and columns as the columns.

	    Note:
	        ``DataFrame.to_sparse()`` only exists in pandas releases before 1.0.
	    """
	    return pd.DataFrame(dat).to_sparse().pivot_table(index="row", columns="col", values="val")

	for nrows, ncols, nvals in [(100, 100, 100),
	(100, 100, 1000),
	(100, 1000, 1000),
	(100, 1000, 10000)]:
	dat = make_dat(nrows=nrows, ncols=ncols, nvals=nvals)
	test_data, nrows, ncols, prop_filled = make_dict_of_dicts(dat)
	print("Matrix: {} rows, {} cols, {}% full".format(nrows, ncols, round(prop_filled * 100, 3)))
	print("")

	print("Testing sparse DataFrame from nested dictionaries")
	% timeit pd.SparseDataFrame(test_data)
	print("")

	print("Testing columnwise addition to sparse DataFrame")
	% timeit make_sparse_df_columnwise(test_data)
	print("")

	print("Testing make dense DataFrame from longitudinal format, pivot it, then convert to sparse")
	% timeit make_sparse_df_from_dense(dat)
	print("")

	print("Testing make sparse DataFrame from longitudinal format, then pivot it")
	% timeit make_sparse_df_then_pivot(dat)
	print("")

	print("")