cottrell/gist:a17fa777afd2cc4a7289 Secret

## gistfile1.txt
from pandas import *
import itertools
import numpy
import scipy.sparse

def _check_partition(left, right, whole):
    """Assert that ``left`` and ``right`` together partition ``whole``.

    All three arguments are iterables of hashable items.  The check passes
    when ``left`` and ``right`` have no item in common and their union equals
    the set of items in ``whole``; otherwise ``AssertionError`` is raised.
    The checks are ``assert`` statements, so they are skipped under
    ``python -O``.
    """
    left_items, right_items, all_items = set(left), set(right), set(whole)
    # No item may sit on both sides ...
    assert left_items.isdisjoint(right_items)
    # ... and both sides together must cover exactly the items of ``whole``.
    assert (left_items | right_items) == all_items

def _get_index_level_subset(s, subset):
    """Return one tuple per entry of ``s.index``, restricted to some levels.

    ``subset`` lists the index levels to keep, in the order they should
    appear inside each tuple.  The values of every requested level are read
    with ``s.index.get_level_values`` and zipped together entry by entry.
    """
    per_level_values = (s.index.get_level_values(level) for level in subset)
    return list(zip(*per_level_values))

def _squish(s):
    """Return the distinct items of ``s`` as tuples, in first-seen order.

    Every item is converted with ``tuple`` before it is compared, so items
    must be iterables whose elements are hashable.  A new list is returned.
    """
    unique = []
    already_seen = set()
    for item in s:
        key = tuple(item)
        if key in already_seen:
            continue
        already_seen.add(key)
        unique.append(key)
    return unique

def _get_label_to_i_dict(labels, sorted=False):
    """Map each distinct label to a consecutive integer position.

    Parameters
    ----------
    labels : iterable of iterables
        Labels, possibly repeated.  Each one is normalised to a tuple and
        duplicates are dropped by ``_squish``.
    sorted : bool, default False
        If False, positions follow the order in which labels first appear.
        If True, positions follow the ascending sort order of the distinct
        label tuples.

    Returns
    -------
    dict
        ``{label_tuple: position}`` with positions ``0 .. n - 1``.
    """
    labels = _squish(labels)
    if sorted:
        # The ``sorted`` flag shadows the builtin of the same name inside this
        # function, so calling ``sorted(...)`` here would try to call a bool
        # and raise TypeError.  ``_squish`` returns a fresh list, so sorting
        # it in place is safe and gives the intended result.
        labels.sort()
    return {label: position for position, label in enumerate(labels)}

def _get_sparse_coords(ss, blocs, blength, levels):
    """Compute integer coordinates and labels for the selected entries of ``ss``.

    ``blocs`` and ``blength`` are parallel sequences of block start positions
    and block lengths; only the index entries inside those blocks are used.
    ``levels`` selects which index levels make up a label.

    Returns ``(coords, ordered_labels)``: ``coords[k]`` is the integer
    assigned to the label of the k-th selected entry, and
    ``ordered_labels[n]`` is the label tuple that was assigned integer ``n``.
    """
    all_labels = _get_index_level_subset(ss, levels)
    # Gather the labels block by block.  Plain Python lists are used instead
    # of numpy.concatenate because numpy would homogenise the label values
    # (e.g. turn every label into a string); list slicing keeps their types.
    # TODO: there may be a better way to get the labels of non-masked entries.
    sparse_labels = []
    for start, length in zip(blocs, blength):
        sparse_labels.extend(all_labels[start:(start + length)])
    label_to_pos = _get_label_to_i_dict(sparse_labels)
    coords = [label_to_pos[tuple(label)] for label in sparse_labels]
    # Invert the mapping so that slot n holds the label numbered n.
    ordered_labels = [None] * len(label_to_pos)
    for label, pos in label_to_pos.items():
        ordered_labels[pos] = label
    return coords, ordered_labels

def to_ijv(ss, ilevels=(0,), jlevels=(1,), sort_labels=False):
    """Return ``(v, i, j, ilabels, jlabels)`` for a (MultiIndexed) SparseSeries.

    ``(v, (i, j))`` is suitable for passing to the ``scipy.sparse.coo_matrix``
    constructor.

    Parameters
    ----------
    ss : SparseSeries
        The sparse values and block layout are read through private
        attributes (``ss._data.values`` and its ``sp_index``).
    ilevels, jlevels : sequences of int
        Index levels that form the row and column labels.  Together they
        must partition ``range(ss.index.nlevels)``.
    sort_labels : bool, default False
        If False, row/column numbers follow the order in which labels first
        appear.  If True, labels are sorted ascending and the coordinates are
        renumbered to match (the labels must then be mutually comparable).

    Returns
    -------
    tuple
        ``(v, i, j, ilabels, jlabels)`` where ``ilabels[i[k]]`` and
        ``jlabels[j[k]]`` are the row and column label of ``v[k]``.
    """
    # index and column levels must be a partition of the index
    _check_partition(ilevels, jlevels, range(ss.index.nlevels))
    sp_values = ss._data.values
    v = sp_values._valid_sp_values
    blocs = sp_values.sp_index.blocs
    blength = sp_values.sp_index.blengths

    def _coords(levels):
        # Coordinates in first-seen order, optionally renumbered so that
        # they follow the sorted label order instead.
        codes, labels = _get_sparse_coords(ss, blocs, blength, levels)
        if sort_labels:
            ranked = sorted(labels)
            rank = {label: n for n, label in enumerate(ranked)}
            codes = [rank[labels[code]] for code in codes]
            labels = ranked
        return codes, labels

    i, il = _coords(ilevels)
    j, jl = _coords(jlevels)
    return v, i, j, il, jl

def to_coo(ss, ilevels=(0,), jlevels=(1,), sort_labels=False):
    """Build a ``scipy.sparse.coo_matrix`` from a SparseSeries.

    ``ilevels`` and ``jlevels`` name the index levels used as row and column
    labels; they and ``sort_labels`` are forwarded unchanged to ``to_ijv``.

    Returns ``(sparse_matrix, row_labels, column_labels)``; the matrix has
    one row per row label and one column per column label.

    TODO: the uniqueness (sanity) of the given ilevels, jlevels is not
    checked.
    """
    values, rows, cols, row_labels, col_labels = to_ijv(
        ss, ilevels=ilevels, jlevels=jlevels, sort_labels=sort_labels)
    shape = (len(row_labels), len(col_labels))
    sparse_matrix = scipy.sparse.coo_matrix((values, (rows, cols)), shape=shape)
    return sparse_matrix, row_labels, col_labels

################################################################################
################################################################################
# example
# Demo / self-check: build a mostly-NaN frame, convert it to a sparse series,
# turn that into a COO matrix with to_coo, and compare against the dense data.
# NOTE(review): `np` and `pandas` are never imported by name in the visible
# source; presumably `from pandas import *` supplied them in the pandas
# version this was written for -- TODO confirm.
from numpy.random import randn
df = DataFrame(randn(20, 4), columns=['a', 'b', 'c', 'd'])
# Blank out most cells: all rows except the first three and last two ...
df.iloc[3:-2,] = np.nan
# ... the last two columns of the first three rows ...
df.iloc[:3,2:] = np.nan
# ... and the first two columns of the last two rows.
df.iloc[-2:,:2] = np.nan
# Replace the flat column names with three-level tuples.
df.columns = pandas.MultiIndex.from_tuples([(1, 2, 'a'), (1, 1, 'b'), (2, 1, 'b'), (2, 2, 'c')]).T

# SparseDataFrame (not used again in the visible code)
sdf = df.to_sparse()

# SparseSeries built from the unstacked frame
ss = df.unstack().to_sparse()

# create a sparse coo matrix: levels 0-1 label the rows, levels 2-3 the columns
A, il, jl = to_coo(ss, ilevels=[0, 1], jlevels=[2, 3])

# Densify the matrix and re-attach the returned labels as MultiIndexes.
A_df = pandas.DataFrame(A.todense(), columns=pandas.MultiIndex.from_tuples(jl), index=pandas.MultiIndex.from_tuples(il))

# this should be the same data, obtained directly through pandas
a_df = ss.to_dense().dropna().unstack(level=[2,3]).fillna(0)

# Sort both axes of both frames so the comparison ignores label order.
A_df = A_df.sort_index().sort_index(axis=1)
a_df = a_df.sort_index().sort_index(axis=1)
from pandas.util.testing import assert_frame_equal

assert_frame_equal(A_df, a_df)

print(A_df)
print(a_df)
	from pandas import *
	import itertools
	import numpy
	import scipy.sparse

	def _check_partition(left, right, whole):
		"""Assert that ``left`` and ``right`` together partition ``whole``.

		All three arguments are iterables of hashable items.  The check passes
		when ``left`` and ``right`` have no item in common and their union
		equals the set of items in ``whole``; otherwise ``AssertionError`` is
		raised.  The checks are ``assert`` statements, so they are skipped
		under ``python -O``.
		"""
		left_items, right_items, all_items = set(left), set(right), set(whole)
		# No item may sit on both sides ...
		assert left_items.isdisjoint(right_items)
		# ... and both sides together must cover exactly the items of ``whole``.
		assert (left_items | right_items) == all_items

	def _get_index_level_subset(s, subset):
		"""Return one tuple per entry of ``s.index``, restricted to some levels.

		``subset`` lists the index levels to keep, in the order they should
		appear inside each tuple.  The values of every requested level are
		read with ``s.index.get_level_values`` and zipped together entry by
		entry.
		"""
		per_level_values = (s.index.get_level_values(level) for level in subset)
		return list(zip(*per_level_values))

	def _squish(s):
		"""Return the distinct items of ``s`` as tuples, in first-seen order.

		Every item is converted with ``tuple`` before it is compared, so items
		must be iterables whose elements are hashable.  A new list is returned.
		"""
		unique = []
		already_seen = set()
		for item in s:
			key = tuple(item)
			if key in already_seen:
				continue
			already_seen.add(key)
			unique.append(key)
		return unique

	def _get_label_to_i_dict(labels, sorted=False):
		"""Map each distinct label to a consecutive integer position.

		Parameters
		----------
		labels : iterable of iterables
			Labels, possibly repeated.  Each one is normalised to a tuple and
			duplicates are dropped by ``_squish``.
		sorted : bool, default False
			If False, positions follow the order in which labels first appear.
			If True, positions follow the ascending sort order of the distinct
			label tuples.

		Returns
		-------
		dict
			``{label_tuple: position}`` with positions ``0 .. n - 1``.
		"""
		labels = _squish(labels)
		if sorted:
			# The ``sorted`` flag shadows the builtin of the same name inside
			# this function, so calling ``sorted(...)`` here would try to call
			# a bool and raise TypeError.  ``_squish`` returns a fresh list,
			# so sorting it in place is safe and gives the intended result.
			labels.sort()
		return {label: position for position, label in enumerate(labels)}

	def _get_sparse_coords(ss, blocs, blength, levels):
		"""Compute integer coordinates and labels for the selected entries of ``ss``.

		``blocs`` and ``blength`` are parallel sequences of block start
		positions and block lengths; only the index entries inside those
		blocks are used.  ``levels`` selects which index levels make up a
		label.

		Returns ``(coords, ordered_labels)``: ``coords[k]`` is the integer
		assigned to the label of the k-th selected entry, and
		``ordered_labels[n]`` is the label tuple that was assigned integer
		``n``.
		"""
		all_labels = _get_index_level_subset(ss, levels)
		# Gather the labels block by block.  Plain Python lists are used
		# instead of numpy.concatenate because numpy would homogenise the
		# label values (e.g. turn every label into a string); list slicing
		# keeps their types.
		# TODO: there may be a better way to get the labels of non-masked entries.
		sparse_labels = []
		for start, length in zip(blocs, blength):
			sparse_labels.extend(all_labels[start:(start + length)])
		label_to_pos = _get_label_to_i_dict(sparse_labels)
		coords = [label_to_pos[tuple(label)] for label in sparse_labels]
		# Invert the mapping so that slot n holds the label numbered n.
		ordered_labels = [None] * len(label_to_pos)
		for label, pos in label_to_pos.items():
			ordered_labels[pos] = label
		return coords, ordered_labels

	def to_ijv(ss, ilevels=(0,), jlevels=(1,), sort_labels=False):
		"""Return ``(v, i, j, ilabels, jlabels)`` for a (MultiIndexed) SparseSeries.

		``(v, (i, j))`` is suitable for passing to the
		``scipy.sparse.coo_matrix`` constructor.

		Parameters
		----------
		ss : SparseSeries
			The sparse values and block layout are read through private
			attributes (``ss._data.values`` and its ``sp_index``).
		ilevels, jlevels : sequences of int
			Index levels that form the row and column labels.  Together they
			must partition ``range(ss.index.nlevels)``.
		sort_labels : bool, default False
			If False, row/column numbers follow the order in which labels
			first appear.  If True, labels are sorted ascending and the
			coordinates are renumbered to match (the labels must then be
			mutually comparable).

		Returns
		-------
		tuple
			``(v, i, j, ilabels, jlabels)`` where ``ilabels[i[k]]`` and
			``jlabels[j[k]]`` are the row and column label of ``v[k]``.
		"""
		# index and column levels must be a partition of the index
		_check_partition(ilevels, jlevels, range(ss.index.nlevels))
		sp_values = ss._data.values
		v = sp_values._valid_sp_values
		blocs = sp_values.sp_index.blocs
		blength = sp_values.sp_index.blengths

		def _coords(levels):
			# Coordinates in first-seen order, optionally renumbered so that
			# they follow the sorted label order instead.
			codes, labels = _get_sparse_coords(ss, blocs, blength, levels)
			if sort_labels:
				ranked = sorted(labels)
				rank = {label: n for n, label in enumerate(ranked)}
				codes = [rank[labels[code]] for code in codes]
				labels = ranked
			return codes, labels

		i, il = _coords(ilevels)
		j, jl = _coords(jlevels)
		return v, i, j, il, jl

	def to_coo(ss, ilevels=(0,), jlevels=(1,), sort_labels=False):
		"""Build a ``scipy.sparse.coo_matrix`` from a SparseSeries.

		``ilevels`` and ``jlevels`` name the index levels used as row and
		column labels; they and ``sort_labels`` are forwarded unchanged to
		``to_ijv``.

		Returns ``(sparse_matrix, row_labels, column_labels)``; the matrix has
		one row per row label and one column per column label.

		TODO: the uniqueness (sanity) of the given ilevels, jlevels is not
		checked.
		"""
		values, rows, cols, row_labels, col_labels = to_ijv(
			ss, ilevels=ilevels, jlevels=jlevels, sort_labels=sort_labels)
		shape = (len(row_labels), len(col_labels))
		sparse_matrix = scipy.sparse.coo_matrix((values, (rows, cols)), shape=shape)
		return sparse_matrix, row_labels, col_labels

	################################################################################
	################################################################################
	# example
	# Demo / self-check: build a mostly-NaN frame, convert it to a sparse series,
	# turn that into a COO matrix with to_coo, and compare against the dense data.
	# NOTE(review): `np` and `pandas` are never imported by name in the visible
	# source; presumably `from pandas import *` supplied them in the pandas
	# version this was written for -- TODO confirm.
	from numpy.random import randn
	df = DataFrame(randn(20, 4), columns=['a', 'b', 'c', 'd'])
	# Blank out most cells: all rows except the first three and last two ...
	df.iloc[3:-2,] = np.nan
	# ... the last two columns of the first three rows ...
	df.iloc[:3,2:] = np.nan
	# ... and the first two columns of the last two rows.
	df.iloc[-2:,:2] = np.nan
	# Replace the flat column names with three-level tuples.
	df.columns = pandas.MultiIndex.from_tuples([(1, 2, 'a'), (1, 1, 'b'), (2, 1, 'b'), (2, 2, 'c')]).T

	# SparseDataFrame (not used again in the visible code)
	sdf = df.to_sparse()

	# SparseSeries built from the unstacked frame
	ss = df.unstack().to_sparse()

	# create a sparse coo matrix: levels 0-1 label the rows, levels 2-3 the columns
	A, il, jl = to_coo(ss, ilevels=[0, 1], jlevels=[2, 3])

	# Densify the matrix and re-attach the returned labels as MultiIndexes.
	A_df = pandas.DataFrame(A.todense(), columns=pandas.MultiIndex.from_tuples(jl), index=pandas.MultiIndex.from_tuples(il))

	# this should be the same data, obtained directly through pandas
	a_df = ss.to_dense().dropna().unstack(level=[2,3]).fillna(0)

	# Sort both axes of both frames so the comparison ignores label order.
	A_df = A_df.sort_index().sort_index(axis=1)
	a_df = a_df.sort_index().sort_index(axis=1)
	from pandas.util.testing import assert_frame_equal

	assert_frame_equal(A_df, a_df)

	print(A_df)
	print(a_df)