Skip to content

Instantly share code, notes, and snippets.

@cottrell
Last active August 29, 2015 14:10
Show Gist options
  • Save cottrell/a17fa777afd2cc4a7289 to your computer and use it in GitHub Desktop.
Save cottrell/a17fa777afd2cc4a7289 to your computer and use it in GitHub Desktop.
pandas.sparse.to_coo first hack
from pandas import *
import itertools
import numpy
import scipy.sparse
def _check_partition(left, right, whole):
    """Verify that ``left`` and ``right`` together form a partition of ``whole``.

    All three arguments are converted to sets first, so ordering and
    repeated elements are ignored.

    Raises:
        AssertionError: if ``left`` and ``right`` share an element, or if
            their union is not equal to ``whole``.  The checks are explicit
            raises rather than ``assert`` statements so that they still run
            under ``python -O``; the exception type is unchanged.
    """
    left = set(left)
    right = set(right)
    whole = set(whole)
    overlap = left & right
    if overlap:
        raise AssertionError(
            "left and right must be disjoint; both contain %r" % (overlap,))
    if left | right != whole:
        raise AssertionError(
            "left and right must cover exactly %r; got %r"
            % (whole, left | right))
def _get_index_level_subset(s, subset):
    """Return one tuple per entry of ``s.index``, restricted to the levels in ``subset``.

    Each tuple holds the values of the requested index levels, in the order
    given by ``subset``.  An empty ``subset`` yields an empty list.
    """
    index = s.index
    per_level_values = (index.get_level_values(level) for level in subset)
    # zip(*...) transposes "one sequence per level" into "one tuple per row".
    return list(zip(*per_level_values))
def _squish(s):
    """Return the items of ``s`` as tuples with duplicates removed.

    Every item is converted with ``tuple(item)``.  The first occurrence of
    each distinct tuple is kept and the original order is preserved.

    Returns:
        list: the distinct tuples in first-seen order.
    """
    seen = set()
    unique = []
    for item in s:
        key = tuple(item)
        if key not in seen:
            seen.add(key)
            unique.append(key)
    return unique
def _get_label_to_i_dict(labels, sorted=False):
    """Map each distinct label to a consecutive integer position.

    Args:
        labels: iterable of labels; duplicates are removed by ``_squish``,
            which also converts every label to a tuple.
        sorted: when true, positions follow the sorted order of the distinct
            labels; otherwise they follow first-seen order.

    Returns:
        dict: ``{label_tuple: position}`` with positions ``0 .. n-1``.
    """
    unique_labels = list(_squish(labels))
    if sorted:
        # The ``sorted`` flag shadows the builtin of the same name inside
        # this function, so calling ``sorted(...)`` here would try to call a
        # bool.  Sort through the list method instead.
        unique_labels.sort()
    return {label: position for position, label in enumerate(unique_labels)}
def _get_sparse_coords(ss, blocs, blength, levels):
    """Compute integer coordinates and their labels for the stored entries of ``ss``.

    Args:
        ss: series whose index levels supply the labels.
        blocs: start offsets of the runs of stored (non-masked) entries.
        blength: length of each run, paired element-wise with ``blocs``.
        levels: index levels that make up one label.

    Returns:
        tuple: ``(coords, ordered_labels)`` where ``coords[k]`` is the
        position of the k-th stored entry's label within ``ordered_labels``
        and ``ordered_labels`` lists the distinct labels in first-seen order.
    """
    level_labels = _get_index_level_subset(ss, levels)
    # Plain slicing plus itertools is used instead of numpy.concatenate
    # because numpy would homogenize the label values (e.g. turn every label
    # into a string); chaining Python slices appears to preserve their types.
    runs = [level_labels[start:(start + length)]
            for start, length in zip(blocs, blength)]
    sparse_labels = list(itertools.chain.from_iterable(runs))
    label_to_position = _get_label_to_i_dict(sparse_labels)
    coords = [label_to_position[tuple(label)] for label in sparse_labels]
    # Positions are exactly 0..n-1, so ordering the keys by their position
    # recovers the label list that the coordinates index into.
    ordered_labels = sorted(label_to_position, key=label_to_position.get)
    return coords, ordered_labels
def _sort_labels_and_remap(codes, labels):
    """Sort ``labels`` and renumber ``codes`` so each code still refers to the same label."""
    order = sorted(range(len(labels)), key=lambda position: labels[position])
    new_code = {old: new for new, old in enumerate(order)}
    return [new_code[code] for code in codes], [labels[k] for k in order]


def to_ijv(ss, ilevels=(0,), jlevels=(1,), sort_labels=False):
    """Return ``(v, i, j, ilabels, jlabels)`` for a (MultiIndexed) SparseSeries.

    ``(v, (i, j))`` is suitable for passing to the ``scipy.sparse.coo_matrix``
    constructor.

    Args:
        ss: the SparseSeries.  Its sparse index must expose ``blocs`` and
            ``blengths``.
        ilevels: index levels that form the row labels.
        jlevels: index levels that form the column labels.  Together with
            ``ilevels`` they must partition all levels of ``ss.index``.
        sort_labels: when true, row and column labels are returned in sorted
            order and ``i`` / ``j`` are renumbered to match.  When false
            (the default) labels are in first-seen order.

    Returns:
        tuple: ``(v, i, j, ilabels, jlabels)``.

    Raises:
        AssertionError: from ``_check_partition`` when ``ilevels`` and
            ``jlevels`` do not partition the index levels.
    """
    # Row and column levels must be a partition of the index levels.
    _check_partition(ilevels, jlevels, range(ss.index.nlevels))
    # NOTE(review): relies on private pandas internals; presumably the length
    # of ``_valid_sp_values`` matches the positions covered by
    # ``blocs``/``blengths`` -- confirm for non-NaN fill values.
    v = ss._data.values._valid_sp_values
    blocs = ss._data.values.sp_index.blocs
    blength = ss._data.values.sp_index.blengths
    i, il = _get_sparse_coords(ss, blocs, blength, ilevels)
    j, jl = _get_sparse_coords(ss, blocs, blength, jlevels)
    if sort_labels:
        # Previously this flag was accepted but had no effect.
        i, il = _sort_labels_and_remap(i, il)
        j, jl = _sort_labels_and_remap(j, jl)
    return v, i, j, il, jl
def to_coo(ss, ilevels=(0,), jlevels=(1,), sort_labels=False):
    """Convert a SparseSeries to a ``scipy.sparse.coo_matrix``.

    ``ilevels`` and ``jlevels`` select the index levels used as row and
    column labels respectively.

    Returns:
        tuple: ``(sparse_matrix, row_labels, column_labels)``.

    TODO: no checking for uniqueness (sane-ness) of the given ilevels, jlevels.
    """
    values, rows, cols, row_labels, col_labels = to_ijv(
        ss, ilevels=ilevels, jlevels=jlevels, sort_labels=sort_labels)
    shape = (len(row_labels), len(col_labels))
    sparse_matrix = scipy.sparse.coo_matrix((values, (rows, cols)), shape=shape)
    return sparse_matrix, row_labels, col_labels
################################################################################
################################################################################
# example
# Import pandas explicitly: the names ``pandas`` and ``np`` are not guaranteed
# to be provided by ``from pandas import *``.
import pandas
from numpy.random import randn

# Build a 20x4 frame and blank out most of it so the result is sparse.
df = pandas.DataFrame(randn(20, 4), columns=['a', 'b', 'c', 'd'])
df.iloc[3:-2, :] = numpy.nan
df.iloc[:3, 2:] = numpy.nan
df.iloc[-2:, :2] = numpy.nan
df.columns = pandas.MultiIndex.from_tuples(
    [(1, 2, 'a'), (1, 1, 'b'), (2, 1, 'b'), (2, 2, 'c')])
# SparseDataFrame
sdf = df.to_sparse()
# SparseSeries
ss = df.unstack().to_sparse()
# create a sparse coo matrix
A, il, jl = to_coo(ss, ilevels=[0, 1], jlevels=[2, 3])
A_df = pandas.DataFrame(
    A.todense(),
    columns=pandas.MultiIndex.from_tuples(jl),
    index=pandas.MultiIndex.from_tuples(il))
# this should be the same data
a_df = ss.to_dense().dropna().unstack(level=[2, 3]).fillna(0)
# Sort both axes so the comparison does not depend on label order.
A_df = A_df.sort_index().sort_index(axis=1)
a_df = a_df.sort_index().sort_index(axis=1)
from pandas.util.testing import assert_frame_equal
assert_frame_equal(A_df, a_df)
print(A_df)
print(a_df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment