-
-
Save cottrell/a17fa777afd2cc4a7289 to your computer and use it in GitHub Desktop.
pandas.sparse.to_coo first hack
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pandas import * | |
import itertools | |
import numpy | |
import scipy.sparse | |
def _check_partition(left, right, whole):
    """ Assert that ``left`` and ``right`` together form a partition of ``whole``.

    All three arguments are converted to sets first, so ordering and
    duplicates are ignored. Raises AssertionError when the two sides share
    an element or when their union differs from ``whole``. """
    left_set, right_set, whole_set = set(left), set(right), set(whole)
    # The two sides must have no element in common ...
    assert(left_set.isdisjoint(right_set))
    # ... and between them they must cover exactly the elements of ``whole``.
    assert((left_set | right_set) == whole_set)
def _get_index_level_subset(s, subset):
    """ Return one tuple per entry of ``s.index`` holding only the values of the levels listed in ``subset``.

    The tuples follow the order of the index, and within each tuple the
    values follow the order of ``subset``. """
    per_level_values = [s.index.get_level_values(level) for level in subset]
    # zip(*...) turns the per-level columns into per-row tuples.
    return list(zip(*per_level_values))
def _squish(s):
    """ Return the elements of ``s`` as tuples with duplicates removed, keeping the first occurrence of each in its original position. """
    unique = []
    already_seen = set()
    for item in s:
        key = tuple(item)
        if key in already_seen:
            continue
        already_seen.add(key)
        unique.append(key)
    return unique
def _get_label_to_i_dict(labels, sorted=False):
    """ Return a dict mapping each unique label (as a tuple) to an integer position.

    Labels are de-duplicated with ``_squish``. By default positions follow
    the order of first appearance; with ``sorted=True`` the unique labels
    are sorted first, so positions follow sorted order instead. """
    labels = _squish(labels)
    if sorted:
        # The ``sorted`` flag shadows the builtin of the same name inside this
        # function, so calling sorted(...) here would try to call a bool and
        # raise TypeError. Sort the list in place instead.
        labels.sort()
    return {label: position for position, label in enumerate(labels)}
def _get_sparse_coords(ss, blocs, blength, levels):
    """ Return integer coordinates and the matching ordered labels for the stored entries of ``ss``.

    Each (start, length) pair taken from ``blocs`` / ``blength`` selects a
    slice of the per-row label tuples built from the index levels in
    ``levels``. Returns ``(coords, ordered_labels)``: ``coords[k]`` is the
    integer assigned to the k-th selected label, and ``ordered_labels[n]``
    is the label that was assigned integer ``n``. """
    level_tuples = _get_index_level_subset(ss, levels)
    # not sure if there is a better way to get at the labels for non-masked entries
    # TODO: numpy.concatenate was avoided here because it homogenizes the
    # values (all labels become strings); plain list slices appear to
    # preserve the label types.
    sparse_labels = []
    for start, length in zip(blocs, blength):
        sparse_labels.extend(level_tuples[start:start + length])
    label_positions = _get_label_to_i_dict(sparse_labels)
    coords = [label_positions[tuple(label)] for label in sparse_labels]
    # Invert the mapping so the labels can be listed in coordinate order.
    position_to_label = dict(
        (position, label) for label, position in label_positions.items())
    ordered_labels = [position_to_label[position]
                      for position in range(len(label_positions))]
    return coords, ordered_labels
def to_ijv(ss, ilevels=(0,), jlevels=(1,), sort_labels=False):
    """ For an arbitrary (MultiIndexed) SparseSeries return (v, i, j, ilabels, jlabels), where (v, (i, j)) is suitable for
    passing to the scipy.sparse.coo_matrix constructor.

    ``ilevels`` and ``jlevels`` name the index levels used for the row and
    column labels; together they must partition all levels of the index.
    NOTE: ``sort_labels`` is accepted but is not used anywhere in this
    function. """
    # index and column levels must be a partition of the index
    _check_partition(ilevels, jlevels, range(ss.index.nlevels))
    sparse_values = ss._data.values
    v = sparse_values._valid_sp_values
    sparse_index = sparse_values.sp_index
    blocs = sparse_index.blocs
    blength = sparse_index.blengths
    # Row and column coordinates are computed independently from the same blocks.
    i, il = _get_sparse_coords(ss, blocs, blength, ilevels)
    j, jl = _get_sparse_coords(ss, blocs, blength, jlevels)
    return v, i, j, il, jl
def to_coo(ss, ilevels=(0,), jlevels=(1,), sort_labels=False):
    """ Convert a SparseSeries to a scipy.sparse.coo_matrix, using ilevels and jlevels as the row and column labels.

    Returns the sparse matrix together with the row labels and the column
    labels, in that order. The matrix shape is (number of row labels,
    number of column labels).
    TODO: no checking for uniqueness (sanity) of the given ilevels, jlevels. """
    values, rows, cols, row_labels, col_labels = to_ijv(
        ss, ilevels=ilevels, jlevels=jlevels, sort_labels=sort_labels)
    shape = (len(row_labels), len(col_labels))
    sparse_matrix = scipy.sparse.coo_matrix((values, (rows, cols)), shape=shape)
    return sparse_matrix, row_labels, col_labels
################################################################################
# Example: build a mostly-NaN frame, convert it to a SparseSeries, turn that
# into a scipy COO matrix with to_coo, and check the result against the dense
# equivalent.
#
# Only names this file imports explicitly (``numpy``) or that are part of the
# pandas public API (``DataFrame``, ``MultiIndex``) are used below; the
# original relied on ``np`` and ``pandas`` being bound as a side effect of
# ``from pandas import *``.
################################################################################
from numpy.random import randn

df = DataFrame(randn(20, 4), columns=['a', 'b', 'c', 'd'])
# Blank out most cells so that the sparse representation has something to skip.
df.iloc[3:-2,] = numpy.nan
df.iloc[:3, 2:] = numpy.nan
df.iloc[-2:, :2] = numpy.nan
df.columns = MultiIndex.from_tuples([(1, 2, 'a'), (1, 1, 'b'), (2, 1, 'b'), (2, 2, 'c')]).T

# SparseDataFrame
sdf = df.to_sparse()

# SparseSeries
ss = df.unstack().to_sparse()

# create a sparse coo matrix
A, il, jl = to_coo(ss, ilevels=[0, 1], jlevels=[2, 3])
A_df = DataFrame(A.todense(), columns=MultiIndex.from_tuples(jl), index=MultiIndex.from_tuples(il))

# this should be the same data
a_df = ss.to_dense().dropna().unstack(level=[2, 3]).fillna(0)

# Sort both axes so the comparison does not depend on label order.
A_df = A_df.sort_index().sort_index(axis=1)
a_df = a_df.sort_index().sort_index(axis=1)

from pandas.util.testing import assert_frame_equal
assert_frame_equal(A_df, a_df)
print(A_df)
print(a_df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment