yjzhang/sparse_mean_variance.pyx

## sparse_mean_variance.pyx
cimport cython

import numpy as np
cimport numpy as np

from libc.math cimport log2

ctypedef fused int2:
    short
    int
    long
    long long

ctypedef fused numeric:
    short
    unsigned short
    int
    unsigned int
    long
    float
    double

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
def sparse_means_var_csc(np.ndarray[numeric, ndim=1] data,
        np.ndarray[int2, ndim=1] indices,
        np.ndarray[int2, ndim=1] indptr,
        Py_ssize_t ncols,
        Py_ssize_t nrows):
    """
    Returns a pair of matrices: mean, variance of a sparse csc matrix.
    Uses the identity Var = E[X^2] = E[X]^2.

    This is substantially faster than calling data.mean(1) where data is a csc matrix.

    Args:
        data (array): the data field of a csc matrix
        indices (array): the indices field of a csc matrix
        indptr (array): the indptr field of a csc matrix
        ncols (int): the number of columns in the matrix
        nrows (int): the number of rows in the matrix

    Returns:
        a pair of 1d arrays, mean and variance, of shape (genes,) -
        the mean and variance are taken over each row of the input.
    """
    cdef int2 c, start_ind, end_ind, i2, g
    cdef double s
    cdef double[:] sq_means = np.zeros(nrows)
    cdef double[:] var = np.zeros(nrows)
    cdef double[:] means = np.zeros(nrows)
    for c in range(ncols):
        start_ind = indptr[c]
        end_ind = indptr[c+1]
        for i2 in range(start_ind, end_ind):
            g = indices[i2]
            sq_means[g] += data[i2]**2
            means[g] += data[i2]
    for g in range(nrows):
        means[g] = means[g]/ncols
        var[g] = sq_means[g]/ncols - means[g]**2
    return np.asarray(means), np.asarray(var)
	cimport cython

	import numpy as np
	cimport numpy as np

	from libc.math cimport log2

	ctypedef fused int2:
	short
	int
	long
	long long

	ctypedef fused numeric:
	short
	unsigned short
	int
	unsigned int
	long
	float
	double

	@cython.boundscheck(False)
	@cython.wraparound(False)
	@cython.nonecheck(False)
	def sparse_means_var_csc(np.ndarray[numeric, ndim=1] data,
	np.ndarray[int2, ndim=1] indices,
	np.ndarray[int2, ndim=1] indptr,
	Py_ssize_t ncols,
	Py_ssize_t nrows):
	"""
	Returns a pair of matrices: mean, variance of a sparse csc matrix.
	Uses the identity Var = E[X^2] = E[X]^2.

	This is substantially faster than calling data.mean(1) where data is a csc matrix.

	Args:
	data (array): the data field of a csc matrix
	indices (array): the indices field of a csc matrix
	indptr (array): the indptr field of a csc matrix
	ncols (int): the number of columns in the matrix
	nrows (int): the number of rows in the matrix

	Returns:
	a pair of 1d arrays, mean and variance, of shape (genes,) -
	the mean and variance are taken over each row of the input.
	"""
	cdef int2 c, start_ind, end_ind, i2, g
	cdef double s
	cdef double[:] sq_means = np.zeros(nrows)
	cdef double[:] var = np.zeros(nrows)
	cdef double[:] means = np.zeros(nrows)
	for c in range(ncols):
	start_ind = indptr[c]
	end_ind = indptr[c+1]
	for i2 in range(start_ind, end_ind):
	g = indices[i2]
	sq_means[g] += data[i2]**2
	means[g] += data[i2]
	for g in range(nrows):
	means[g] = means[g]/ncols
	var[g] = sq_means[g]/ncols - means[g]**2
	return np.asarray(means), np.asarray(var)