danielchalef/spark_vector_col_to_scipy_sparse.py

## spark_vector_col_to_scipy_sparse.py
def to_sparse(row: pd.Series, col_name) -> csr_matrix:
    """Turn one flattened Spark SparseVector row into a single-row CSR matrix.

    Three entries are read from ``row``, all prefixed by ``col_name``:
    ``<col_name>.values``, ``<col_name>.indices`` and ``<col_name>.size``.

    Args:
        row: One DataFrame row carrying the three entries above.
        col_name: Prefix shared by the three entries.

    Returns:
        A ``(1, size)`` matrix in CSR format whose data is float32.
    """
    data = np.array(row[f"{col_name}.values"])
    data = data.astype(np.float32)

    # Every stored value belongs to the one and only output row (row 0).
    nnz = data.shape[0]
    zero_rows = np.zeros(nnz, dtype=np.int8)

    columns = np.array(row[f"{col_name}.indices"])
    width = row[f"{col_name}.size"]

    # Assemble in COO form from (value, (row, col)) triplets, then convert.
    as_coo = coo_matrix((data, (zero_rows, columns)), shape=(1, width))
    return as_coo.tocsr()


def vector_col_to_sparse(df: pd.DataFrame, col_name: str = "features") -> csr_matrix:
    """Build one sparse matrix from a flattened Spark SparseVector column.

    Each row of ``df`` is converted with ``to_sparse`` and the per-row
    matrices are stacked vertically, in DataFrame row order.

    Args:
        df: DataFrame holding the ``<col_name>.values``, ``<col_name>.indices``
            and ``<col_name>.size`` columns that ``to_sparse`` reads.
        col_name: Prefix of those columns; defaults to ``"features"``.

    Returns:
        The result of ``vstack`` over the per-row CSR matrices.
    """
    # axis=1 hands every DataFrame row to the converter as a pd.Series.
    per_row_matrices = df.apply(
        lambda row: to_sparse(row, col_name=col_name),
        axis=1,
    )
    return vstack(per_row_matrices.values)
	def to_sparse(row: pd.Series, col_name: str) -> csr_matrix:
		"""Parse one row of a flattened Spark SparseVector into a scipy CSR matrix.

		The row must provide three entries prefixed by ``col_name``:
		``<col_name>.values``, ``<col_name>.indices`` and ``<col_name>.size``.

		Args:
			row: One DataFrame row carrying the three entries above.
			col_name: Prefix shared by the three entries.

		Returns:
			A ``(1, size)`` matrix in CSR format with float32 data.
		"""
		values = np.array(row[f"{col_name}.values"]).astype(np.float32)
		# All entries sit in row 0 because the output has exactly one row.
		row_indices = np.zeros(values.shape[0], dtype=np.int8)
		col_indices = np.array(row[f"{col_name}.indices"])
		shape = (1, row[f"{col_name}.size"])
		# Build from (value, (row, col)) triplets, then convert to CSR.
		return coo_matrix((values, (row_indices, col_indices)), shape=shape).tocsr()


	def vector_col_to_sparse(df: pd.DataFrame, col_name: str = "features") -> csr_matrix:
		"""Parse a DataFrame holding a flattened Spark SparseVector into one sparse matrix.

		Every row is converted with ``to_sparse`` and the per-row matrices are
		stacked vertically, in DataFrame row order.

		Args:
			df: DataFrame holding the ``<col_name>.values``, ``<col_name>.indices``
				and ``<col_name>.size`` columns that ``to_sparse`` reads.
			col_name: Prefix of those columns; defaults to ``"features"``.

		Returns:
			The result of ``vstack`` over the per-row CSR matrices.
		"""
		# Bind col_name so the converter takes only the row, as df.apply requires.
		to_sparse_col: partial[csr_matrix] = partial(to_sparse, col_name=col_name)

		# axis=1 applies the converter once per DataFrame row.
		matt_array: np.ndarray = df.apply(to_sparse_col, axis=1).values
		return vstack(matt_array)