Skip to content

Instantly share code, notes, and snippets.

@danielchalef
Created December 24, 2019 04:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save danielchalef/ed753581015cbb0ab6ee1059257b9244 to your computer and use it in GitHub Desktop.
Save danielchalef/ed753581015cbb0ab6ee1059257b9244 to your computer and use it in GitHub Desktop.
Convert a Spark SparseVector column saved to parquet to a SciPy SparseMatrix
def to_sparse(row: pd.Series, col_name) -> csr_matrix:
    """Build a single-row CSR matrix from one flattened Spark SparseVector.

    The vector is expected to be spread over three entries of ``row``:
    ``"<col_name>.size"`` (vector length), ``"<col_name>.indices"``
    (positions of the stored entries) and ``"<col_name>.values"`` (the
    stored entries themselves).

    Args:
        row: Mapping-like record (typically a DataFrame row) holding the
            three entries described above.
        col_name: Prefix shared by the three entries.

    Returns:
        A ``(1, size)`` ``csr_matrix`` whose data is cast to ``float32``.
    """
    n_cols = row[f"{col_name}.size"]
    columns = np.array(row[f"{col_name}.indices"])
    data = np.asarray(row[f"{col_name}.values"]).astype(np.float32)

    # Every stored entry belongs to the one and only row, i.e. row 0.
    rows = np.zeros(data.shape[0], dtype=np.int8)

    single_row = coo_matrix((data, (rows, columns)), shape=(1, n_cols))
    return single_row.tocsr()
def vector_col_to_sparse(df: pd.DataFrame, col_name: str = "features") -> csr_matrix:
    """Convert the flattened columns of a Spark SparseVector into one CSR matrix.

    ``df`` must contain the three constituent columns of the vector column:
    ``"<col_name>.size"``, ``"<col_name>.indices"`` and
    ``"<col_name>.values"``. Each DataFrame row becomes one matrix row.

    The CSR arrays are assembled directly from the columns instead of
    creating one sparse matrix per row and stacking them, which avoids
    per-row Python/scipy overhead.

    Args:
        df: DataFrame holding the three constituent columns.
        col_name: Prefix shared by the three columns.

    Returns:
        A ``(len(df), size)`` ``csr_matrix`` with ``float32`` data. An empty
        DataFrame yields an empty ``(0, 0)`` matrix.

    Raises:
        ValueError: If rows disagree on the vector size, if a row's indices
            and values differ in length or are not 1-D, or if an index lies
            outside ``[0, size)``.
    """
    n_rows = len(df)
    if n_rows == 0:
        # Without any row there is no vector size to read; return an empty
        # matrix instead of failing while stacking zero blocks.
        return csr_matrix((0, 0), dtype=np.float32)

    sizes = df[f"{col_name}.size"].to_numpy()
    n_cols = int(sizes[0])
    if (sizes != n_cols).any():
        raise ValueError(
            f"all '{col_name}.size' entries must be equal to build one matrix"
        )

    values_per_row = [
        np.asarray(v, dtype=np.float32) for v in df[f"{col_name}.values"]
    ]
    # An explicit integer dtype keeps empty rows from turning the
    # concatenated index array into floats.
    indices_per_row = [
        np.asarray(i, dtype=np.int64) for i in df[f"{col_name}.indices"]
    ]
    for vals, idx in zip(values_per_row, indices_per_row):
        if vals.ndim != 1 or vals.shape != idx.shape:
            raise ValueError(
                f"'{col_name}.values' and '{col_name}.indices' must be 1-D "
                "and of equal length in every row"
            )

    data = np.concatenate(values_per_row)
    indices = np.concatenate(indices_per_row)
    # The CSR constructor does not bounds-check column indices, so do it here.
    if indices.size and (indices.min() < 0 or indices.max() >= n_cols):
        raise ValueError(f"'{col_name}.indices' contains an index outside [0, {n_cols})")

    # indptr[k] is the offset in `data` where row k starts.
    indptr = np.zeros(n_rows + 1, dtype=np.int64)
    np.cumsum([vals.size for vals in values_per_row], out=indptr[1:])

    matrix = csr_matrix((data, indices, indptr), shape=(n_rows, n_cols))
    # Sort indices and merge repeated ones, matching what a COO -> CSR
    # conversion of each row would produce.
    matrix.sum_duplicates()
    return matrix
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment