Created
December 24, 2019 04:06
-
-
Save danielchalef/ed753581015cbb0ab6ee1059257b9244 to your computer and use it in GitHub Desktop.
Convert a Spark SparseVector column saved to Parquet into a SciPy CSR sparse matrix
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def to_sparse(row: pd.Series, col_name: str) -> csr_matrix:
    """Build a 1 x N CSR matrix from one row of flattened SparseVector columns.

    The row is read through three keys derived from ``col_name``:
    ``"<col_name>.values"`` (the stored entries), ``"<col_name>.indices"``
    (the column position of each entry) and ``"<col_name>.size"`` (the
    vector length N).

    Args:
        row: Row holding the three keys above; anything supporting
            ``row[key]`` lookup works.
        col_name: Prefix shared by the three constituent columns.

    Returns:
        A ``csr_matrix`` of shape ``(1, size)`` whose data is ``float32``.
    """
    values = np.asarray(row[f"{col_name}.values"], dtype=np.float32)
    # Force an integer dtype: an empty index list would otherwise become a
    # float64 array, which is not a valid index type.
    col_indices = np.asarray(row[f"{col_name}.indices"], dtype=np.int64)
    # Every entry lives in row 0. Sized from `values` so that a length
    # mismatch between values and indices is rejected by coo_matrix.
    row_indices = np.zeros(values.shape[0], dtype=np.int64)
    # The size may arrive as a NumPy scalar; the shape wants plain ints.
    shape = (1, int(row[f"{col_name}.size"]))
    return coo_matrix((values, (row_indices, col_indices)), shape=shape).tocsr()
def vector_col_to_sparse(df: pd.DataFrame, col_name: str = "features") -> csr_matrix:
    """Stack the flattened SparseVector columns of ``df`` into one CSR matrix.

    Each row of ``df`` is converted with ``to_sparse`` and the resulting
    single-row matrices are stacked vertically in row order.

    Args:
        df: DataFrame holding the ``"<col_name>.values"``,
            ``"<col_name>.indices"`` and ``"<col_name>.size"`` columns.
        col_name: Prefix shared by the three constituent columns.

    Returns:
        A ``csr_matrix`` with one row per row of ``df``.

    Raises:
        ValueError: If ``df`` has no rows; the matrix width is unknown then.
    """
    # Collect the per-row matrices in a plain list rather than letting pandas
    # try to pack sparse matrices into a Series/ndarray.
    row_matrices = [to_sparse(row, col_name) for _, row in df.iterrows()]
    if not row_matrices:
        raise ValueError(
            f"cannot build a sparse matrix for {col_name!r} from an empty DataFrame"
        )
    # Ask for CSR explicitly so the result matches the declared return type.
    return vstack(row_matrices, format="csr")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment