Last active
January 19, 2024 17:22
-
-
Save LouisFaure/9302aa140d7989a25ed2a44b1ce741e8 to your computer and use it in GitHub Desktop.
some GPU accelerated function for scanpy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scanpy as sc | |
import cupy as cp | |
from scipy.sparse import csr_matrix, find, issparse | |
from scipy.sparse.linalg import eigs | |
import numpy as np | |
import pandas as pd | |
import cudf | |
import glob | |
from cupy.sparse import cupyx as cpx | |
from cupyx.scipy.sparse import coo_matrix as coo_matrix_gpu | |
from cupyx.scipy.sparse import csr_matrix as csr_matrix_gpu | |
from cupyx.scipy.sparse.linalg import eigsh | |
import scipy.sparse as sp | |
def read_mtx_gpu(filename,nchunks=None): | |
print("Getting mtx") | |
mtxinfo=pd.read_csv(filename,nrows=1,sep=" ",comment="%",header=None).values[0] | |
if nchunks is not None: | |
print(" loading to device chunkwise") | |
chunks=np.round(np.arange(0, mtxinfo[2]+mtxinfo[2]/nchunks, mtxinfo[2]/nchunks)) | |
toadata=list() | |
for i in range(nchunks): | |
mtx_data=cudf.read_csv(filename,sep=" ",dtype=['float32' for i in range(3)],comment="%",header=None, | |
nrows=chunks[i+1]-chunks[i],skiprows=chunks[i]+2) | |
toadata.append(mtx_data.to_numpy()) | |
print(" stacking and converting to anndata") | |
toadata=np.vstack(toadata) | |
else: | |
print(" loading to device") | |
mtx_data=cudf.read_csv(filename,sep=" ",dtype=['float32' for i in range(3)],comment="%",header=None, | |
skiprows=2) | |
print(" transfering to host and converting to anndata") | |
toadata=mtx_data.to_numpy() | |
shape=tuple((mtxinfo[:2]+1).astype(int)) | |
toadata=sp.coo_matrix((toadata[:,2], (toadata[:,0], toadata[:,1])),shape=shape) | |
toadata=toadata.tocsr() | |
toadata=toadata[1:,1:] | |
return sc.AnnData(toadata) | |
def run_diffusion(data_df, n_components=10, knn=30, alpha=0,metric='euclidean'): | |
"""Run Diffusion maps using the adaptive anisotropic kernel | |
:param data_df: PCA projections of the data or adjacency matrix | |
:param n_components: Number of diffusion components | |
:param knn: Number of nearest neighbors for graph construction | |
:param alpha: Normalization parameter for the diffusion operator | |
:return: Diffusion components, corresponding eigen values and the diffusion operator | |
""" | |
# Determine the kernel | |
N = data_df.shape[0] | |
if not issparse(data_df): | |
from cuml.neighbors import NearestNeighbors | |
nn = NearestNeighbors(n_neighbors=knn,metric=metric) | |
X_contiguous = np.ascontiguousarray(data_df.values) | |
nn.fit(X_contiguous) | |
kNN = nn.kneighbors_graph(X_contiguous,mode="distance") | |
kNN.setdiag(0) | |
kNN.eliminate_zeros() | |
# Adaptive k | |
adaptive_k = int(np.floor(knn / 3)) | |
adaptive_std = np.zeros(N) | |
for i in np.arange(len(adaptive_std)): | |
adaptive_std[i] = np.sort(kNN.data[kNN.indptr[i] : kNN.indptr[i + 1]])[ | |
adaptive_k - 1 | |
] | |
# Kernel | |
x, y, dists = find(kNN) | |
# X, y specific stds | |
dists = dists / adaptive_std[x] | |
W = csr_matrix((np.exp(-dists), (x, y)), shape=[N, N]) | |
# Diffusion components | |
kernel = W + W.T | |
else: | |
kernel = data_df | |
# Markov | |
D = np.ravel(kernel.sum(axis=1)) | |
if alpha > 0: | |
# L_alpha | |
D[D != 0] = D[D != 0] ** (-alpha) | |
mat = csr_matrix((D, (range(N), range(N))), shape=[N, N]) | |
kernel = mat.dot(kernel).dot(mat) | |
D = np.ravel(kernel.sum(axis=1)) | |
D[D != 0] = 1 / D[D != 0] | |
kernel = csr_matrix_gpu(kernel) | |
D = csr_matrix_gpu((cp.array(D), (cp.arange(N), cp.arange(N))), shape=(N, N)) | |
T = D.dot(kernel) | |
# Eigen value dcomposition | |
D, V = eigsh(T, n_components, tol=1e-4, maxiter=1000) | |
D, V = D.get(), V.get() | |
inds = np.argsort(D)[::-1] | |
D = D[inds] | |
V = V[:, inds] | |
# Normalize | |
for i in range(V.shape[1]): | |
V[:, i] = V[:, i] / np.linalg.norm(V[:, i]) | |
# Create are results dictionary | |
res = {"T": T.get(), "EigenVectors": V, "EigenValues": D} | |
res["EigenVectors"] = pd.DataFrame(res["EigenVectors"]) | |
if not issparse(data_df): | |
res["EigenVectors"].index = data_df.index | |
res["EigenValues"] = pd.Series(res["EigenValues"]) | |
res["kernel"] = kernel.get() | |
return res | |
def draw_graph(adata, | |
init_pos= None, | |
random_state = 0, | |
iterations = 500): | |
from scanpy._utils import AnyRandom, _choose_graph | |
adjacency = _choose_graph(adata, None, None) | |
if init_pos is not None: | |
init_coords = adata.obsm[init_pos] | |
else: | |
np.random.seed(random_state) | |
init_coords = np.random.random((adjacency.shape[0], 2)) | |
import cugraph | |
import cudf | |
offsets = cudf.Series(adjacency.indptr) | |
indices = cudf.Series(adjacency.indices) | |
G = cugraph.Graph() | |
G.from_cudf_adjlist(offsets, indices, None) | |
forceatlas2 = cugraph.layout.force_atlas2( | |
G, | |
max_iter=iterations, | |
pos_list=cudf.DataFrame( | |
{ | |
"vertex": np.arange(init_coords.shape[0]), | |
"x": init_coords[:, 0], | |
"y": init_coords[:, 1], | |
} | |
), | |
outbound_attraction_distribution=False, | |
lin_log_mode=False, | |
edge_weight_influence=1.0, | |
jitter_tolerance=1.0, | |
barnes_hut_optimize=True, | |
barnes_hut_theta=1.2, | |
scaling_ratio=2.0, | |
strong_gravity_mode=False, | |
gravity=1.0, | |
verbose=True, | |
) | |
positions = forceatlas2.to_pandas().loc[:, ["x", "y"]].values | |
adata.uns['draw_graph'] = {} | |
adata.uns['draw_graph']['params'] = dict(layout="fa", random_state=0) | |
adata.obsm["X_draw_graph_fa"] = positions | |
def leiden(adata,resolution=1,use_weights=True): | |
from scanpy._utils import _choose_graph | |
adjacency = _choose_graph(adata,None,None) | |
import cudf | |
import cugraph | |
offsets = cudf.Series(adjacency.indptr) | |
indices = cudf.Series(adjacency.indices) | |
if use_weights: | |
sources, targets = adjacency.nonzero() | |
weights = adjacency[sources, targets] | |
if isinstance(weights, np.matrix): | |
weights = weights.A1 | |
weights = cudf.Series(weights) | |
else: | |
weights = None | |
g = cugraph.Graph() | |
if hasattr(g, 'add_adj_list'): | |
g.add_adj_list(offsets, indices, weights) | |
else: | |
g.from_cudf_adjlist(offsets, indices, weights) | |
leiden_parts, _ = cugraph.leiden(g, resolution=resolution) | |
groups = ( | |
leiden_parts.to_pandas() | |
.sort_values('vertex')[['partition']] | |
.to_numpy() | |
.ravel() | |
) | |
adata.obs["leiden"]=groups.astype(str) | |
adata.obs.leiden=adata.obs.leiden.astype("category") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment