"""Some GPU-accelerated functions for scanpy."""
import scanpy as sc
import cupy as cp
import numpy as np
import pandas as pd
import cudf
import scipy.sparse as sp
from scipy.sparse import csr_matrix, find, issparse
from cupyx.scipy.sparse import csr_matrix as csr_matrix_gpu
from cupyx.scipy.sparse.linalg import eigsh
def read_mtx_gpu(filename, nchunks=None):
    print("Getting mtx")
    # First non-comment line of a MatrixMarket file holds (n_rows, n_cols, n_entries)
    mtxinfo = pd.read_csv(filename, nrows=1, sep=" ", comment="%", header=None).values[0]
    if nchunks is not None:
        print(" loading to device chunkwise")
        # Chunk boundaries over the n_entries data lines
        chunks = np.round(
            np.arange(0, mtxinfo[2] + mtxinfo[2] / nchunks, mtxinfo[2] / nchunks)
        ).astype(int)
        toadata = list()
        for i in range(nchunks):
            mtx_data = cudf.read_csv(
                filename,
                sep=" ",
                dtype=["float32"] * 3,
                comment="%",
                header=None,
                nrows=chunks[i + 1] - chunks[i],
                skiprows=chunks[i] + 2,
            )
            toadata.append(mtx_data.to_numpy())
        print(" stacking and converting to anndata")
        toadata = np.vstack(toadata)
    else:
        print(" loading to device")
        mtx_data = cudf.read_csv(
            filename,
            sep=" ",
            dtype=["float32"] * 3,
            comment="%",
            header=None,
            skiprows=2,
        )
        print(" transferring to host and converting to anndata")
        toadata = mtx_data.to_numpy()
    # MatrixMarket indices are 1-based: build a padded matrix, then drop row/column 0
    shape = tuple((mtxinfo[:2] + 1).astype(int))
    toadata = sp.coo_matrix((toadata[:, 2], (toadata[:, 0], toadata[:, 1])), shape=shape)
    toadata = toadata.tocsr()
    toadata = toadata[1:, 1:]
    return sc.AnnData(toadata)
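# Minimal usage sketch ("matrix.mtx" is a hypothetical path; chunked loading
# helps when the file does not fit in device memory in a single read):
#
#   adata = read_mtx_gpu("matrix.mtx", nchunks=10)
#   adata.write("matrix.h5ad")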
def run_diffusion(data_df, n_components=10, knn=30, alpha=0, metric="euclidean"):
    """Run Diffusion maps using the adaptive anisotropic kernel

    :param data_df: PCA projections of the data or adjacency matrix
    :param n_components: Number of diffusion components
    :param knn: Number of nearest neighbors for graph construction
    :param alpha: Normalization parameter for the diffusion operator
    :param metric: Distance metric for nearest-neighbor search
    :return: Diffusion components, corresponding eigenvalues and the diffusion operator
    """
    # Determine the kernel
    N = data_df.shape[0]
    if not issparse(data_df):
        # Build the kNN graph on GPU with cuML
        from cuml.neighbors import NearestNeighbors

        nn = NearestNeighbors(n_neighbors=knn, metric=metric)
        X_contiguous = np.ascontiguousarray(data_df.values)
        nn.fit(X_contiguous)
        kNN = nn.kneighbors_graph(X_contiguous, mode="distance")
        kNN.setdiag(0)
        kNN.eliminate_zeros()
        # Adaptive k: per-cell bandwidth from the distance to the (knn/3)-th neighbor
        adaptive_k = int(np.floor(knn / 3))
        adaptive_std = np.zeros(N)
        for i in np.arange(len(adaptive_std)):
            adaptive_std[i] = np.sort(kNN.data[kNN.indptr[i] : kNN.indptr[i + 1]])[
                adaptive_k - 1
            ]
        # Kernel
        x, y, dists = find(kNN)
        # X, y specific stds
        dists = dists / adaptive_std[x]
        W = csr_matrix((np.exp(-dists), (x, y)), shape=[N, N])
        # Diffusion components: symmetrize the affinities
        kernel = W + W.T
    else:
        kernel = data_df
    # Markov normalization
    D = np.ravel(kernel.sum(axis=1))
    if alpha > 0:
        # L_alpha
        D[D != 0] = D[D != 0] ** (-alpha)
        mat = csr_matrix((D, (range(N), range(N))), shape=[N, N])
        kernel = mat.dot(kernel).dot(mat)
        D = np.ravel(kernel.sum(axis=1))
    D[D != 0] = 1 / D[D != 0]
    # Move the kernel to GPU and form the transition matrix T = D^-1 K
    kernel = csr_matrix_gpu(kernel)
    D = csr_matrix_gpu((cp.array(D), (cp.arange(N), cp.arange(N))), shape=(N, N))
    T = D.dot(kernel)
    # Eigenvalue decomposition
    D, V = eigsh(T, n_components, tol=1e-4, maxiter=1000)
    D, V = D.get(), V.get()
    inds = np.argsort(D)[::-1]
    D = D[inds]
    V = V[:, inds]
    # Normalize
    for i in range(V.shape[1]):
        V[:, i] = V[:, i] / np.linalg.norm(V[:, i])
    # Create the results dictionary
    res = {"T": T.get(), "EigenVectors": V, "EigenValues": D}
    res["EigenVectors"] = pd.DataFrame(res["EigenVectors"])
    if not issparse(data_df):
        res["EigenVectors"].index = data_df.index
    res["EigenValues"] = pd.Series(res["EigenValues"])
    res["kernel"] = kernel.get()
    return res
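# Usage sketch (assumes `adata` carries PCA coordinates in obsm["X_pca"]; the
# obsm key for the components is an arbitrary choice here):
#
#   pca_df = pd.DataFrame(adata.obsm["X_pca"], index=adata.obs_names)
#   dm_res = run_diffusion(pca_df, n_components=10, knn=30)
#   adata.obsm["X_diffusion"] = dm_res["EigenVectors"].values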
def draw_graph(adata, init_pos=None, random_state=0, iterations=500):
    from scanpy._utils import _choose_graph

    adjacency = _choose_graph(adata, None, None)
    if init_pos is not None:
        init_coords = adata.obsm[init_pos]
    else:
        np.random.seed(random_state)
        init_coords = np.random.random((adjacency.shape[0], 2))
    import cugraph
    import cudf

    # Build a cuGraph graph from the CSR adjacency of the neighbor graph
    offsets = cudf.Series(adjacency.indptr)
    indices = cudf.Series(adjacency.indices)
    G = cugraph.Graph()
    G.from_cudf_adjlist(offsets, indices, None)
    # GPU ForceAtlas2 layout, seeded with the initial coordinates
    forceatlas2 = cugraph.layout.force_atlas2(
        G,
        max_iter=iterations,
        pos_list=cudf.DataFrame(
            {
                "vertex": np.arange(init_coords.shape[0]),
                "x": init_coords[:, 0],
                "y": init_coords[:, 1],
            }
        ),
        outbound_attraction_distribution=False,
        lin_log_mode=False,
        edge_weight_influence=1.0,
        jitter_tolerance=1.0,
        barnes_hut_optimize=True,
        barnes_hut_theta=1.2,
        scaling_ratio=2.0,
        strong_gravity_mode=False,
        gravity=1.0,
        verbose=True,
    )
    positions = forceatlas2.to_pandas().loc[:, ["x", "y"]].values
    adata.uns["draw_graph"] = {}
    adata.uns["draw_graph"]["params"] = dict(layout="fa", random_state=random_state)
    adata.obsm["X_draw_graph_fa"] = positions
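# Usage sketch (assumes a neighbor graph already computed with sc.pp.neighbors,
# which _choose_graph picks up; coordinates land in obsm["X_draw_graph_fa"]):
#
#   sc.pp.neighbors(adata, n_neighbors=30)
#   draw_graph(adata, iterations=500)
#   sc.pl.draw_graph(adata)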
def leiden(adata, resolution=1, use_weights=True):
    from scanpy._utils import _choose_graph

    adjacency = _choose_graph(adata, None, None)
    import cudf
    import cugraph

    offsets = cudf.Series(adjacency.indptr)
    indices = cudf.Series(adjacency.indices)
    if use_weights:
        sources, targets = adjacency.nonzero()
        weights = adjacency[sources, targets]
        if isinstance(weights, np.matrix):
            weights = weights.A1
        weights = cudf.Series(weights)
    else:
        weights = None
    g = cugraph.Graph()
    # Older cuGraph versions expose add_adj_list; newer ones use from_cudf_adjlist
    if hasattr(g, "add_adj_list"):
        g.add_adj_list(offsets, indices, weights)
    else:
        g.from_cudf_adjlist(offsets, indices, weights)
    # GPU Leiden clustering; partitions come back keyed by vertex id
    leiden_parts, _ = cugraph.leiden(g, resolution=resolution)
    groups = (
        leiden_parts.to_pandas()
        .sort_values("vertex")[["partition"]]
        .to_numpy()
        .ravel()
    )
    adata.obs["leiden"] = groups.astype(str)
    adata.obs["leiden"] = adata.obs["leiden"].astype("category")
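# Usage sketch (assumes sc.pp.neighbors has been run; clusters are stored in
# adata.obs["leiden"], mirroring the output column of sc.tl.leiden):
#
#   leiden(adata, resolution=1.0)
#   adata.obs["leiden"].value_counts()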