Created
April 10, 2019 03:48
-
-
Save ivirshup/2a0d9a785339b719e7d372027ae2df31 to your computer and use it in GitHub Desktop.
Doublet prediction via projecting simulated doublets into a UMAP representation of real data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scanpy as sc | |
import numpy as np | |
import pandas as pd | |
from scipy import sparse | |
from umap import UMAP | |
from itertools import repeat, chain | |
# Define functions | |
def preprocess(adata): | |
adata.var["mito"] = adata.var["gene_symbols"].str.startswith("MT-") | |
sc.pp.calculate_qc_metrics(adata, qc_vars=["mito"], inplace=True) | |
sc.pp.normalize_per_cell(adata, counts_per_cell_after=10000) | |
sc.pp.log1p(adata) | |
return adata | |
def pca_update(tgt, src, inplace=True): | |
# TODO: Make sure we know the settings from src | |
if not inplace: | |
tgt = tgt.copy() | |
if sparse.issparse(tgt.X): | |
X = tgt.X.toarray() | |
else: | |
X = tgt.X.copy() | |
X -= np.asarray(tgt.X.mean(axis=0)) | |
tgt_pca = np.dot(X, src.varm["PCs"]) | |
tgt.obsm["X_pca"] = tgt_pca | |
return tgt | |
def simulate_doublets(adata, frac=.5): | |
"""Simulate doublets from count data. | |
Params | |
------ | |
adata | |
The anndata object to sample from. Must have count data. | |
frac | |
Fraction of total cells to simulate. | |
""" | |
m, n = adata.X.shape | |
n_doublets = int(np.round(m * frac)) | |
pos_idx = np.array(list(chain.from_iterable(map(lambda x: repeat(x, 2), range(n_doublets))))) | |
combos = np.random.randint(0, m, (n_doublets * 2)) | |
pos = sparse.csr_matrix( | |
(np.ones_like(combos, dtype=adata.X.dtype), (pos_idx, combos)), | |
shape=(n_doublets, m) | |
) | |
dblX = pos * adata.X | |
# TODO: Downsample total counts | |
srcs = np.sort(combos.reshape(n_doublets, 2), axis=1) | |
obs = pd.DataFrame(srcs, columns=["src1", "src2"]) | |
var = pd.DataFrame(index=adata.var_names) | |
return sc.AnnData(dblX, obs=obs, var=var) | |
# Load, simulate, and preprocess | |
# http: // cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_10k_v3/pbmc_10k_v3_filtered_feature_bc_matrix.h5 | |
pbmc = sc.read_10x_h5("./data/10x/pbmc_10k_v3_filtered_feature_bc_matrix.h5") | |
pbmc.var["gene_symbols"] = pbmc.var.index | |
pbmc.var.set_index("gene_ids", inplace=True) | |
dblt = simulate_doublets(pbmc) | |
dblt.var["gene_symbols"] = pbmc.var["gene_symbols"] | |
pbmc.raw = pbmc | |
dblt.raw = dblt | |
pbmc = preprocess(pbmc) | |
dblt = preprocess(dblt) | |
# Process and project | |
sc.pp.pca(pbmc) | |
pca_update(dblt, pbmc) | |
umap = UMAP() | |
pbmc.obsm["X_umap"] = umap.fit_transform(pbmc.obsm["X_pca"]) | |
dblt.obsm["X_umap"] = umap.transform(dblt.obsm["X_pca"]) | |
# Plotting | |
## Prep data for plotting | |
sc.tl.embedding_density(pbmc, "umap") | |
sc.tl.embedding_density(dblt, "umap") | |
pbmcdf = pd.DataFrame(pbmc.obsm["X_umap"], columns=["x", "y"]) # Real data | |
dbltdf = pd.DataFrame(dblt.obsm["X_umap"], columns=["x", "y"]) # Simulated doublets | |
pbmcdf["density"] = pbmc.obs["umap_density"].values | |
dbltdf["density"] = dblt.obs["umap_density"].values | |
## Plotting imports | |
import datashader as ds | |
from datashader import transfer_functions as tf | |
from bokeh import palettes | |
canvas = ds.Canvas(plot_width=300, plot_height=300, | |
x_range=(pbmcdf["x"].min() - .5, pbmcdf["x"].max() + .5), | |
y_range=(pbmcdf["y"].min() - .5, pbmcdf["y"].max() + .5), | |
x_axis_type='linear', y_axis_type='linear') | |
## Actual plots | |
real = canvas.points(pbmcdf, 'x', 'y', ds.count()) | |
sim = canvas.points(dbltdf, 'x', 'y', ds.count()) | |
real_norm = real / real.sum() | |
sim_norm = sim / sim.sum() | |
tf.Images( | |
tf.shade(real, name="pbmcs"), | |
tf.shade(sim, name="doublet"), | |
tf.shade((sim_norm - real_norm) / (real_norm + sim_norm), cmap=palettes.Viridis256), | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment