Skip to content

Instantly share code, notes, and snippets.

@ivirshup
Created April 10, 2019 03:48
Show Gist options
  • Save ivirshup/2a0d9a785339b719e7d372027ae2df31 to your computer and use it in GitHub Desktop.
Save ivirshup/2a0d9a785339b719e7d372027ae2df31 to your computer and use it in GitHub Desktop.
Doublet prediction via projecting simulated doublets into a UMAP representation of real data
import scanpy as sc
import numpy as np
import pandas as pd
from scipy import sparse
from umap import UMAP
from itertools import repeat, chain
# Define functions
def preprocess(adata):
adata.var["mito"] = adata.var["gene_symbols"].str.startswith("MT-")
sc.pp.calculate_qc_metrics(adata, qc_vars=["mito"], inplace=True)
sc.pp.normalize_per_cell(adata, counts_per_cell_after=10000)
sc.pp.log1p(adata)
return adata
def pca_update(tgt, src, inplace=True):
# TODO: Make sure we know the settings from src
if not inplace:
tgt = tgt.copy()
if sparse.issparse(tgt.X):
X = tgt.X.toarray()
else:
X = tgt.X.copy()
X -= np.asarray(tgt.X.mean(axis=0))
tgt_pca = np.dot(X, src.varm["PCs"])
tgt.obsm["X_pca"] = tgt_pca
return tgt
def simulate_doublets(adata, frac=.5):
"""Simulate doublets from count data.
Params
------
adata
The anndata object to sample from. Must have count data.
frac
Fraction of total cells to simulate.
"""
m, n = adata.X.shape
n_doublets = int(np.round(m * frac))
pos_idx = np.array(list(chain.from_iterable(map(lambda x: repeat(x, 2), range(n_doublets)))))
combos = np.random.randint(0, m, (n_doublets * 2))
pos = sparse.csr_matrix(
(np.ones_like(combos, dtype=adata.X.dtype), (pos_idx, combos)),
shape=(n_doublets, m)
)
dblX = pos * adata.X
# TODO: Downsample total counts
srcs = np.sort(combos.reshape(n_doublets, 2), axis=1)
obs = pd.DataFrame(srcs, columns=["src1", "src2"])
var = pd.DataFrame(index=adata.var_names)
return sc.AnnData(dblX, obs=obs, var=var)
# Load, simulate, and preprocess
# http: // cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_10k_v3/pbmc_10k_v3_filtered_feature_bc_matrix.h5
pbmc = sc.read_10x_h5("./data/10x/pbmc_10k_v3_filtered_feature_bc_matrix.h5")
pbmc.var["gene_symbols"] = pbmc.var.index
pbmc.var.set_index("gene_ids", inplace=True)
dblt = simulate_doublets(pbmc)
dblt.var["gene_symbols"] = pbmc.var["gene_symbols"]
pbmc.raw = pbmc
dblt.raw = dblt
pbmc = preprocess(pbmc)
dblt = preprocess(dblt)
# Process and project
sc.pp.pca(pbmc)
pca_update(dblt, pbmc)
umap = UMAP()
pbmc.obsm["X_umap"] = umap.fit_transform(pbmc.obsm["X_pca"])
dblt.obsm["X_umap"] = umap.transform(dblt.obsm["X_pca"])
# Plotting
## Prep data for plotting
sc.tl.embedding_density(pbmc, "umap")
sc.tl.embedding_density(dblt, "umap")
pbmcdf = pd.DataFrame(pbmc.obsm["X_umap"], columns=["x", "y"]) # Real data
dbltdf = pd.DataFrame(dblt.obsm["X_umap"], columns=["x", "y"]) # Simulated doublets
pbmcdf["density"] = pbmc.obs["umap_density"].values
dbltdf["density"] = dblt.obs["umap_density"].values
## Plotting imports
import datashader as ds
from datashader import transfer_functions as tf
from bokeh import palettes
canvas = ds.Canvas(plot_width=300, plot_height=300,
x_range=(pbmcdf["x"].min() - .5, pbmcdf["x"].max() + .5),
y_range=(pbmcdf["y"].min() - .5, pbmcdf["y"].max() + .5),
x_axis_type='linear', y_axis_type='linear')
## Actual plots
real = canvas.points(pbmcdf, 'x', 'y', ds.count())
sim = canvas.points(dbltdf, 'x', 'y', ds.count())
real_norm = real / real.sum()
sim_norm = sim / sim.sum()
tf.Images(
tf.shade(real, name="pbmcs"),
tf.shade(sim, name="doublet"),
tf.shade((sim_norm - real_norm) / (real_norm + sim_norm), cmap=palettes.Viridis256),
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment