Skip to content

Instantly share code, notes, and snippets.

@wflynny
Last active December 18, 2019 19:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wflynny/8195f7337501487cdd677d7ce3195bcf to your computer and use it in GitHub Desktop.
Save wflynny/8195f7337501487cdd677d7ce3195bcf to your computer and use it in GitHub Desktop.
HTO demuxing in python
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import scanpy as sc
def load_hto_matrix(mtx_dir):
raw_htos = sc.read_mtx(mtx_dir + "/matrix.mtx.gz").T
raw_htos.var = pd.read_csv(mtx_dir + "/features.tsv.gz", header=None, index_col=0)
raw_htos.obs = pd.read_csv(mtx_dir + "/barcodes.tsv.gz", header=None, index_col=0)
raw_htos = raw_htos[:, ~raw_htos.var_names.isin(["unmapped"])]
return raw_htos
def CLR_norm(counts):
inner = np.log1p(counts.values / np.exp(np.sum(np.log1p(counts[counts > 0]), axis=1) / counts.shape[1])[:,None])
return pd.DataFrame(data=inner, index=counts.index, columns=counts.columns)
def hto_demux(raw_htos, n_clusters=None):
# 1
# convert scanpy obj to dataframe
tmp = pd.DataFrame(raw_htos.X.toarray(), index=raw_htos.obs_names, columns=raw_htos.var_names)
htos = tmp.columns
norm_htos = CLR_norm(tmp.copy())
if n_clusters is None:
n_clusters = len(htos) + 1
# using Kmeans instead of kmediods to start
classification = pd.DataFrame(index=tmp.index)
k = KMeans(n_clusters=n_clusters).fit(tmp.values)
classification["k_label"] = k.labels_
# 2
norms = pd.DataFrame(index=np.unique(k.labels_), columns=htos)
for cluster, group in classification.groupby("k_label"):
cells = group.index
norms.loc[cluster, :] = norm_htos.loc[cells, :].mean(axis=0)
# 3
# Maybe use sm.ECDF instead of fitting a nbinom
for hto in htos:
min_cluster = norms[hto].astype(float).idxmin()
print(min_cluster)
min_cells = classification[classification.k_label == min_cluster].index
background_expression = tmp.loc[min_cells, hto]
threshold = np.percentile(background_expression.values, 95)
indicator = tmp[hto] > threshold
classification[hto] = indicator.astype(int)
print(hto, threshold)
hto_sums = classification.iloc[:, 1:].sum(axis=1)
classification["global_class"] = "negative"
classification.loc[hto_sums == 1, "global_class"] = "singlet"
classification.loc[hto_sums > 1, "global_class"] = "multiplet"
singlets = classification.loc[hto_sums == 1, classification.columns[1:len(htos)+1]]
classification["tag_class"] = "negative"
classification.loc[singlets.index, "tag_class"] = singlets.idxmax(axis=1)
return classification
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment