Last active
December 18, 2019 19:54
-
-
Save wflynny/8195f7337501487cdd677d7ce3195bcf to your computer and use it in GitHub Desktop.
HTO demuxing in python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.cluster import KMeans | |
import numpy as np | |
import pandas as pd | |
import scanpy as sc | |
def load_hto_matrix(mtx_dir): | |
raw_htos = sc.read_mtx(mtx_dir + "/matrix.mtx.gz").T | |
raw_htos.var = pd.read_csv(mtx_dir + "/features.tsv.gz", header=None, index_col=0) | |
raw_htos.obs = pd.read_csv(mtx_dir + "/barcodes.tsv.gz", header=None, index_col=0) | |
raw_htos = raw_htos[:, ~raw_htos.var_names.isin(["unmapped"])] | |
return raw_htos | |
def CLR_norm(counts): | |
inner = np.log1p(counts.values / np.exp(np.sum(np.log1p(counts[counts > 0]), axis=1) / counts.shape[1])[:,None]) | |
return pd.DataFrame(data=inner, index=counts.index, columns=counts.columns) | |
def hto_demux(raw_htos, n_clusters=None): | |
# 1 | |
# convert scanpy obj to dataframe | |
tmp = pd.DataFrame(raw_htos.X.toarray(), index=raw_htos.obs_names, columns=raw_htos.var_names) | |
htos = tmp.columns | |
norm_htos = CLR_norm(tmp.copy()) | |
if n_clusters is None: | |
n_clusters = len(htos) + 1 | |
# using Kmeans instead of kmediods to start | |
classification = pd.DataFrame(index=tmp.index) | |
k = KMeans(n_clusters=n_clusters).fit(tmp.values) | |
classification["k_label"] = k.labels_ | |
# 2 | |
norms = pd.DataFrame(index=np.unique(k.labels_), columns=htos) | |
for cluster, group in classification.groupby("k_label"): | |
cells = group.index | |
norms.loc[cluster, :] = norm_htos.loc[cells, :].mean(axis=0) | |
# 3 | |
# Maybe use sm.ECDF instead of fitting a nbinom | |
for hto in htos: | |
min_cluster = norms[hto].astype(float).idxmin() | |
print(min_cluster) | |
min_cells = classification[classification.k_label == min_cluster].index | |
background_expression = tmp.loc[min_cells, hto] | |
threshold = np.percentile(background_expression.values, 95) | |
indicator = tmp[hto] > threshold | |
classification[hto] = indicator.astype(int) | |
print(hto, threshold) | |
hto_sums = classification.iloc[:, 1:].sum(axis=1) | |
classification["global_class"] = "negative" | |
classification.loc[hto_sums == 1, "global_class"] = "singlet" | |
classification.loc[hto_sums > 1, "global_class"] = "multiplet" | |
singlets = classification.loc[hto_sums == 1, classification.columns[1:len(htos)+1]] | |
classification["tag_class"] = "negative" | |
classification.loc[singlets.index, "tag_class"] = singlets.idxmax(axis=1) | |
return classification |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment