Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Simple method to score each sample within the boundaries of a signature containing two blocks (e.g. gene expression signature with up- and down-regulated genes)
import pandas as pd
from sklearn.preprocessing import StandardScaler
# Score each sample against a two-block signature (up- and down-regulated
# genes). The result is `scores`, a Series indexed by sample name.

# Get the list of genes of interest (e.g. from diff exp, or a known signature).
# NOTE: placeholder labels -- they must be replaced with labels present in
# X.index, otherwise the `X.loc[diff, :]` lookup below cannot succeed.
diff = pd.Index(['geneX', 'geneY'])
# X is our numeric matrix, shape (n_features, n_samples).
# NOTE: placeholder frame (no values) -- replace with the real data. Sample
# (column) names are expected to start with the condition name.
X = pd.DataFrame(index=['geneA', 'geneZ'], columns=['stimul_1', 'stimul_2'] + ['unstimul_1', 'unstimul_2'])

# Standardize and center each signature gene across samples.
# StandardScaler works column-wise, hence the transpose before and after.
# Slow alternative: X.loc[diff, :].apply(scipy.stats.zscore, axis=1)
X = pd.DataFrame(
    StandardScaler().fit_transform(X.loc[diff, :].T).T,
    # Rows were subset to `diff`, so the result must be labelled with `diff`
    # (labelling with the full X.index mismatches shape or mislabels rows).
    index=diff, columns=X.columns)  # fast version

# 1. Compute score based on intensities of up- or down-regulated genes
# 1.1 get vectors for up- and down-regulated genes
cond1, cond2 = ('stimul', 'unstimul')
# Per-gene mean of the standardized values within each condition.
u1 = X.loc[:, X.columns.str.startswith(cond1)].mean(axis=1)
u2 = X.loc[:, X.columns.str.startswith(cond2)].mean(axis=1)
extremes = pd.DataFrame([u1, u2], index=[cond1, cond2]).T
# "up" genes are higher in cond1 than in cond2; "down" genes are the reverse.
# Genes with equal means belong to neither group.
up = extremes[extremes[cond1] > extremes[cond2]].index
down = extremes[extremes[cond1] < extremes[cond2]].index

# 1.2 Make score
# Take the mean intensity of each side per sample, weight it by that side's
# share of the signature (number of genes on the side / total genes), and
# combine the two sides with opposite signs.
scores = (
    -(X.loc[up, :].mean(axis=0) * (float(up.size) / X.shape[0])) +
    (X.loc[down, :].mean(axis=0) * (float(down.size) / X.shape[0]))
)
# Reverse for cond1 samples (give positive values for depletion in
# downregulated genes, negative for depletion in upregulated).
# Use a prefix match, consistent with the column selection above: a substring
# match on 'stimul' would also hit the 'unstimul_*' samples and flip them too.
is_cond1 = scores.index.str.startswith(cond1)
scores.loc[is_cond1] = -scores.loc[is_cond1]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment