Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# Log2-ratio transformation, feature selection and simple random sampling on a gene expression matrix.
import pandas as pd
import numpy as np
import sys
import random as rnd
def preprocess(expr, n_genes=1000, n_samples=100, rng=rnd):
    """Log2-ratio transform, filter, and subsample an expression matrix.

    Steps, in order:

    1. Take log2 of every value.
    2. Keep only rows whose own 5% quantile is strictly above the 5th
       percentile of all log2 values in the matrix.
    3. Subtract each row's median from that row (log2 ratio).
    4. Keep the ``n_genes`` rows with the largest standard deviation,
       in their original order.
    5. Draw ``n_samples`` columns at random without replacement.

    Args:
        expr: DataFrame of expression values (rows are treated as genes,
            columns as samples).
        n_genes: Number of most-variable rows to keep.
        n_samples: Number of columns to draw. Capped at the number of
            available columns so small matrices do not raise.
        rng: Object providing ``sample(population, k)``; defaults to the
            ``random`` module. Pass ``random.Random(seed)`` for
            reproducible sampling.

    Returns:
        A new DataFrame holding the selected rows and sampled columns.
    """
    # log2
    expr = np.log2(expr)

    # Low-expression filter: compare each row's 5% quantile against the
    # global 5th percentile computed over every cell of the matrix.
    threshold = np.percentile(expr.values, 5)
    row_q05 = expr.quantile(0.05, axis=1)
    expr = expr.loc[row_q05 > threshold]

    # ratio: centre each row on its median. The result is assigned back so
    # the ratio is what gets selected, sampled and returned.
    row_median = expr.median(axis='columns')
    expr = expr.sub(row_median, axis='index')

    # feature selection, most variant genes. method='first' breaks ties by
    # position so that exactly n_genes rows are kept; '<=' includes the
    # row ranked n_genes itself.
    spread = expr.std(axis='columns')
    order = spread.rank(ascending=False, method='first')
    expr = expr.loc[order <= n_genes]

    # random sampling of columns, without replacement
    columns = expr.columns.tolist()
    chosen = rng.sample(columns, min(n_samples, len(columns)))
    return expr.loc[:, chosen]


def main(argv=None):
    """Command-line entry point: ``script.py INPUT_TSV OUTPUT_TSV``.

    Reads a tab-separated matrix whose first column is the row index,
    runs :func:`preprocess` on it, and writes the result tab-separated.

    Args:
        argv: Argument list in ``sys.argv`` layout; defaults to ``sys.argv``.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 3:
        raise SystemExit('usage: %s INPUT_TSV OUTPUT_TSV' % argv[0])
    in_path, out_path = argv[1], argv[2]

    expr = pd.read_csv(in_path, sep='\t', index_col=0)
    # output
    preprocess(expr).to_csv(out_path, sep='\t')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.