# Log2-ratio transformation, feature selection, and simple random sampling of a gene expression matrix.
"""Preprocess a tab-separated gene expression matrix.

Usage: ``python <script> <input.tsv> <output.tsv>``

The input is read with its first column as the row index.  The pipeline is:

1. log2-transform every value;
2. drop rows whose 5% quantile is not above the matrix-wide 5th percentile;
3. subtract each row's median (log2 ratio relative to the row median);
4. keep the ``K_GENES`` rows with the largest standard deviation;
5. keep ``K_SAMPLES`` columns drawn at random without replacement.

The result is written as a tab-separated file.
"""
import random as rnd
import sys

import numpy as np
import pandas as pd

# Number of highest-variance rows kept by the feature-selection step.
K_GENES = 1000
# Number of columns drawn by the random-sampling step.
K_SAMPLES = 100
# Percentile (0-100) used by the low-expression filter.
LOW_PERCENT = 5.0


def log2_transform(df):
    """Return the element-wise base-2 logarithm of *df*.

    NOTE(review): zeros become ``-inf`` and negative values become NaN;
    the input is presumably strictly positive -- confirm against the data.
    """
    return np.log2(df)


def filter_low_expression(df, low_percent=LOW_PERCENT):
    """Drop rows whose low quantile is not above the global low percentile.

    The threshold is the ``low_percent``-th percentile over all values of
    *df*.  A row is kept only when its own ``low_percent`` quantile (taken
    across columns) is strictly greater than that threshold.
    """
    # nanpercentile skips NaN just like DataFrame.quantile does; plain
    # np.percentile would return NaN and silently drop every row.
    threshold = np.nanpercentile(df.to_numpy(), low_percent)
    row_quantile = df.quantile(low_percent / 100.0, axis=1)
    keep = row_quantile > threshold
    # Positional boolean mask: avoids the removed ``.ix`` indexer and does
    # not duplicate rows when index labels repeat.
    return df.loc[keep.to_numpy()]


def median_center(df):
    """Subtract each row's median (computed across columns) from that row."""
    row_median = df.median(axis='columns')
    return df.sub(row_median, axis='index')


def select_most_variable(df, n_genes=K_GENES):
    """Keep the *n_genes* rows with the largest standard deviation.

    Rows stay in their original order.  Ties are broken by position
    (``method='first'``) so that exactly ``min(n_genes, rows)`` rows with
    a defined standard deviation are returned.
    """
    spread = df.std(axis='columns')
    order = spread.rank(ascending=False, method='first')
    # ``<=`` (not ``<``): rank 1..n_genes is n_genes rows.
    keep = order <= n_genes
    return df.loc[keep.to_numpy()]


def sample_columns(df, n_samples=K_SAMPLES):
    """Return *n_samples* columns of *df* drawn at random without replacement.

    When *df* has fewer than *n_samples* columns, all of them are returned
    (in random order) instead of raising ``ValueError``.
    """
    n_columns = df.shape[1]
    positions = rnd.sample(range(n_columns), min(n_samples, n_columns))
    return df.iloc[:, positions]


def preprocess(df, n_genes=K_GENES, n_samples=K_SAMPLES):
    """Run the full pipeline on an in-memory matrix and return the result."""
    df = log2_transform(df)
    df = filter_low_expression(df)
    # The centred matrix must replace ``df``; otherwise the ratio step
    # would have no effect on the written output.
    df = median_center(df)
    df = select_most_variable(df, n_genes)
    return sample_columns(df, n_samples)


def main(argv=None):
    """Command-line entry point: read ``argv[0]``, write ``argv[1]``.

    *argv* defaults to ``sys.argv[1:]``.  Extra arguments are ignored.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        sys.exit("usage: <script> <input.tsv> <output.tsv>")
    in_path, out_path = args[0], args[1]

    df = pd.read_table(in_path, sep='\t', index_col=0)
    preprocess(df).to_csv(out_path, sep='\t')


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment