Skip to content

Instantly share code, notes, and snippets.

@thouis
Created June 11, 2011 20:53
Show Gist options
  • Save thouis/1020945 to your computer and use it in GitHub Desktop.
Save thouis/1020945 to your computer and use it in GitHub Desktop.
median of medians per gene, sorted by gene
import xlrd
import sys
import numpy as np
# Header line of the report; the columns match the lines built by format_report().
REPORT_HEADER = "Gene,\tMedian,\t2nd highest,\t2nd lowest,\tNumber"


def read_gene_medians(sheet, gene_col='Gene', rep_cols=('rep1', 'rep2', 'rep3')):
    """Group per-row replicate medians by gene.

    The first row of *sheet* is read as the header row and used to locate
    *gene_col* and *rep_cols*. For every following row, the median of the
    replicate cells is appended to that row's gene entry.

    Args:
        sheet: object exposing ``nrows`` and ``row(i)`` (a sequence of cells
            with a ``.value`` attribute), as used on the xlrd sheet in main().
        gene_col: header name of the gene column.
        rep_cols: header names of the replicate columns.

    Returns:
        dict mapping each gene value to the list of its per-row medians, in
        row order.

    Raises:
        ValueError: if a requested column name is missing from the header row.
    """
    headers = [c.value for c in sheet.row(0)]
    rep_cols = tuple(rep_cols)
    colidx = dict((name, headers.index(name)) for name in (gene_col,) + rep_cols)

    data = {}
    for rowidx in range(1, sheet.nrows):
        row = sheet.row(rowidx)
        gene = row[colidx[gene_col]].value
        vals = [row[colidx[name]].value for name in rep_cols]
        # Append in place; rebuilding the list for every row is quadratic.
        data.setdefault(gene, []).append(np.median(vals))
    return data


def summarize_gene(vals):
    """Return ``(median, second highest, second lowest, count)`` for *vals*.

    With fewer than two values there is no second highest/lowest entry, so
    NaN is reported for both instead of raising IndexError. The median of an
    empty sequence is likewise reported as NaN.
    """
    ordered = sorted(vals)
    count = len(ordered)
    nan = float("nan")
    median = np.median(ordered) if ordered else nan
    if count >= 2:
        second_highest, second_lowest = ordered[-2], ordered[1]
    else:
        second_highest = second_lowest = nan
    return median, second_highest, second_lowest, count


def format_report(data):
    """Build the report body: one formatted line per gene, sorted.

    The sort is applied to the formatted lines themselves (each line starts
    with the gene), which keeps the ordering of the original script.
    """
    lines = ["%s,\t%f,\t%f,\t%f,\t%d" % ((gene,) + summarize_gene(vals))
             for gene, vals in data.items()]
    lines.sort()
    return lines


def main(argv):
    """Read the workbook named by ``argv[1]`` and print the per-gene report."""
    if len(argv) < 2:
        # Exit with a readable message rather than an IndexError traceback.
        sys.exit("usage: python <script> <workbook.xls>")
    book = xlrd.open_workbook(argv[1])
    sheet = book.sheet_by_name('Normalization 1')
    # report, for each gene, median, second highest, and second lowest value
    print(REPORT_HEADER)
    print("\n".join(format_report(read_gene_medians(sheet))))


if __name__ == "__main__":
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment