walterst/get_rank_sorted_data.py

## get_rank_sorted_data.py
#!/usr/bin/env python

from sys import argv

from operator import itemgetter
from scipy.stats import rankdata
from numpy import log

from biom import load_table

""" From the figure 1 approach in http://jem.rupress.org/content/209/2/365
    Rank = # of times sequence appears
    Frequency of rank = how many of a given rank # are present, e.g., should be many rank 1

    implication is that those with high rank will be active pool of B-cells.

    """

# Samples left out of the filtered ("no4") combined table; treated as outliers.
exceptions = ["WT.sp4", "WT.LI4"]

# Column header shared by every output table.
TABLE_HEADER = "#Rank\tFreq of Rank\tLogRank\tLogFreq\n"


def tally_abundance(freqs, abundance):
    """Record one sequence with the given abundance in ``freqs``.

    ``freqs`` maps a rank (the number of times a sequence appears) to the
    number of sequences that have that rank, and is updated in place.
    Abundances that are not positive are skipped, e.g. an OTU whose count
    is zero after rarefaction.
    """
    if abundance > 0:
        freqs[abundance] = freqs.get(abundance, 0) + 1


def format_rank_table(freqs):
    """Return the lines of a rank table: the header plus one row per rank.

    Rows are ordered by ascending rank.  Each row holds the rank, the
    frequency of that rank, and the natural log of both values.
    """
    lines = [TABLE_HEADER]
    # .items() (rather than the Python 2-only .iteritems()) keeps this
    # runnable on both Python 2 and Python 3.
    for rank, freq in sorted(freqs.items(), key=itemgetter(0)):
        lines.append("%d\t%d\t%f\t%f\n" % (rank, freq, log(rank), log(freq)))
    return lines


def write_rank_table(path, freqs):
    """Write the rank table for ``freqs`` to ``path`` and close the file."""
    with open(path, "w") as outf:
        outf.writelines(format_rank_table(freqs))


def main():
    """Build rank/frequency tables from the biom table named in argv[1].

    Writes ``combined_data.txt`` (counts summed over all samples),
    ``combined_data_no4.txt`` (counts summed over the samples not listed
    in ``exceptions``) and one ``<sample id>.txt`` per sample, all in the
    current working directory.
    """
    if len(argv) < 2:
        raise SystemExit("usage: get_rank_sorted_data.py <otu_table.biom>")

    otu_table = load_table(argv[1])
    sample_ids = otu_table.ids()
    otu_ids = otu_table.ids(axis="observation")

    combined_counts = {}
    combined_counts_filtered = {}
    per_sample_counts = dict((sample_id, {}) for sample_id in sample_ids)

    # Single pass over the table: each value is looked up once and feeds
    # the per-sample tally as well as both combined totals.
    for curr_otu in otu_ids:
        count = 0
        count_filtered = 0
        for sample_id in sample_ids:
            curr_val = otu_table.get_value_by_ids(curr_otu, sample_id)
            tally_abundance(per_sample_counts[sample_id], curr_val)
            count += curr_val
            if sample_id not in exceptions:
                count_filtered += curr_val
        tally_abundance(combined_counts, count)
        tally_abundance(combined_counts_filtered, count_filtered)

    # Each file is opened, written and closed in turn, so the number of
    # simultaneously open handles no longer grows with the sample count.
    write_rank_table("combined_data.txt", combined_counts)
    write_rank_table("combined_data_no4.txt", combined_counts_filtered)
    for sample_id in sample_ids:
        write_rank_table(sample_id + ".txt", per_sample_counts[sample_id])


if __name__ == "__main__":
    main()
	#!/usr/bin/env python

	from sys import argv

	from operator import itemgetter
	from scipy.stats import rankdata
	from numpy import log

	from biom import load_table

	""" From the figure 1 approach in http://jem.rupress.org/content/209/2/365
	Rank = # of times sequence appears
	Frequency of rank = how many of a given rank # are present, e.g., should be many rank 1

	implication is that those with high rank will be active pool of B-cells.

	"""

	# Build rank/frequency tables from the biom table named on the command
	# line.  Rank = number of times a sequence appears; frequency = number
	# of sequences sharing that rank.
	otu_table = load_table(argv[1])

	ids = otu_table.ids()
	obs = otu_table.ids(axis="observation")

	exceptions = ["WT.sp4", "WT.LI4"] # outlier

	header = "#Rank\tFreq of Rank\tLogRank\tLogFreq\n"

	combined_counts = {}
	combined_counts_filtered = {}

	id_dicts = {}
	for sample_id in ids:
		id_dicts[sample_id] = {}

	# One pass over the table: every value is looked up once and feeds the
	# per-sample tally plus both combined totals.
	for curr_otu in obs:
		count = 0
		count_filtered = 0
		for sample_id in ids:
			curr_val = otu_table.get_value_by_ids(curr_otu, sample_id)
			if curr_val > 0:
				sample_counts = id_dicts[sample_id]
				sample_counts[curr_val] = sample_counts.get(curr_val, 0) + 1
			count += curr_val
			if sample_id not in exceptions:
				count_filtered += curr_val
		# Skip if zero count OTU, e.g. zero after rarefaction
		if count > 0:
			combined_counts[count] = combined_counts.get(count, 0) + 1
		if count_filtered > 0:
			combined_counts_filtered[count_filtered] = (
				combined_counts_filtered.get(count_filtered, 0) + 1)

	# (output path, rank -> frequency mapping) for every table to write.
	outputs = [
		("combined_data.txt", combined_counts),
		("combined_data_no4.txt", combined_counts_filtered),
	]
	for sample_id in ids:
		outputs.append((sample_id + ".txt", id_dicts[sample_id]))

	# Each file is opened, written and closed in turn so no handle is left
	# open; rows are ordered by ascending rank.
	for path, freqs in outputs:
		with open(path, "w") as outf:
			outf.write(header)
			# .items() works on Python 2 and 3; .iteritems() is Python 2 only.
			for rank, freq in sorted(freqs.items(), key=itemgetter(0)):
				outf.write("%d\t%d\t%f\t%f\n" % (rank, freq, log(rank), log(freq)))