Created
October 28, 2016 16:09
-
-
Save ShaiberAlon/ced1e9be654af164edeb91daa360fcfa to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import numpy as np | |
import argparse | |
parser = argparse.ArgumentParser(description='Adding the 2MA column to anvio AA table') | |
parser.add_argument('-i','--input',metavar='FILE',dest='input_file',help='Input file') | |
parser.add_argument('-o','--out',metavar='FILE',dest='output_file',help='Name of file for output') | |
parser.add_argument('-r','--ratio',metavar='NUMBER',dest='ratio',type=float,help='Minimal ratio between consensus and the second most covered amino-acid. If the ratio is lower than the provided threshold, then the 2MA value would be in the form concensus_concensus') | |
args = parser.parse_args() | |
# creating empty list for output | |
OUTPUT=[] | |
input_file=args.input_file | |
with open(input_file) as f: | |
reader=csv.reader(f,delimiter='\t') | |
# Getting the list of Amino Acids from the header | |
header=next(reader) | |
AA_list=header[8:29] # The amino acid data is in columns 9-29 (20 AA + stop codon) | |
# Appending the header to the output, with the addition of the new column. the title of the new column is specified below | |
header.append('2MA_Dom') | |
header.append('2MA_Alp') | |
OUTPUT.append(header) | |
consensus_to_second_max_ratio=args.ratio | |
for row in reader: | |
# Read the amino acid coverage for each row | |
AA_coverage=np.array(map(float,list(row[8:29]))) | |
# Sorting the amino acids according to coverage in order to get the concesnsus and second most occuring amino acid | |
index_sort=np.argsort(AA_coverage)[::-1] | |
consensus=AA_list[index_sort[0]] | |
second_abundant_AA=AA_list[index_sort[1]] | |
consensus_coverage=AA_coverage[index_sort[0]] | |
second_abundant_AA_coverage=AA_coverage[index_sort[1]] | |
s='_' # The outout will be written in the format: AA1_AA2 (e.g. Arg_Lys) | |
if consensus_coverage*consensus_to_second_max_ratio < second_abundant_AA_coverage: | |
# Output is AA1_AA2, where AA1 is always the amino acid that comes first alphabetically (i.e. even if the consensus is Glu and the second AA is Asn, then the output is Asn_Glu (and not Glu_Asn) | |
most_2_AA_Alp=s.join(sorted(list((consensus,second_abundant_AA)))) | |
most_2_AA_Dom=s.join(list((consensus,second_abundant_AA))) | |
else: | |
# The consunsus is significantly more occuring than the second AA | |
most_2_AA_Alp=s.join((consensus,consensus)) | |
most_2_AA_Dom=s.join((consensus,consensus)) | |
row.append(most_2_AA_Dom) | |
row.append(most_2_AA_Alp) | |
OUTPUT.append(row) | |
output_file=args.output_file | |
with open(output_file,'wb') as o: | |
# Writing output to tab delimited file | |
writer=csv.writer(o,lineterminator='\n',delimiter='\t') | |
writer.writerows(OUTPUT) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment