ShaiberAlon/add_2MA_to_AA_table.py

## add_2MA_to_AA_table.py
import csv
import numpy as np
import argparse

parser = argparse.ArgumentParser(description='Adding the 2MA column to anvio AA table')
parser.add_argument('-i','--input',metavar='FILE',dest='input_file',help='Input file')
parser.add_argument('-o','--out',metavar='FILE',dest='output_file',help='Name of file for output')
parser.add_argument('-r','--ratio',metavar='NUMBER',dest='ratio',type=float,help='Minimal ratio between consensus and the second most covered amino-acid. If the ratio is lower than the provided threshold, then the 2MA value would be in the form concensus_concensus')
args = parser.parse_args()

# creating empty list for output
OUTPUT=[]
input_file=args.input_file

with open(input_file) as f:
    reader=csv.reader(f,delimiter='\t')
    # Getting the list of Amino Acids from the header
    header=next(reader)
    AA_list=header[8:29] # The amino acid data is in columns 9-29 (20 AA + stop codon)
    # Appending the header to the output, with the addition of the new column. the title of the new column is specified below
    header.append('2MA_Dom')
    header.append('2MA_Alp')
    OUTPUT.append(header)

    consensus_to_second_max_ratio=args.ratio
    for row in reader:
        # Read the amino acid coverage for each row
        AA_coverage=np.array(map(float,list(row[8:29])))
        # Sorting the amino acids according to coverage in order to get the concesnsus and second most occuring amino acid
        index_sort=np.argsort(AA_coverage)[::-1]
        consensus=AA_list[index_sort[0]]
        second_abundant_AA=AA_list[index_sort[1]]
        consensus_coverage=AA_coverage[index_sort[0]]
        second_abundant_AA_coverage=AA_coverage[index_sort[1]]
        s='_' # The outout will be written in the format: AA1_AA2 (e.g. Arg_Lys)
        if consensus_coverage*consensus_to_second_max_ratio < second_abundant_AA_coverage:
            # Output is AA1_AA2, where AA1 is always the amino acid that comes first alphabetically (i.e. even if the consensus is Glu and the second AA is Asn, then the output is Asn_Glu (and not Glu_Asn)
            most_2_AA_Alp=s.join(sorted(list((consensus,second_abundant_AA))))
            most_2_AA_Dom=s.join(list((consensus,second_abundant_AA)))
        else:
            # The consunsus is significantly more occuring than the second AA
            most_2_AA_Alp=s.join((consensus,consensus))
            most_2_AA_Dom=s.join((consensus,consensus))

        row.append(most_2_AA_Dom)
        row.append(most_2_AA_Alp)
        OUTPUT.append(row)

output_file=args.output_file
with open(output_file,'wb') as o:
    # Writing output to tab delimited file
    writer=csv.writer(o,lineterminator='\n',delimiter='\t')
    writer.writerows(OUTPUT)
	import csv
	import numpy as np
	import argparse

	parser = argparse.ArgumentParser(description='Adding the 2MA column to anvio AA table')
	parser.add_argument('-i','--input',metavar='FILE',dest='input_file',help='Input file')
	parser.add_argument('-o','--out',metavar='FILE',dest='output_file',help='Name of file for output')
	parser.add_argument('-r','--ratio',metavar='NUMBER',dest='ratio',type=float,help='Minimal ratio between consensus and the second most covered amino-acid. If the ratio is lower than the provided threshold, then the 2MA value would be in the form concensus_concensus')
	args = parser.parse_args()

	# creating empty list for output
	OUTPUT=[]
	input_file=args.input_file

	with open(input_file) as f:
	reader=csv.reader(f,delimiter='\t')
	# Getting the list of Amino Acids from the header
	header=next(reader)
	AA_list=header[8:29] # The amino acid data is in columns 9-29 (20 AA + stop codon)
	# Appending the header to the output, with the addition of the new column. the title of the new column is specified below
	header.append('2MA_Dom')
	header.append('2MA_Alp')
	OUTPUT.append(header)

	consensus_to_second_max_ratio=args.ratio
	for row in reader:
	# Read the amino acid coverage for each row
	AA_coverage=np.array(map(float,list(row[8:29])))
	# Sorting the amino acids according to coverage in order to get the concesnsus and second most occuring amino acid
	index_sort=np.argsort(AA_coverage)[::-1]
	consensus=AA_list[index_sort[0]]
	second_abundant_AA=AA_list[index_sort[1]]
	consensus_coverage=AA_coverage[index_sort[0]]
	second_abundant_AA_coverage=AA_coverage[index_sort[1]]
	s='_' # The outout will be written in the format: AA1_AA2 (e.g. Arg_Lys)
	if consensus_coverage*consensus_to_second_max_ratio < second_abundant_AA_coverage:
	# Output is AA1_AA2, where AA1 is always the amino acid that comes first alphabetically (i.e. even if the consensus is Glu and the second AA is Asn, then the output is Asn_Glu (and not Glu_Asn)
	most_2_AA_Alp=s.join(sorted(list((consensus,second_abundant_AA))))
	most_2_AA_Dom=s.join(list((consensus,second_abundant_AA)))
	else:
	# The consunsus is significantly more occuring than the second AA
	most_2_AA_Alp=s.join((consensus,consensus))
	most_2_AA_Dom=s.join((consensus,consensus))

	row.append(most_2_AA_Dom)
	row.append(most_2_AA_Alp)
	OUTPUT.append(row)

	output_file=args.output_file
	with open(output_file,'wb') as o:
	# Writing output to tab delimited file
	writer=csv.writer(o,lineterminator='\n',delimiter='\t')
	writer.writerows(OUTPUT)