Skip to content

Instantly share code, notes, and snippets.

@ShaiberAlon
Created October 28, 2016 16:09
Show Gist options
  • Save ShaiberAlon/ced1e9be654af164edeb91daa360fcfa to your computer and use it in GitHub Desktop.
Save ShaiberAlon/ced1e9be654af164edeb91daa360fcfa to your computer and use it in GitHub Desktop.
import csv
import numpy as np
import argparse
parser = argparse.ArgumentParser(description='Adding the 2MA column to anvio AA table')
parser.add_argument('-i','--input',metavar='FILE',dest='input_file',help='Input file')
parser.add_argument('-o','--out',metavar='FILE',dest='output_file',help='Name of file for output')
parser.add_argument('-r','--ratio',metavar='NUMBER',dest='ratio',type=float,help='Minimal ratio between consensus and the second most covered amino-acid. If the ratio is lower than the provided threshold, then the 2MA value would be in the form concensus_concensus')
args = parser.parse_args()
# creating empty list for output
OUTPUT=[]
input_file=args.input_file
with open(input_file) as f:
reader=csv.reader(f,delimiter='\t')
# Getting the list of Amino Acids from the header
header=next(reader)
AA_list=header[8:29] # The amino acid data is in columns 9-29 (20 AA + stop codon)
# Appending the header to the output, with the addition of the new column. the title of the new column is specified below
header.append('2MA_Dom')
header.append('2MA_Alp')
OUTPUT.append(header)
consensus_to_second_max_ratio=args.ratio
for row in reader:
# Read the amino acid coverage for each row
AA_coverage=np.array(map(float,list(row[8:29])))
# Sorting the amino acids according to coverage in order to get the concesnsus and second most occuring amino acid
index_sort=np.argsort(AA_coverage)[::-1]
consensus=AA_list[index_sort[0]]
second_abundant_AA=AA_list[index_sort[1]]
consensus_coverage=AA_coverage[index_sort[0]]
second_abundant_AA_coverage=AA_coverage[index_sort[1]]
s='_' # The outout will be written in the format: AA1_AA2 (e.g. Arg_Lys)
if consensus_coverage*consensus_to_second_max_ratio < second_abundant_AA_coverage:
# Output is AA1_AA2, where AA1 is always the amino acid that comes first alphabetically (i.e. even if the consensus is Glu and the second AA is Asn, then the output is Asn_Glu (and not Glu_Asn)
most_2_AA_Alp=s.join(sorted(list((consensus,second_abundant_AA))))
most_2_AA_Dom=s.join(list((consensus,second_abundant_AA)))
else:
# The consunsus is significantly more occuring than the second AA
most_2_AA_Alp=s.join((consensus,consensus))
most_2_AA_Dom=s.join((consensus,consensus))
row.append(most_2_AA_Dom)
row.append(most_2_AA_Alp)
OUTPUT.append(row)
output_file=args.output_file
with open(output_file,'wb') as o:
# Writing output to tab delimited file
writer=csv.writer(o,lineterminator='\n',delimiter='\t')
writer.writerows(OUTPUT)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment