# See the help text below for usage. This script was written to build consensus taxonomy strings from all of the sequence taxonomy strings in the Silva 119 release.
#!/usr/bin/env python
# Usage: python <this_script> X Y Z A
# where X is the taxonomy mapping file for all NR seqs, Y is the representative
# sequence file (i.e. one of the rep_set/ files with the 119 release), Z is the
# OTU mapping file created by an earlier run (the name of the tool is missing
# from this comment), and A is the output consensus mapping file.
from sys import argv
from cogent.parse.fasta import MinimalFastaParser
# Label written for a rank on which the sequences of a cluster disagree.
# NOTE(review): the statement that handled a disagreeing rank was missing from
# the reviewed copy of this script, so this placeholder is an assumption --
# confirm it against the expected consensus taxonomy output before relying
# on it.
AMBIGUOUS_LABEL = "Ambiguous_taxa"


def parse_taxonomy_map(lines):
    """Return a dict mapping sequence id -> taxonomy string.

    lines: iterable of text lines. The first whitespace-delimited token of
        each line is the sequence id; the remaining tokens, re-joined with
        single spaces, form the taxonomy string.

    Blank lines are skipped (they previously raised IndexError).
    """
    id_to_taxa = {}
    for line in lines:
        tokens = line.split()
        if not tokens:
            continue
        id_to_taxa[tokens[0]] = " ".join(tokens[1:])
    return id_to_taxa


def read_rep_set_ids(labeled_seqs):
    """Return the representative sequence ids in first-seen order.

    labeled_seqs: iterable of (label, sequence) pairs, e.g. the output of
        MinimalFastaParser.

    NOTE(review): the body of the original loop was missing from the reviewed
    copy. Taking the first whitespace-delimited token of the label mirrors how
    ids are read from the taxonomy file -- confirm against the real labels.
    Duplicate ids are reported once, as the original de-duplicated via set().
    """
    seen = set()
    rep_ids = []
    for label, _seq in labeled_seqs:
        tokens = label.split()
        if not tokens or tokens[0] in seen:
            continue
        seen.add(tokens[0])
        rep_ids.append(tokens[0])
    return rep_ids


def parse_otu_map(lines, rep_ids):
    """Return a dict mapping representative id -> list of cluster member ids.

    lines: iterable of tab-separated text lines. The first column is the OTU
        id and is ignored; every remaining column is a member sequence id.
    rep_ids: iterable of representative ids to look for among the members.

    Each representative found on a line is mapped to that line's full member
    list. Blank lines are skipped.
    """
    wanted = set(rep_ids)
    clusters = {}
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        members = stripped.split("\t")[1:]
        for member in members:
            if member in wanted:
                clusters[member] = members
    return clusters


def consensus_taxonomy(taxa_lists, ambiguous_label=AMBIGUOUS_LABEL):
    """Return the ';'-joined consensus of several split taxonomy strings.

    taxa_lists: list of lists, one per sequence, each holding that sequence's
        taxonomy split into ranks.
    ambiguous_label: text written for a rank where the sequences disagree.

    The number of ranks is taken from the first entry, as in the original
    script. A rank is kept only when every sequence carries the same name
    there; a sequence with fewer ranks counts as disagreeing at the ranks it
    lacks instead of raising IndexError. Every rank is judged independently,
    so the result does not depend on the order the ranks are visited in (the
    original walked them and then reversed the collected list).
    Returns "" for an empty taxa_lists.
    """
    if not taxa_lists:
        return ""
    consensus = []
    for depth, name in enumerate(taxa_lists[0]):
        agree = all(
            len(tax) > depth and tax[depth] == name for tax in taxa_lists
        )
        consensus.append(name if agree else ambiguous_label)
    return ";".join(consensus)


def main(args):
    """Write one consensus taxonomy line per representative sequence.

    args: [taxonomy_map_path, rep_set_fasta_path, otu_map_path, output_path]

    Raises SystemExit when fewer than four paths are given, and KeyError when
    a representative id is absent from the OTU map or a cluster member is
    absent from the taxonomy map.
    """
    if len(args) < 4:
        raise SystemExit(
            "usage: python <script> taxonomy_map rep_set_fasta otu_map output"
        )
    taxonomy_fp, rep_set_fp, otu_map_fp, output_fp = args[:4]

    # "U" (universal newlines) is kept from the original; this script runs on
    # Python 2, which PyCogent requires.
    with open(taxonomy_fp, "U") as taxonomy_file:
        id_to_taxa = parse_taxonomy_map(taxonomy_file)
    with open(rep_set_fp, "U") as rep_set_file:
        rep_ids = read_rep_set_ids(MinimalFastaParser(rep_set_file))
    with open(otu_map_fp, "U") as otu_map_file:
        clusters = parse_otu_map(otu_map_file, rep_ids)

    # The context manager guarantees the output is flushed and closed.
    with open(output_fp, "w") as output_file:
        for rep_id in rep_ids:
            if rep_id not in clusters:
                raise KeyError(
                    "representative id %r not found in the OTU map" % rep_id
                )
            # NOTE(review): the split on ';' is reconstructed from the
            # ';'.join used when the consensus string is written out.
            taxa_lists = [
                id_to_taxa[member].split(";") for member in clusters[rep_id]
            ]
            output_file.write(
                "%s\t%s\n" % (rep_id, consensus_taxonomy(taxa_lists))
            )


if __name__ == "__main__":
    main(argv[1:])
