Skip to content

Instantly share code, notes, and snippets.

@walterst
Created July 8, 2015 18:41
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Embed
What would you like to do?
See help text below about usage. Script was created to create consensus taxonomy strings for all sequence taxa strings in the Silva 119 release.
#!/usr/bin/env python
# USAGE
# python create_consensus_taxonomy.py X Y Z A
# where X is the taxonomy mapping file for all NR seqs, Y is the representative
# file (i.e. one of the rep_set/ files with the 119 release), Z is the OTU
# mapping file created from running pick_otus.py, and A is the output
# consensus mapping file
from sys import argv
from cogent.parse.fasta import MinimalFastaParser
silva_taxa = open(argv[1], "U")
id_to_taxa = {}
for line in silva_taxa:
curr_id = line.split()[0].strip()
curr_taxa = " ".join(line.split()[1:]).strip()
id_to_taxa[curr_id] = curr_taxa
rep_set_fasta = open(argv[2], "U")
ordered_ids = []
for label,seq in MinimalFastaParser(rep_set_fasta):
ordered_ids.append(label)
ordered_ids = set(ordered_ids)
otu_mapping = open(argv[3], "U")
matched_ids = {}
for line in otu_mapping:
if len(line.strip()) == 0:
continue
curr_line = line.strip().split('\t')
curr_otu = curr_line[0]
all_seqs = curr_line[1:]
for seq in all_seqs:
if seq in ordered_ids:
matched_ids[seq] = all_seqs
output_taxa_mapping = open(argv[4], "w")
for id in ordered_ids:
taxa_data = []
final_taxa_string = []
for curr_seq in matched_ids[id]:
# Used example from http://stackoverflow.com/questions/3844801/check-if-all-elements-in-a-list-are-identical for comparing equality of elements
# Have to step backwards through the bottom taxa up to the top taxa
taxa_data.append(id_to_taxa[curr_seq].split(';')[::-1])
levels = len(taxa_data[0])
final_taxa_string = []
for n in range(levels):
curr_taxa_strings = []
for tax in taxa_data:
curr_taxa_strings.append(tax[n])
match = all(x==curr_taxa_strings[0] for x in curr_taxa_strings)
if match:
final_taxa_string.append(curr_taxa_strings[0])
else:
final_taxa_string.append("Ambiguous_taxa")
fixed_taxa_string = ";".join(final_taxa_string[::-1])
output_taxa_mapping.write("%s\t%s\n" % (id, fixed_taxa_string))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment