Skip to content

Instantly share code, notes, and snippets.

@sminot
Last active December 8, 2017 23:08
Show Gist options
  • Save sminot/680a354fbf1f3bb25220577718bd0187 to your computer and use it in GitHub Desktop.
Save sminot/680a354fbf1f3bb25220577718bd0187 to your computer and use it in GitHub Desktop.
Make a taxonomy file compatible with mothur
#!/usr/bin/python
"""Make a taxonomy file compatible with mothur."""
import os
import sys
import pandas as pd
# Abort with a usage message unless exactly three arguments were supplied.
# Merely printing the message and carrying on would fail on the sys.argv
# lookups that follow. sys.exit(str) writes the message to stderr and exits
# with status 1.
if len(sys.argv) != 4:
    sys.exit("Please specify the seq_info.csv, tax_info.csv, and output.tsv files")
# Get the filepaths
seqinfo_path = sys.argv[1]
taxinfo_path = sys.argv[2]
output = sys.argv[3]

# Fail with a readable message (instead of a bare assert, which is stripped
# under `python -O`) when an input file is missing.
for path in (seqinfo_path, taxinfo_path):
    if not os.path.exists(path):
        sys.exit("Input file not found: {}".format(path))

# Read in the tables
# The taxonomy table is indexed by tax_id so lineages can be looked up by id;
# parent_id and tax_name are the columns read while walking a lineage.
taxinfo = pd.read_csv(taxinfo_path)
missing = [
    col for col in ("tax_id", "parent_id", "tax_name")
    if col not in taxinfo.columns
]
if missing:
    sys.exit("{} is missing required column(s): {}".format(
        taxinfo_path, ", ".join(missing)))
taxinfo.set_index("tax_id", inplace=True)

# The sequence table supplies the seqname -> tax_id mapping written to the output.
seqinfo = pd.read_csv(seqinfo_path)
missing = [
    col for col in ("seqname", "tax_id")
    if col not in seqinfo.columns
]
if missing:
    sys.exit("{} is missing required column(s): {}".format(
        seqinfo_path, ", ".join(missing)))
# Function to generate the taxonomy string for a certain taxid
def taxonomy_string(taxid, taxinfo, sep=';', root=1):
    """Return the root-first lineage of `taxid` as a `sep`-terminated string.

    Starting at `taxid`, follows the "parent_id" column of `taxinfo` upwards,
    collecting the "tax_name" of every node visited. The walk stops at `root`,
    at a node that lists itself as its own parent, or when a parent id is not
    present in the table.

    Args:
        taxid: Id to start from; looked up in `taxinfo.index`.
        taxinfo: pandas DataFrame indexed by taxonomy id, with "tax_name" and
            "parent_id" columns.
        sep: Separator placed between names and appended after the last one.
        root: Id of the node every lineage is expected to end at.

    Returns:
        The collected names ordered from root down to `taxid`, joined with
        `sep` and followed by a trailing `sep`, e.g. "root;Bacteria;".

    Raises:
        KeyError: If `taxid` (or `root`) is not in `taxinfo.index`.
        ValueError: If the parent links form a cycle, or the lineage does not
            end at a node named like `root`.
    """
    # Index membership is a hash lookup; `index.values` would scan the whole
    # array on every step of the walk.
    known_ids = taxinfo.index
    if taxid not in known_ids:
        raise KeyError(
            "tax_id {} not found in the taxonomy table".format(taxid))

    names = []
    visited = set()
    while taxid in known_ids:
        # Guard against parent links that loop back on themselves, which
        # would otherwise never terminate.
        if taxid in visited:
            raise ValueError(
                "cycle detected in parent_id links at tax_id {}".format(taxid))
        visited.add(taxid)

        names.append(taxinfo.loc[taxid, "tax_name"])
        if taxid == root:
            break
        parent = taxinfo.loc[taxid, "parent_id"]
        if parent == taxid:
            break
        taxid = parent

    # The last name collected is the top of the lineage; it must be the root.
    if names[-1] != taxinfo.loc[root, "tax_name"]:
        raise ValueError(
            "lineage ends at {!r} instead of the root node".format(names[-1]))

    names.reverse()
    return sep.join(names) + sep
# Emit one "<seqname><TAB><taxonomy string>" line per row of seqinfo.
# Lineages are memoised per tax_id so each one is computed only once.
cache = {}
with open(output, "wt") as fo:
    for _, row in seqinfo.iterrows():
        tax_id = row["tax_id"]
        if tax_id not in cache:
            cache[tax_id] = taxonomy_string(tax_id, taxinfo)
        lineage = cache[tax_id]
        fo.write("{}\t{}\n".format(row["seqname"], lineage))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment