Skip to content

Instantly share code, notes, and snippets.

@Ken-Kuroki
Last active December 7, 2018 08:16
Show Gist options
  • Save Ken-Kuroki/daa1d7d5feccec9ad70aff49d3b6d83b to your computer and use it in GitHub Desktop.
Save Ken-Kuroki/daa1d7d5feccec9ad70aff49d3b6d83b to your computer and use it in GitHub Desktop.
Get Taxonomy Hierarchy Locally from NCBI Taxonomy
# Make sure you have downloaded ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip
# and extracted its contents (nodes.dmp, names.dmp and merged.dmp are read below) into ./data/.
# Note: This is just another reinvention of NCBITaxonomy of ETE Toolkit.
# http://etetoolkit.org/docs/latest/tutorial/tutorial_ncbitaxonomy.html
import pandas as pd
from collections import defaultdict
def _read_nodes(path="data/nodes.dmp"):
    """Load the taxonomy node table from a ``nodes.dmp`` file.

    Args:
        path: Location of the dump file. Defaults to ``data/nodes.dmp``,
            the location that used to be hard-coded.

    Returns:
        A DataFrame indexed by ``taxid`` with the columns ``parentid`` and
        ``rank``; ``rank`` values have surrounding whitespace removed.
    """
    # Only the first three "|"-separated fields are needed, so the remaining
    # ones are never converted into columns.
    nodes = pd.read_csv(path, sep="|", header=None, usecols=[0, 1, 2])
    nodes.columns = ["taxid", "parentid", "rank"]
    nodes = nodes.set_index("taxid")
    # The separator is a bare "|", so any whitespace padding around a value
    # stays in the text column and has to be removed here.
    nodes["rank"] = nodes["rank"].str.strip()
    return nodes
def _read_names(path="data/names.dmp"):
    """Load the scientific names from a ``names.dmp`` file.

    Args:
        path: Location of the dump file. Defaults to ``data/names.dmp``,
            the location that used to be hard-coded.

    Returns:
        A DataFrame indexed by ``taxid`` with the columns ``name`` and
        ``type``, restricted to rows whose type is ``"scientific name"``.
        Both text columns have surrounding whitespace removed.
    """
    # Fields 2 and 4 were read and then discarded before; skip them up front.
    names = pd.read_csv(path, sep="|", header=None, usecols=[0, 1, 3])
    names.columns = ["taxid", "name", "type"]
    names = names.set_index("taxid")
    # Column-wise .str.strip() replaces DataFrame.applymap, which newer pandas
    # releases deprecate, and does not fail on missing values.
    names["type"] = names["type"].str.strip()
    # Filter before cleaning the names so only the kept rows are processed.
    names = names[names["type"] == "scientific name"].copy()
    names["name"] = names["name"].str.strip()
    return names
def _read_merged(path="data/merged.dmp"):
    """Load the table of merged taxonomy ids from a ``merged.dmp`` file.

    Args:
        path: Location of the dump file. Defaults to ``data/merged.dmp``,
            the location that used to be hard-coded.

    Returns:
        A DataFrame indexed by ``original`` with a single ``mergedto``
        column holding the id each original id was merged into.
    """
    # The third "|"-separated field was read and then dropped before; it is
    # now skipped while parsing.
    merged = pd.read_csv(path, sep="|", header=None, usecols=[0, 1])
    merged.columns = ["original", "mergedto"]
    return merged.set_index("original")
def get_tax(taxid, nodes, names, merged, prev_tax=None):
    """Collect the lineage of *taxid* as a ``rank -> name`` mapping.

    Starting at *taxid*, each visited node's name is stored under its rank
    and the walk continues with the node's parent. The walk ends at the
    first node whose rank contains ``"kingdom"`` (a substring test, so
    ``"superkingdom"`` and ``"subkingdom"`` end it too) or at a node that is
    its own parent.

    Args:
        taxid: Taxonomy id to start from. Ids present in ``merged.index``
            are replaced by their ``mergedto`` value before being looked up.
        nodes: Table indexed by taxid with ``parentid`` and ``rank`` columns
            (the shape ``_read_nodes`` returns).
        names: Table indexed by taxid with a ``name`` column (the shape
            ``_read_names`` returns).
        merged: Table indexed by the old taxid with a ``mergedto`` column
            (the shape ``_read_merged`` returns).
        prev_tax: Optional mapping that the lineage is written into. When
            omitted, a new ``defaultdict`` that yields ``""`` for ranks not
            present in the lineage is created.

    Returns:
        The mapping that was filled in. If several visited nodes share a
        rank, the one visited last (closest to the root) is kept.

    Raises:
        KeyError: If a visited taxid is missing from *nodes* or *names*.
    """
    if prev_tax is None:
        # Created per call instead of as a default argument so that separate
        # lookups never share one mutable mapping.
        prev_tax = defaultdict(str)

    while True:
        # Follow merge records repeatedly in case the replacement id has
        # itself been merged into another one.
        while taxid in merged.index:
            taxid = merged.at[taxid, "mergedto"]

        rank = nodes.at[taxid, "rank"]
        prev_tax[rank] = names.at[taxid, "name"]
        if "kingdom" in rank:
            return prev_tax

        parent = nodes.at[taxid, "parentid"]
        if parent == taxid:
            # A self-parented node has nothing above it. Without this stop a
            # lineage lacking any "kingdom" rank would never terminate.
            return prev_tax
        taxid = parent
# Load the three lookup tables once, in the order nodes, names, merged;
# every get_tax call below reuses them.
nodes, names, merged = _read_nodes(), _read_names(), _read_merged()

# Example lookup: pass any taxid together with the loaded tables.
get_tax(562, nodes=nodes, names=names, merged=merged)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment