Skip to content

Instantly share code, notes, and snippets.

@whacked
Created February 6, 2015 16:17
Show Gist options
  • Save whacked/86d764a9e2b2607da741 to your computer and use it in GitHub Desktop.
Save whacked/86d764a9e2b2607da741 to your computer and use it in GitHub Desktop.
traverse ncbi taxonomy?
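'''
Walk the NCBI taxonomy upward from a taxon ID until a genus-rank node is found.

Reads nodes.dmp and names.dmp from the current working directory. Samples:

> head nodes.dmp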
1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | |
2 | 131567 | superkingdom | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | |
6 | 335928 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
7 | 6 | species | AC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | |
9 | 32199 | species | BA | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | |
10 | 135621 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
11 | 1707 | species | CG | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | |
13 | 203488 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
14 | 13 | species | DT | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | |
16 | 32011 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
> head names.dmp
1 | all | | synonym |
1 | root | | scientific name |
2 | Bacteria | Bacteria <prokaryote> | scientific name |
2 | Monera | Monera <Bacteria> | in-part |
2 | Procaryotae | Procaryotae <Bacteria> | in-part |
2 | Prokaryota | Prokaryota <Bacteria> | in-part |
2 | Prokaryotae | Prokaryotae <Bacteria> | in-part |
2 | bacteria | bacteria <blast2> | blast name |
2 | eubacteria | | genbank common name |
2 | not Bacteria Haeckel 1894 | | synonym |
'''
from time import time as now
# Read both dump files into memory once. Merely naming the variables raises
# NameError when they do not exist yet; if the script is re-executed in a
# namespace that already holds them, the (slow) read is skipped.
try:
    rawnodes, rawnames
except NameError:
    # Only "not defined yet" triggers a load; any other error propagates.
    t0 = now()
    # Context managers close the handles instead of leaking them.
    with open('nodes.dmp') as nodes_file:
        rawnodes = nodes_file.read()
    with open('names.dmp') as names_file:
        rawnames = names_file.read()
    print('loaded files in %s sec' % (now() - t0))
else:
    print('files already loaded')
def _build_namemap(raw):
    """Return a ``{tax_id: name}`` dict parsed from the text of names.dmp.

    Each row is ``tax_id | name | unique name | name class |``. A taxon has
    several rows (synonyms, common names, ...); the row whose name class is
    ``scientific name`` wins. A taxon without such a row keeps the name from
    its last row. Blank lines are ignored.
    """
    names = {}
    scientific = {}
    for line in raw.splitlines():
        fields = [part.strip() for part in line.split('|')]
        if len(fields) < 2:
            continue  # blank line: no delimiter, nothing to parse
        tax_id = int(fields[0])
        names[tax_id] = fields[1]
        if len(fields) > 3 and fields[3] == 'scientific name':
            scientific[tax_id] = fields[1]
    # Overlay last so a scientific name is never overwritten by a later synonym.
    names.update(scientific)
    return names


# Build the ID -> name lookup once; skipped when `namemap` already exists in
# the namespace (naming it raises NameError otherwise).
try:
    namemap
except NameError:
    t0 = now()
    namemap = _build_namemap(rawnames)
    print('loaded name map in %s sec' % (now() - t0))
else:
    print('name map already loaded')
# Build two lookups from nodes.dmp, whose first three columns are
# ``tax_id | parent tax_id | rank``:
#   genusmap:  tax_id -> rank string (every rank, not only 'genus')
#   parentmap: tax_id -> parent tax_id
# Skipped when both already exist in the namespace.
try:
    genusmap, parentmap
except NameError:
    # Only "not defined yet" triggers a rebuild; any other error propagates.
    t0 = now()
    genusmap = {}
    parentmap = {}
    for line in rawnodes.splitlines():
        if not line.strip():
            continue  # tolerate blank lines instead of failing the unpack
        child, parent, category = [part.strip() for part in line.split('|', 3)[:3]]
        child = int(child)
        genusmap[child] = category
        parentmap[child] = int(parent)
    print('loaded genus+parent map in %s sec' % (now() - t0))
else:
    print('genus+parent map already loaded')
# Set to True to trace every node visited by query().
VERBOSE = False


def query(node_id, iter=0):
    """Climb from ``node_id`` towards the root and print the first genus met.

    ``iter`` is the current recursion depth; callers normally leave it at 0.
    The walk ends silently (unless VERBOSE) when a node has no parent entry
    or is its own parent. Nothing is returned; results are printed.
    """
    if iter > 10 and VERBOSE:
        print(' * * * warning! big tree! * * *')

    arrows = '>>' * (iter + 1)
    rank = genusmap.get(node_id)
    if VERBOSE:
        print('%s ID %s\tis type: %s' % (arrows, node_id, rank))

    if rank == 'genus':
        print('|__ found genus at %s\n' % (namemap[node_id]))
        return

    if node_id not in parentmap:
        if VERBOSE:
            print(' * * * warning: no parent! * * *')
        return

    parent_id = parentmap[node_id]
    if parent_id == node_id:
        # A node that is its own parent marks the top of the tree.
        if VERBOSE:
            print(' ! ! ! end of tree ! ! !')
        return

    query(parent_id, iter + 1)
import random

# Demo: look up the genus for ten randomly chosen taxon IDs.
# random.choice needs an indexable sequence; on Python 3 dict.keys() is a
# view and raises TypeError, so materialize the keys into a list first.
allid = list(namemap)
if allid:
    for _ in range(10):
        rand_id = random.choice(allid)
        print('=' * 50)
        print('querying: %s\t%s' % (rand_id, namemap[rand_id]))
        query(rand_id)
else:
    # random.choice would raise IndexError on an empty sequence.
    print('no taxon IDs loaded; nothing to query')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment