Skip to content

Instantly share code, notes, and snippets.

@evanroyrees
Created February 20, 2020 00:41
Show Gist options
  • Save evanroyrees/8d7b81d934c7a10fe4d60b2b3e6f5ba8 to your computer and use it in GitHub Desktop.
Save evanroyrees/8d7b81d934c7a10fe4d60b2b3e6f5ba8 to your computer and use it in GitHub Desktop.
Convert old taxids to new taxids given an autometa LCA output table and NCBI taxdump's merged.dmp file
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Replace merged (old) taxids in an Autometa LCA output table with their new taxids from NCBI's merged.dmp before running `add_contig_taxonomy.py`
"""
import logging
import os
from tqdm import tqdm
logger = logging.getLogger(__name__)
def parse_merged(fpath):
    """Construct merged.dmp file dictionary.

    Parameters
    ----------
    fpath : str
        </path/to/merged.dmp>

    Returns
    -------
    dict
        {old_taxid:new_taxid, ...}

    Raises
    ------
    ValueError
        If a non-blank line does not hold exactly two integer fields.
    """
    merged = {}
    # The context manager closes the handle even when a malformed line raises.
    with open(fpath) as fh:
        for line in tqdm(fh, desc='parsing merged.dmp', leave=False):
            # Peel the leading/trailing tab, pipe and newline characters, then
            # split the remainder on the "\t|\t" field separator.
            record = line.strip('\t|\n')
            if not record.strip():
                # Tolerate blank lines (e.g. an empty line at end of file).
                continue
            old_taxid, new_taxid = [int(taxid) for taxid in record.split('\t|\t')]
            merged[old_taxid] = new_taxid
    logger.info('merged loaded')
    return merged
def parse_lca(fpath, merged):
    """Rewrite the taxid column of an LCA table using merged taxid mappings.

    Parameters
    ----------
    fpath : str
        </path/to/autometa.lca.tsv> holding four tab-separated fields per
        line: orf, name, rank, taxid (taxid must parse as an integer).
    merged : dict
        {old_taxid:new_taxid, ...}

    Returns
    -------
    str
        The table contents, one newline-terminated line per input record,
        with every taxid found in `merged` replaced by its new taxid.

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly four tab-separated fields
        or its taxid is not an integer.
    """
    records = []
    with open(fpath) as fh:
        for line in fh:
            line = line.strip()
            if not line:
                # Blank lines carry no record; skip them instead of failing
                # on the four-field unpacking below.
                continue
            orf, name, rank, taxid = line.split('\t')
            taxid = int(taxid)
            if taxid in merged:
                new_taxid = merged[taxid]
                logger.info('translated %s -> %s', taxid, new_taxid)
                taxid = new_taxid
            records.append('\t'.join([orf, name, rank, str(taxid)]) + '\n')
    # Join once at the end rather than growing a string inside the loop.
    return ''.join(records)
def main(args):
    """Translate merged taxids in an LCA table and write the result to a new file.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide the path attributes `lca`, `merged` and `out`.

    Raises
    ------
    SystemExit
        With exit code 1 if `args.lca` and `args.out` are the same string.
    FileExistsError
        If `args.out` already exists.
    """
    if args.lca == args.out:
        logger.error('Provided lca: ({}) must not match out: ({})'.format(args.lca, args.out))
        raise SystemExit(1)
    # Fail fast, before any parsing, when the output path is already taken.
    if os.path.exists(args.out):
        raise FileExistsError(args.out)
    merged = parse_merged(args.merged)
    lines = parse_lca(args.lca, merged)
    # Mode 'x' (exclusive creation) keeps the no-overwrite guarantee even if
    # the file appeared while the inputs were being parsed.
    with open(args.out, 'x') as out:
        out.write(lines)
    logger.info('Written: {}'.format(args.out))
if __name__ == '__main__':
    import argparse

    # Configure logging through the `logging` module so the module-level
    # `logger` stays a Logger and reports its own name in `%(name)s`.
    logging.basicConfig(
        format='%(asctime)s : %(name)s : %(levelname)s : %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p',
        level=logging.DEBUG)
    parser = argparse.ArgumentParser(
        'merge_nodes.py',
        description='convert old taxids in Autometa LCA output file to new versions using NCBI taxdump database file merged.dmp')
    parser.add_argument('lca', help='</path/to/autometa.lca.tsv>')
    parser.add_argument('merged', help='</path/to/merged.dmp>')
    parser.add_argument('out', help='</path/to/output.tsv>')
    args = parser.parse_args()
    main(args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment