Skip to content

Instantly share code, notes, and snippets.

@biocyberman
Last active June 27, 2020 21:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save biocyberman/13cb3ca5fdd055bf213711b93e9e6b81 to your computer and use it in GitHub Desktop.
Save biocyberman/13cb3ca5fdd055bf213711b93e9e6b81 to your computer and use it in GitHub Desktop.
"""
Compare mutations from augur vs direct naive calcuation
"""
import sys
from Bio import Phylo, SeqIO
import numpy as np
from augur.translate import construct_mut
from augur.utils import read_node_data
from augur.clades import is_node_in_clade, get_reference_sequence_from_root_node
import argparse
from pathlib import Path
def check_muts_with_aln(tree, aln, all_muts, ref):
direct_mutations = {}
# prepare the nodes
for node in tree.get_nonterminals():
for c in node:
c.up = node
c.up.clade = ""
c.up.level = 0
c.up.accumulated_muts = set()
tree.root.up = None
tree.root.sequences = {'nuc': {}}
# Compare mutations between those computed by treetime and those computed directly (still use treetime ancestral sequence)
for node in tree.find_clades(order='preorder'):
if node.is_terminal():
node_muts = {construct_mut(a, int(pos+1), d) for pos, (a,d) in
enumerate(zip(ref['nuc'], aln[node.name]))
if a!=d and a != 'N' and d != 'N' }
direct_mutations[node.name] = node_muts
if node.up:
node.sequences = {gene: muts.copy() for gene, muts in node.up.sequences.items()}
node.accumulated_muts = node.up.accumulated_muts.union(all_muts[node.name]['muts'])
acc_muts = node.accumulated_muts
if not acc_muts.issubset(node_muts) and node.is_terminal():
print(f'Timetree accumulated mutations do not match direct mutations for {node.name}:\n'
f'Unique for timetree: {acc_muts.difference(node_muts)}\n'
f'Intersection: {acc_muts.intersection(node_muts)}\n'
f'Unique for direct: {node_muts.difference(acc_muts)}\n')
nn = node.name
if nn == "Hungary/SRC-00817/2020":
p = [n for n in tree.get_path(node)]
names = [n.name for n in p]
tmp = {n: all_muts[n]['muts'] for n in names }
print(f"Debug, Node Path: {','.join(names)}\n")
for n in p:
print(f"{n.name}:{all_muts[n.name]['muts']}")
for c in n:
print(f"\t\t{all_muts[c.name]['muts']}({c.name})")
return direct_mutations
def check_muts(all_muts, tree, ref=None):
direct_mutations = {}
# prepare the nodes
for node in tree.get_nonterminals():
for c in node:
c.up = node
c.up.clade = ""
c.up.level = 0
c.up.accumulated_muts = set()
tree.root.up = None
tree.root.sequences = {'nuc': {}}
# Compare mutations between those computed by treetime and those computed directly (still use treetime ancestral sequence)
for node in tree.find_clades(order='preorder'):
node_muts = {construct_mut(a, int(pos+1), d) for pos, (a,d) in
enumerate(zip(ref['nuc'], all_muts[node.name]['sequence']))
if a!=d and a != 'N' and d != 'N' }
direct_mutations[node.name] = node_muts
if node.up:
node.sequences = {gene: muts.copy() for gene, muts in node.up.sequences.items()}
node.accumulated_muts = node.up.accumulated_muts.union(all_muts[node.name]['muts'])
acc_muts = node.accumulated_muts
if not acc_muts.issubset(node_muts) and node.is_terminal():
print(f'Timetree accumulated mutations do not match direct mutations for {node.name}:\n'
f'Unique for timetree: {acc_muts.difference(node_muts)}\n'
f'Intersection: {acc_muts.intersection(node_muts)}\n'
f'Unique for direct: {node_muts.difference(acc_muts)}\n')
nn = node.name
if nn == "Hungary/SRC-00817/2020":
p = [n for n in tree.get_path(node)]
names = [n.name for n in p]
tmp = {n: all_muts[n]['muts'] for n in names }
print(f"Debug, Node Path: {','.join(names)}\n")
for n in p:
print(f"{n.name}:{all_muts[n.name]['muts']}")
for c in n:
print(f"\t\t{all_muts[c.name]['muts']}({c.name})")
return direct_mutations
def register_arguments(parser):
parser.add_argument('-t', '--tree', help="prebuilt Newick -- no tree will be built if provided")
parser.add_argument('-m', '--mutations', nargs='+',
help='JSON(s) containing ancestral and tip nucleotide and/or amino-acid mutations ')
parser.add_argument('-r', '--reference',
help='fasta files containing reference and tip nucleotide and/or amino-acid sequences ')
parser.add_argument('-a', '--alignment', help="alignment in fasta format")
def run(args):
## read tree and data, if reading data fails, return with error code
tree = Phylo.read(args.tree, 'newick')
node_data = read_node_data(args.mutations, args.tree)
refseq = {}
refseq['nuc'] = SeqIO.read(args.reference, 'genbank').seq
if node_data is None:
print("ERROR: could not read node data (incl sequences)")
return 1
all_muts = node_data['nodes']
from Bio import AlignIO
alignment = AlignIO.read(open(args.alignment), "fasta")
aln_seq = {}
for s in alignment:
aln_seq[s.id] = s.seq
# extract reference sequences from the root node entry in the mutation json
# if this doesn't exist, it will complain but not error.
ref = get_reference_sequence_from_root_node(all_muts, tree.root.name)
# assert(refseq['nuc'] == "".join(ref['nuc']))
# aln_mutations = check_muts_with_aln(tree, aln_seq, all_muts, refseq)
direct_mutations = check_muts(all_muts, tree, refseq)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog="check-muts",
description="Test mutations")
args = register_arguments(parser)
exit(run(parser.parse_args()))
Timetree accumulated mutations do not match direct mutations for Singapore/15/2020:
Unique for timetree: {'C28144-', 'T28144C'}
Intersection: {'T27931-', 'T28039-', 'G27967-', 'A27979-', 'G28075-', 'T28046-', 'G27955-', 'A28213-', 'G28193-', 'A28152-', 'A28096-', 'G28122-', 'G27906-', 'C27884-', 'G27890-', 'G28179-', 'C28099-', 'C27923-', 'T28067-', 'T28150-', 'T28217-', 'G28038-', 'T28010-', 'T28215-', 'C27998-', 'T28208-', 'A28024-', 'A28097-', 'A27994-', 'T28061-', 'T27901-', 'T27958-', 'T28004-', 'G28003-', 'T28200-', 'A28070-', 'C28164-', 'C28146-', 'T28157-', 'G28212-', 'G28191-', 'A28025-', 'A28222-', 'A27886-', 'C28057-', 'C28115-', 'A27938-', 'A27872-', 'T28082-', 'G28183-', 'C28045-', 'A28207-', 'A28018-', 'C28147-', 'G27936-', 'T27895-', 'G28221-', 'T27910-', 'C27970-', 'G28085-', 'T27929-', 'T28216-', 'A28210-', 'T28014-', 'C27864-', 'A27983-', 'T27902-', 'T28029-', 'T27866-', 'G28123-', 'A28133-', 'C28138-', 'A28175-', 'A28081-', 'C27999-', 'A27871-', 'A28168-', 'G28044-', 'C28153-', 'T27953-', 'T27869-', 'T28020-', 'G28188-', 'A27892-', 'A27927-', 'A28058-', 'A28064-', 'A27961-', 'T27885-', 'A27950-', 'A27924-', 'C27934-', 'T28229-', 'T28017-', 'G27882-', 'T28149-', 'G28162-', 'C28227-', 'T28071-', 'T28098-', 'T28088-', 'T28063-', 'G28223-', 'A27980-', 'C27978-', 'T27908-', 'G28198-', 'C27942-', 'A27859-', 'C27881-', 'A27921-', 'A28008-', 'T27907-', 'C28194-', 'A28155-', 'T28062-', 'G28166-', 'T28137-', 'A28051-', 'G28079-', 'A27873-', 'A28225-', 'T28053-', 'A27862-', 'C27975-', 'T28143-', 'A27932-', 'T27988-', 'G27990-', 'A28035-', 'A27949-', 'A27853-', 'T27863-', 'A27973-', 'T27909-', 'A28065-', 'G28028-', 'T27995-', 'C27928-', 'G28167-', 'A28211-', 'T28072-', 'C28163-', 'A28165-', 'A28040-', 'T28172-', 'A28182-', 'T28031-', 'T28007-', 'T28161-', 'A27947-', 'T28114-', 'T28160-', 'A27865-', 'T28022-', 'T28074-', 'T28206-', 'C27893-', 'G28080-', 'T28196-', 'T27935-', 'A27959-', 'A28037-', 'C28059-', 'C28076-', 'T28224-', 'G28180-', 'G28041-', 'C27879-', 'A28126-', 'T28105-', 'C28171-', 'A28159-', 'C27972-', 'A27887-', 'G28077-', 'T28091-', 'C28011-', 'T28177-', 'T28135-', 'A27969-', 'A28084-', 'T28128-', 'A28158-', 'A27918-', 'G28073-', 'G27896-', 'T28002-', 'T28127-', 'A27989-', 'T28203-', 'C28000-', 'T27991-', 'T28110-', 'G27933-', 'C27964-', 'T27856-', 'T28176-', 'T28140-', 'A28108-', 'A28069-', 'T27912-', 'A27926-', 'A27899-', 'G27948-', 'C27944-', 'C28112-', 'A27848-', 'C27903-', 'A27965-', 'G28042-', 'A28145-', 'G28109-', 'T28151-', 'A27880-', 'T27984-', 'C28205-', 'T28118-', 'C28201-', 'G27861-', 'A28095-', 'A28052-', 'T28120-', 'T28124-', 'G28195-', 'C28102-', 'G27996-', 'T27963-', 'T27851-', 'A28104-', 'C28132-', 'G28001-', 'C28006-', 'T28192-', 'G27870-', 'C28214-', 'T27913-', 'T27875-', 'T28204-', 'T27977-', 'T28197-', 'A27917-', 'T28218-', 'T27971-', 'C28170-', 'C28005-', 'A28030-', 'A28047-', 'C28060-', 'A28125-', 'T27986-', 'T28066-', 'G28089-', 'T28189-', 'A28050-', 'C28139-', 'T28033-', 'C27883-', 'A27868-', 'G27916-', 'T27968-', 'C27849-', 'T28136-', 'T27957-', 'C27982-', 'C28185-', 'G28036-', 'T28130-', 'G27930-', 'T27876-', 'A27888-', 'C27911-', 'A28154-', 'A28174-', 'T27900-', 'T28019-', 'A27898-', 'C28054-', 'T27905-', 'A28055-', 'A27974-', 'G28083-', 'A28129-', 'A27914-', 'G27962-', 'G27877-', 'G28086-', 'T27951-', 'T27904-', 'T28156-', 'T27919-', 'A27976-', 'A27894-', 'T28094-', 'A28117-', 'C8782T', 'G28090-', 'A28228-', 'T27956-', 'C28016-', 'T27941-', 'C28107-', 'C27874-', 'T28078-', 'A28169-', 'G28048-', 'T28009-', 'C28087-', 'G28141-', 'C28021-', 'T28092-', 'T28034-', 'T27992-', 'G28178-', 'T27878-', 'C28013-', 'T28186-', 'G28134-', 'C27945-', 'G27993-', 'G27852-', 'G27952-', 'C28101-', 'T27939-', 'T28184-', 'C28103-', 'T28226-', 'A28220-', 'A27897-', 'C27855-', 'T28026-', 'T28106-', 'C27858-', 'G28056-', 'T27940-', 'T28199-', 'A28012-', 'A27854-', 'T27850-', 'T28015-', 'A28111-', 'A27985-', 'T28142-', 'C28093-', 'C28121-', 'A28100-', 'G27987-', 'A27943-', 'A28023-', 'A28173-', 'C27960-', 'T27966-', 'A28131-', 'T28219-', 'A27946-', 'C27925-', 'G28068-', 'A28113-', 'C27920-', 'A27891-', 'G27915-', 'G28027-', 'A28119-', 'C27937-', 'A27860-', 'C27889-', 'G27857-', 'A28190-', 'A28032-', 'T28181-', 'A28049-', 'T27922-', 'A27997-', 'G28209-', 'G28202-', 'T28148-', 'A27954-', 'A27867-', 'A28043-', 'C27981-', 'G28116-', 'T28187-'}
Unique for direct: {'T28144-'}
Timetree accumulated mutations do not match direct mutations for UnitedArabEmirates/L4682/2020:
Unique for timetree: {'G11083T', 'T11083G'}
Intersection: {'T28688C', 'G1397A', 'G2246A', 'G29742T', 'C9246T'}
Unique for direct: set()
Timetree accumulated mutations do not match direct mutations for Iran/HGRC-2-2162/2020:
Unique for timetree: {'G11083T', 'T11083G'}
Intersection: {'G20887A', 'T28688C', 'C21627T', 'G1397A', 'C22735T', 'T8707C', 'G29742T', 'C28830T'}
Unique for direct: set()
Timetree accumulated mutations do not match direct mutations for Iran/KHGRC-2-2162/2020:
Unique for timetree: {'G11083T', 'T11083G'}
Intersection: {'G20887A', 'T28688C', 'C21627T', 'G1397A', 'C22735T', 'T8707C', 'G29742T', 'C28830T'}
Unique for direct: set()
Timetree accumulated mutations do not match direct mutations for Kazakhstan/NCB-1/2020:
Unique for timetree: {'G29742T', 'T29742G'}
Intersection: {'G11083T', 'T28688C', 'T16272G', 'T3333-', 'C884T', 'C4234T', 'G1397A', 'G3335-', 'G8653T', 'T3334-', 'C17141A'}
Unique for direct: set()
Timetree accumulated mutations do not match direct mutations for UnitedArabEmirates/L0231/2020:
Unique for timetree: {'G11083T', 'T11083G'}
Intersection: {'T28688C', 'C884T', 'G27915T', 'C28868T', 'G1397A', 'A19073G', 'G8653T', 'G29742T', 'C29077T'}
Unique for direct: set()
Timetree accumulated mutations do not match direct mutations for SriLanka/COV486/2020:
Unique for timetree: {'G11083T', 'T11083G'}
Intersection: {'C14805T', 'C1762T', 'T17247C', 'G26144T'}
Unique for direct: set()
Timetree accumulated mutations do not match direct mutations for Philippines/RITM-03/2020:
Unique for timetree: {'G11083T', 'T11083G'}
Intersection: {'A6659G', 'C23929T', 'C28311T', 'C13730T', 'C26425T', 'G27463T', 'G12793T', 'C6312A'}
Unique for direct: set()
Timetree accumulated mutations do not match direct mutations for DRC/3451/2020:
Unique for timetree: {'T241-', 'C241T'}
Intersection: {'C583T', 'A23403G', 'C14408T', 'G4180T', 'T16983A', 'C3037T', 'G28202A', 'C15324T'}
Unique for direct: {'C241-'}
Timetree accumulated mutations do not match direct mutations for Brazil/CV4/2020:
Unique for timetree: {'T241C', 'C241T'}
Intersection: {'A23403G', 'G25563T', 'C18877T', 'C14408T', 'C3037T'}
Unique for direct: set()
Timetree accumulated mutations do not match direct mutations for Austria/Graz-MUG6/2020:
Unique for timetree: {'C3037T', 'T3037C'}
Intersection: {'G24197T', 'A23403G', 'G25563T', 'C18877T', 'G15957T', 'C14408T', 'C241T', 'C26735T'}
Unique for direct: set()
Timetree accumulated mutations do not match direct mutations for Austria/Graz-MUG9/2020:
Unique for timetree: {'C3037T', 'T3037C'}
Intersection: {'G24197T', 'A23403G', 'G25563T', 'C18877T', 'G15957T', 'C14408T', 'C241T', 'C26735T'}
Unique for direct: set()
Timetree accumulated mutations do not match direct mutations for Austria/Graz-MUG8/2020:
Unique for timetree: {'C3037T', 'T3037C'}
Intersection: {'G24197T', 'A23403G', 'G25563T', 'C18877T', 'G15957T', 'C14408T', 'C241T', 'C26735T'}
Unique for direct: set()
Timetree accumulated mutations do not match direct mutations for BosniaandHerzegovina/01-Livno/2020:
Unique for timetree: {'C3037T', 'T3037C', 'T241C', 'C241T'}
Intersection: {'G28882A', 'C25614T', 'C29838-', 'A6867G', 'T29840-', 'T29834-', 'G29841-', 'A29844-', 'T25533A', 'C29836-', 'T29845-', 'T29847-', 'T29842-', 'T26300C', 'T29832-', 'C29831-', 'G28881A', 'T29848-', 'T29846-', 'G29843-', 'A23403G', 'C29837-', 'C29835-', 'A29833-', 'C14408T', 'A29839-', 'G28883C'}
Unique for direct: set()
Timetree accumulated mutations do not match direct mutations for Uganda/UG009/2020:
Unique for timetree: {'C3037T', 'T14408C', 'T3037C', 'C14408T'}
Intersection: {'G28882A', 'G28580T', 'C241T', 'A23403G', 'T7207A', 'G10097A', 'T606C', 'C5997T', 'C21335T', 'G28881A', 'G28883C'}
Unique for direct: set()
Timetree accumulated mutations do not match direct mutations for Uganda/UG003/2020:
Unique for timetree: {'C3037T', 'T14408C', 'T3037C', 'C14408T'}
Intersection: {'G28882A', 'A23403G', 'G16330C', 'C23644T', 'C241T', 'G28881A', 'G28883C'}
Unique for direct: set()
Timetree accumulated mutations do not match direct mutations for Uganda/UG014/2020:
Unique for timetree: {'A10097G', 'G10097A'}
Intersection: {'G28882A', 'G28580T', 'T29693A', 'T7207A', 'A454G', 'A23403G', 'A532G', 'C23731T', 'G29828A', 'C14408T', 'C3037T', 'C241T', 'A22466G', 'A29723G', 'G28881A', 'G28883C', 'G29830A'}
Unique for direct: set()
Timetree accumulated mutations do not match direct mutations for Hungary/SRC-00817/2020:
Unique for timetree: {'C23731T', 'T23731C'}
Intersection: {'G28882A', 'C13536T', 'A23403G', 'C4002T', 'G10097A', 'C14408T', 'C241T', 'C3037T', 'C19718T', 'G28881A', 'G28883C'}
Unique for direct: set()
Debug, Node Path: NODE_0000100,NODE_0000343,NODE_0000376,NODE_0000341,NODE_0000342,NODE_0000394,NODE_0000323,NODE_0000348,NODE_0000739,NODE_0000873,NODE_0000881,NODE_0000734,NODE_0000896,Hungary/SRC-00817/2020
NODE_0000100:[]
[](NODE_0000079)
[](NODE_0000343)
NODE_0000343:[]
['A5084G', 'C28854T'](NODE_0000268)
[](NODE_0000376)
NODE_0000376:[]
['C17373T'](NODE_0000330)
['C241T', 'A23403G'](NODE_0000341)
NODE_0000341:['C241T', 'A23403G']
['T141C', 'C1473T', 'C16523A', 'G26144T'](Uganda/UG013/2020)
['C3037T'](NODE_0000342)
NODE_0000342:['C3037T']
['G27915T', 'G28580T'](NODE_0000347)
[](NODE_0000394)
NODE_0000394:[]
[](NODE_0000396)
[](NODE_0000323)
NODE_0000323:[]
[](Germany/BavPat2-ChVir984-ChVir1017/2020)
['C14408T'](NODE_0000348)
NODE_0000348:['C14408T']
['G10523C', 'C13424A', 'C14097T', 'G14580T', 'A19298G', 'A21163T', 'A21166T'](Morocco/6893/2020)
['A3337G', 'T3741C', 'T3762A', 'G21985A', 'T22308C', 'T26187C'](Finland/FIN-25/2020)
['C13620T', 'C21595T'](NODE_0000402)
['C23575T'](NODE_0000464)
['A187G'](NODE_0000473)
['A20268G'](NODE_0000380)
['A26530G'](NODE_0000359)
[](NODE_0000356)
['C15324T'](NODE_0000475)
['G25563T'](NODE_0000578)
['G28881A', 'G28882A', 'G28883C'](NODE_0000739)
NODE_0000739:['G28881A', 'G28882A', 'G28883C']
['A8945G', 'T24022C'](Peru/010/2020)
['G957A'](Nigeria/Lagos01/2020)
['A1103G', 'C1420T', 'G1743T', 'G2422C', 'T27153G', 'C27689T'](Portugal/PT0593/2020)
['C4010T', 'C18555T', 'A25983G'](Ecuador/HGSQ-USFQ-018/2020)
['A8449C', 'C13115T', 'C20234T'](SouthAfrica/KRISP-109/2020)
['A23116T'](Denmark/SSI-01/2020)
['G10427A', 'C14786T', 'C20389T'](Morocco/6888/2020)
['A5220T', 'C19170T', 'G19509A', 'T25461C'](SouthAfrica/KRISP-012/2020)
['C27046T'](NODE_0000800)
['G12832A'](NODE_0000815)
['A2869G', 'C17733T', 'C29095T'](NODE_0000783)
['T3037C'](NODE_0000801)
['C6573T', 'C25528T'](NODE_0000850)
['C3373A'](NODE_0000859)
['G10097A', 'C23731T'](NODE_0000873)
[](NODE_0000889)
NODE_0000873:['G10097A', 'C23731T']
['G28580T'](NODE_0000874)
['C4002T', 'C13536T'](NODE_0000881)
NODE_0000881:['C4002T', 'C13536T']
['G28209T'](Croatia/ZG-297-20/2020)
[](Denmark/SSI-05/2020)
[](Denmark/ALAB-SSI-248/2020)
['A6693G', 'C8386T'](NODE_0000908)
['G11083T'](NODE_0000915)
[](NODE_0000734)
NODE_0000734:[]
['C19718T'](NODE_0000896)
[](NODE_0000736)
NODE_0000896:['C19718T']
['T23731C'](Hungary/SRC-00817/2020)
['C22323T'](Austria/CeMM0067/2020)
Hungary/SRC-00817/2020:['T23731C']
# Go to augur output directory
source activate augur
python check_muts.py --tree africa/tree.nwk \
--reference reference.gb \
--mutations africa/nt_muts.json \
--alignment africa/subsampled_alignment.fasta >check_muts.txt
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment