Last active
June 27, 2020 21:10
-
-
Save biocyberman/13cb3ca5fdd055bf213711b93e9e6b81 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Compare mutations from augur vs direct naive calcuation | |
""" | |
import sys | |
from Bio import Phylo, SeqIO | |
import numpy as np | |
from augur.translate import construct_mut | |
from augur.utils import read_node_data | |
from augur.clades import is_node_in_clade, get_reference_sequence_from_root_node | |
import argparse | |
from pathlib import Path | |
def check_muts_with_aln(tree, aln, all_muts, ref): | |
direct_mutations = {} | |
# prepare the nodes | |
for node in tree.get_nonterminals(): | |
for c in node: | |
c.up = node | |
c.up.clade = "" | |
c.up.level = 0 | |
c.up.accumulated_muts = set() | |
tree.root.up = None | |
tree.root.sequences = {'nuc': {}} | |
# Compare mutations between those computed by treetime and those computed directly (still use treetime ancestral sequence) | |
for node in tree.find_clades(order='preorder'): | |
if node.is_terminal(): | |
node_muts = {construct_mut(a, int(pos+1), d) for pos, (a,d) in | |
enumerate(zip(ref['nuc'], aln[node.name])) | |
if a!=d and a != 'N' and d != 'N' } | |
direct_mutations[node.name] = node_muts | |
if node.up: | |
node.sequences = {gene: muts.copy() for gene, muts in node.up.sequences.items()} | |
node.accumulated_muts = node.up.accumulated_muts.union(all_muts[node.name]['muts']) | |
acc_muts = node.accumulated_muts | |
if not acc_muts.issubset(node_muts) and node.is_terminal(): | |
print(f'Timetree accumulated mutations do not match direct mutations for {node.name}:\n' | |
f'Unique for timetree: {acc_muts.difference(node_muts)}\n' | |
f'Intersection: {acc_muts.intersection(node_muts)}\n' | |
f'Unique for direct: {node_muts.difference(acc_muts)}\n') | |
nn = node.name | |
if nn == "Hungary/SRC-00817/2020": | |
p = [n for n in tree.get_path(node)] | |
names = [n.name for n in p] | |
tmp = {n: all_muts[n]['muts'] for n in names } | |
print(f"Debug, Node Path: {','.join(names)}\n") | |
for n in p: | |
print(f"{n.name}:{all_muts[n.name]['muts']}") | |
for c in n: | |
print(f"\t\t{all_muts[c.name]['muts']}({c.name})") | |
return direct_mutations | |
def check_muts(all_muts, tree, ref=None): | |
direct_mutations = {} | |
# prepare the nodes | |
for node in tree.get_nonterminals(): | |
for c in node: | |
c.up = node | |
c.up.clade = "" | |
c.up.level = 0 | |
c.up.accumulated_muts = set() | |
tree.root.up = None | |
tree.root.sequences = {'nuc': {}} | |
# Compare mutations between those computed by treetime and those computed directly (still use treetime ancestral sequence) | |
for node in tree.find_clades(order='preorder'): | |
node_muts = {construct_mut(a, int(pos+1), d) for pos, (a,d) in | |
enumerate(zip(ref['nuc'], all_muts[node.name]['sequence'])) | |
if a!=d and a != 'N' and d != 'N' } | |
direct_mutations[node.name] = node_muts | |
if node.up: | |
node.sequences = {gene: muts.copy() for gene, muts in node.up.sequences.items()} | |
node.accumulated_muts = node.up.accumulated_muts.union(all_muts[node.name]['muts']) | |
acc_muts = node.accumulated_muts | |
if not acc_muts.issubset(node_muts) and node.is_terminal(): | |
print(f'Timetree accumulated mutations do not match direct mutations for {node.name}:\n' | |
f'Unique for timetree: {acc_muts.difference(node_muts)}\n' | |
f'Intersection: {acc_muts.intersection(node_muts)}\n' | |
f'Unique for direct: {node_muts.difference(acc_muts)}\n') | |
nn = node.name | |
if nn == "Hungary/SRC-00817/2020": | |
p = [n for n in tree.get_path(node)] | |
names = [n.name for n in p] | |
tmp = {n: all_muts[n]['muts'] for n in names } | |
print(f"Debug, Node Path: {','.join(names)}\n") | |
for n in p: | |
print(f"{n.name}:{all_muts[n.name]['muts']}") | |
for c in n: | |
print(f"\t\t{all_muts[c.name]['muts']}({c.name})") | |
return direct_mutations | |
def register_arguments(parser): | |
parser.add_argument('-t', '--tree', help="prebuilt Newick -- no tree will be built if provided") | |
parser.add_argument('-m', '--mutations', nargs='+', | |
help='JSON(s) containing ancestral and tip nucleotide and/or amino-acid mutations ') | |
parser.add_argument('-r', '--reference', | |
help='fasta files containing reference and tip nucleotide and/or amino-acid sequences ') | |
parser.add_argument('-a', '--alignment', help="alignment in fasta format") | |
def run(args): | |
## read tree and data, if reading data fails, return with error code | |
tree = Phylo.read(args.tree, 'newick') | |
node_data = read_node_data(args.mutations, args.tree) | |
refseq = {} | |
refseq['nuc'] = SeqIO.read(args.reference, 'genbank').seq | |
if node_data is None: | |
print("ERROR: could not read node data (incl sequences)") | |
return 1 | |
all_muts = node_data['nodes'] | |
from Bio import AlignIO | |
alignment = AlignIO.read(open(args.alignment), "fasta") | |
aln_seq = {} | |
for s in alignment: | |
aln_seq[s.id] = s.seq | |
# extract reference sequences from the root node entry in the mutation json | |
# if this doesn't exist, it will complain but not error. | |
ref = get_reference_sequence_from_root_node(all_muts, tree.root.name) | |
# assert(refseq['nuc'] == "".join(ref['nuc'])) | |
# aln_mutations = check_muts_with_aln(tree, aln_seq, all_muts, refseq) | |
direct_mutations = check_muts(all_muts, tree, refseq) | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser( | |
prog="check-muts", | |
description="Test mutations") | |
args = register_arguments(parser) | |
exit(run(parser.parse_args())) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Timetree accumulated mutations do not match direct mutations for Singapore/15/2020: | |
Unique for timetree: {'C28144-', 'T28144C'} | |
Intersection: {'T27931-', 'T28039-', 'G27967-', 'A27979-', 'G28075-', 'T28046-', 'G27955-', 'A28213-', 'G28193-', 'A28152-', 'A28096-', 'G28122-', 'G27906-', 'C27884-', 'G27890-', 'G28179-', 'C28099-', 'C27923-', 'T28067-', 'T28150-', 'T28217-', 'G28038-', 'T28010-', 'T28215-', 'C27998-', 'T28208-', 'A28024-', 'A28097-', 'A27994-', 'T28061-', 'T27901-', 'T27958-', 'T28004-', 'G28003-', 'T28200-', 'A28070-', 'C28164-', 'C28146-', 'T28157-', 'G28212-', 'G28191-', 'A28025-', 'A28222-', 'A27886-', 'C28057-', 'C28115-', 'A27938-', 'A27872-', 'T28082-', 'G28183-', 'C28045-', 'A28207-', 'A28018-', 'C28147-', 'G27936-', 'T27895-', 'G28221-', 'T27910-', 'C27970-', 'G28085-', 'T27929-', 'T28216-', 'A28210-', 'T28014-', 'C27864-', 'A27983-', 'T27902-', 'T28029-', 'T27866-', 'G28123-', 'A28133-', 'C28138-', 'A28175-', 'A28081-', 'C27999-', 'A27871-', 'A28168-', 'G28044-', 'C28153-', 'T27953-', 'T27869-', 'T28020-', 'G28188-', 'A27892-', 'A27927-', 'A28058-', 'A28064-', 'A27961-', 'T27885-', 'A27950-', 'A27924-', 'C27934-', 'T28229-', 'T28017-', 'G27882-', 'T28149-', 'G28162-', 'C28227-', 'T28071-', 'T28098-', 'T28088-', 'T28063-', 'G28223-', 'A27980-', 'C27978-', 'T27908-', 'G28198-', 'C27942-', 'A27859-', 'C27881-', 'A27921-', 'A28008-', 'T27907-', 'C28194-', 'A28155-', 'T28062-', 'G28166-', 'T28137-', 'A28051-', 'G28079-', 'A27873-', 'A28225-', 'T28053-', 'A27862-', 'C27975-', 'T28143-', 'A27932-', 'T27988-', 'G27990-', 'A28035-', 'A27949-', 'A27853-', 'T27863-', 'A27973-', 'T27909-', 'A28065-', 'G28028-', 'T27995-', 'C27928-', 'G28167-', 'A28211-', 'T28072-', 'C28163-', 'A28165-', 'A28040-', 'T28172-', 'A28182-', 'T28031-', 'T28007-', 'T28161-', 'A27947-', 'T28114-', 'T28160-', 'A27865-', 'T28022-', 'T28074-', 'T28206-', 'C27893-', 'G28080-', 'T28196-', 'T27935-', 'A27959-', 'A28037-', 'C28059-', 'C28076-', 'T28224-', 'G28180-', 'G28041-', 'C27879-', 'A28126-', 'T28105-', 'C28171-', 'A28159-', 'C27972-', 'A27887-', 'G28077-', 'T28091-', 'C28011-', 'T28177-', 'T28135-', 'A27969-', 'A28084-', 'T28128-', 'A28158-', 'A27918-', 'G28073-', 'G27896-', 'T28002-', 'T28127-', 'A27989-', 'T28203-', 'C28000-', 'T27991-', 'T28110-', 'G27933-', 'C27964-', 'T27856-', 'T28176-', 'T28140-', 'A28108-', 'A28069-', 'T27912-', 'A27926-', 'A27899-', 'G27948-', 'C27944-', 'C28112-', 'A27848-', 'C27903-', 'A27965-', 'G28042-', 'A28145-', 'G28109-', 'T28151-', 'A27880-', 'T27984-', 'C28205-', 'T28118-', 'C28201-', 'G27861-', 'A28095-', 'A28052-', 'T28120-', 'T28124-', 'G28195-', 'C28102-', 'G27996-', 'T27963-', 'T27851-', 'A28104-', 'C28132-', 'G28001-', 'C28006-', 'T28192-', 'G27870-', 'C28214-', 'T27913-', 'T27875-', 'T28204-', 'T27977-', 'T28197-', 'A27917-', 'T28218-', 'T27971-', 'C28170-', 'C28005-', 'A28030-', 'A28047-', 'C28060-', 'A28125-', 'T27986-', 'T28066-', 'G28089-', 'T28189-', 'A28050-', 'C28139-', 'T28033-', 'C27883-', 'A27868-', 'G27916-', 'T27968-', 'C27849-', 'T28136-', 'T27957-', 'C27982-', 'C28185-', 'G28036-', 'T28130-', 'G27930-', 'T27876-', 'A27888-', 'C27911-', 'A28154-', 'A28174-', 'T27900-', 'T28019-', 'A27898-', 'C28054-', 'T27905-', 'A28055-', 'A27974-', 'G28083-', 'A28129-', 'A27914-', 'G27962-', 'G27877-', 'G28086-', 'T27951-', 'T27904-', 'T28156-', 'T27919-', 'A27976-', 'A27894-', 'T28094-', 'A28117-', 'C8782T', 'G28090-', 'A28228-', 'T27956-', 'C28016-', 'T27941-', 'C28107-', 'C27874-', 'T28078-', 'A28169-', 'G28048-', 'T28009-', 'C28087-', 'G28141-', 'C28021-', 'T28092-', 'T28034-', 'T27992-', 'G28178-', 'T27878-', 'C28013-', 'T28186-', 'G28134-', 'C27945-', 'G27993-', 'G27852-', 'G27952-', 'C28101-', 'T27939-', 'T28184-', 'C28103-', 'T28226-', 'A28220-', 'A27897-', 'C27855-', 'T28026-', 'T28106-', 'C27858-', 'G28056-', 'T27940-', 'T28199-', 'A28012-', 'A27854-', 'T27850-', 'T28015-', 'A28111-', 'A27985-', 'T28142-', 'C28093-', 'C28121-', 'A28100-', 'G27987-', 'A27943-', 'A28023-', 'A28173-', 'C27960-', 'T27966-', 'A28131-', 'T28219-', 'A27946-', 'C27925-', 'G28068-', 'A28113-', 'C27920-', 'A27891-', 'G27915-', 'G28027-', 'A28119-', 'C27937-', 'A27860-', 'C27889-', 'G27857-', 'A28190-', 'A28032-', 'T28181-', 'A28049-', 'T27922-', 'A27997-', 'G28209-', 'G28202-', 'T28148-', 'A27954-', 'A27867-', 'A28043-', 'C27981-', 'G28116-', 'T28187-'} | |
Unique for direct: {'T28144-'} | |
Timetree accumulated mutations do not match direct mutations for UnitedArabEmirates/L4682/2020: | |
Unique for timetree: {'G11083T', 'T11083G'} | |
Intersection: {'T28688C', 'G1397A', 'G2246A', 'G29742T', 'C9246T'} | |
Unique for direct: set() | |
Timetree accumulated mutations do not match direct mutations for Iran/HGRC-2-2162/2020: | |
Unique for timetree: {'G11083T', 'T11083G'} | |
Intersection: {'G20887A', 'T28688C', 'C21627T', 'G1397A', 'C22735T', 'T8707C', 'G29742T', 'C28830T'} | |
Unique for direct: set() | |
Timetree accumulated mutations do not match direct mutations for Iran/KHGRC-2-2162/2020: | |
Unique for timetree: {'G11083T', 'T11083G'} | |
Intersection: {'G20887A', 'T28688C', 'C21627T', 'G1397A', 'C22735T', 'T8707C', 'G29742T', 'C28830T'} | |
Unique for direct: set() | |
Timetree accumulated mutations do not match direct mutations for Kazakhstan/NCB-1/2020: | |
Unique for timetree: {'G29742T', 'T29742G'} | |
Intersection: {'G11083T', 'T28688C', 'T16272G', 'T3333-', 'C884T', 'C4234T', 'G1397A', 'G3335-', 'G8653T', 'T3334-', 'C17141A'} | |
Unique for direct: set() | |
Timetree accumulated mutations do not match direct mutations for UnitedArabEmirates/L0231/2020: | |
Unique for timetree: {'G11083T', 'T11083G'} | |
Intersection: {'T28688C', 'C884T', 'G27915T', 'C28868T', 'G1397A', 'A19073G', 'G8653T', 'G29742T', 'C29077T'} | |
Unique for direct: set() | |
Timetree accumulated mutations do not match direct mutations for SriLanka/COV486/2020: | |
Unique for timetree: {'G11083T', 'T11083G'} | |
Intersection: {'C14805T', 'C1762T', 'T17247C', 'G26144T'} | |
Unique for direct: set() | |
Timetree accumulated mutations do not match direct mutations for Philippines/RITM-03/2020: | |
Unique for timetree: {'G11083T', 'T11083G'} | |
Intersection: {'A6659G', 'C23929T', 'C28311T', 'C13730T', 'C26425T', 'G27463T', 'G12793T', 'C6312A'} | |
Unique for direct: set() | |
Timetree accumulated mutations do not match direct mutations for DRC/3451/2020: | |
Unique for timetree: {'T241-', 'C241T'} | |
Intersection: {'C583T', 'A23403G', 'C14408T', 'G4180T', 'T16983A', 'C3037T', 'G28202A', 'C15324T'} | |
Unique for direct: {'C241-'} | |
Timetree accumulated mutations do not match direct mutations for Brazil/CV4/2020: | |
Unique for timetree: {'T241C', 'C241T'} | |
Intersection: {'A23403G', 'G25563T', 'C18877T', 'C14408T', 'C3037T'} | |
Unique for direct: set() | |
Timetree accumulated mutations do not match direct mutations for Austria/Graz-MUG6/2020: | |
Unique for timetree: {'C3037T', 'T3037C'} | |
Intersection: {'G24197T', 'A23403G', 'G25563T', 'C18877T', 'G15957T', 'C14408T', 'C241T', 'C26735T'} | |
Unique for direct: set() | |
Timetree accumulated mutations do not match direct mutations for Austria/Graz-MUG9/2020: | |
Unique for timetree: {'C3037T', 'T3037C'} | |
Intersection: {'G24197T', 'A23403G', 'G25563T', 'C18877T', 'G15957T', 'C14408T', 'C241T', 'C26735T'} | |
Unique for direct: set() | |
Timetree accumulated mutations do not match direct mutations for Austria/Graz-MUG8/2020: | |
Unique for timetree: {'C3037T', 'T3037C'} | |
Intersection: {'G24197T', 'A23403G', 'G25563T', 'C18877T', 'G15957T', 'C14408T', 'C241T', 'C26735T'} | |
Unique for direct: set() | |
Timetree accumulated mutations do not match direct mutations for BosniaandHerzegovina/01-Livno/2020: | |
Unique for timetree: {'C3037T', 'T3037C', 'T241C', 'C241T'} | |
Intersection: {'G28882A', 'C25614T', 'C29838-', 'A6867G', 'T29840-', 'T29834-', 'G29841-', 'A29844-', 'T25533A', 'C29836-', 'T29845-', 'T29847-', 'T29842-', 'T26300C', 'T29832-', 'C29831-', 'G28881A', 'T29848-', 'T29846-', 'G29843-', 'A23403G', 'C29837-', 'C29835-', 'A29833-', 'C14408T', 'A29839-', 'G28883C'} | |
Unique for direct: set() | |
Timetree accumulated mutations do not match direct mutations for Uganda/UG009/2020: | |
Unique for timetree: {'C3037T', 'T14408C', 'T3037C', 'C14408T'} | |
Intersection: {'G28882A', 'G28580T', 'C241T', 'A23403G', 'T7207A', 'G10097A', 'T606C', 'C5997T', 'C21335T', 'G28881A', 'G28883C'} | |
Unique for direct: set() | |
Timetree accumulated mutations do not match direct mutations for Uganda/UG003/2020: | |
Unique for timetree: {'C3037T', 'T14408C', 'T3037C', 'C14408T'} | |
Intersection: {'G28882A', 'A23403G', 'G16330C', 'C23644T', 'C241T', 'G28881A', 'G28883C'} | |
Unique for direct: set() | |
Timetree accumulated mutations do not match direct mutations for Uganda/UG014/2020: | |
Unique for timetree: {'A10097G', 'G10097A'} | |
Intersection: {'G28882A', 'G28580T', 'T29693A', 'T7207A', 'A454G', 'A23403G', 'A532G', 'C23731T', 'G29828A', 'C14408T', 'C3037T', 'C241T', 'A22466G', 'A29723G', 'G28881A', 'G28883C', 'G29830A'} | |
Unique for direct: set() | |
Timetree accumulated mutations do not match direct mutations for Hungary/SRC-00817/2020: | |
Unique for timetree: {'C23731T', 'T23731C'} | |
Intersection: {'G28882A', 'C13536T', 'A23403G', 'C4002T', 'G10097A', 'C14408T', 'C241T', 'C3037T', 'C19718T', 'G28881A', 'G28883C'} | |
Unique for direct: set() | |
Debug, Node Path: NODE_0000100,NODE_0000343,NODE_0000376,NODE_0000341,NODE_0000342,NODE_0000394,NODE_0000323,NODE_0000348,NODE_0000739,NODE_0000873,NODE_0000881,NODE_0000734,NODE_0000896,Hungary/SRC-00817/2020 | |
NODE_0000100:[] | |
[](NODE_0000079) | |
[](NODE_0000343) | |
NODE_0000343:[] | |
['A5084G', 'C28854T'](NODE_0000268) | |
[](NODE_0000376) | |
NODE_0000376:[] | |
['C17373T'](NODE_0000330) | |
['C241T', 'A23403G'](NODE_0000341) | |
NODE_0000341:['C241T', 'A23403G'] | |
['T141C', 'C1473T', 'C16523A', 'G26144T'](Uganda/UG013/2020) | |
['C3037T'](NODE_0000342) | |
NODE_0000342:['C3037T'] | |
['G27915T', 'G28580T'](NODE_0000347) | |
[](NODE_0000394) | |
NODE_0000394:[] | |
[](NODE_0000396) | |
[](NODE_0000323) | |
NODE_0000323:[] | |
[](Germany/BavPat2-ChVir984-ChVir1017/2020) | |
['C14408T'](NODE_0000348) | |
NODE_0000348:['C14408T'] | |
['G10523C', 'C13424A', 'C14097T', 'G14580T', 'A19298G', 'A21163T', 'A21166T'](Morocco/6893/2020) | |
['A3337G', 'T3741C', 'T3762A', 'G21985A', 'T22308C', 'T26187C'](Finland/FIN-25/2020) | |
['C13620T', 'C21595T'](NODE_0000402) | |
['C23575T'](NODE_0000464) | |
['A187G'](NODE_0000473) | |
['A20268G'](NODE_0000380) | |
['A26530G'](NODE_0000359) | |
[](NODE_0000356) | |
['C15324T'](NODE_0000475) | |
['G25563T'](NODE_0000578) | |
['G28881A', 'G28882A', 'G28883C'](NODE_0000739) | |
NODE_0000739:['G28881A', 'G28882A', 'G28883C'] | |
['A8945G', 'T24022C'](Peru/010/2020) | |
['G957A'](Nigeria/Lagos01/2020) | |
['A1103G', 'C1420T', 'G1743T', 'G2422C', 'T27153G', 'C27689T'](Portugal/PT0593/2020) | |
['C4010T', 'C18555T', 'A25983G'](Ecuador/HGSQ-USFQ-018/2020) | |
['A8449C', 'C13115T', 'C20234T'](SouthAfrica/KRISP-109/2020) | |
['A23116T'](Denmark/SSI-01/2020) | |
['G10427A', 'C14786T', 'C20389T'](Morocco/6888/2020) | |
['A5220T', 'C19170T', 'G19509A', 'T25461C'](SouthAfrica/KRISP-012/2020) | |
['C27046T'](NODE_0000800) | |
['G12832A'](NODE_0000815) | |
['A2869G', 'C17733T', 'C29095T'](NODE_0000783) | |
['T3037C'](NODE_0000801) | |
['C6573T', 'C25528T'](NODE_0000850) | |
['C3373A'](NODE_0000859) | |
['G10097A', 'C23731T'](NODE_0000873) | |
[](NODE_0000889) | |
NODE_0000873:['G10097A', 'C23731T'] | |
['G28580T'](NODE_0000874) | |
['C4002T', 'C13536T'](NODE_0000881) | |
NODE_0000881:['C4002T', 'C13536T'] | |
['G28209T'](Croatia/ZG-297-20/2020) | |
[](Denmark/SSI-05/2020) | |
[](Denmark/ALAB-SSI-248/2020) | |
['A6693G', 'C8386T'](NODE_0000908) | |
['G11083T'](NODE_0000915) | |
[](NODE_0000734) | |
NODE_0000734:[] | |
['C19718T'](NODE_0000896) | |
[](NODE_0000736) | |
NODE_0000896:['C19718T'] | |
['T23731C'](Hungary/SRC-00817/2020) | |
['C22323T'](Austria/CeMM0067/2020) | |
Hungary/SRC-00817/2020:['T23731C'] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Go to augur output directory | |
source activate augur | |
python check_muts.py --tree africa/tree.nwk \ | |
--reference reference.gb \ | |
--mutations africa/nt_muts.json \ | |
--alignment africa/subsampled_alignment.fasta >check_muts.txt |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment