Skip to content

Instantly share code, notes, and snippets.

@huddlej
Last active March 2, 2024 02:45
Show Gist options
  • Save huddlej/5d7bd023d3807c698bd18c706974f2db to your computer and use it in GitHub Desktop.
Save huddlej/5d7bd023d3807c698bd18c706974f2db to your computer and use it in GitHub Desktop.
Command line tool to convert annotated phylogenetic trees nextstrain.org's JSON format to a tidy data frame of tree attributes

Convert Auspice tree JSON to a data frame and Newick tree

An example script to convert an Auspice tree JSON to a data frame and Newick tree for processing by downstream analyses.

Setup

Install Nextstrain.

Usage

Convert a tree JSON from the ncov workflow into a table using all attributes annotated on the root node of the tree by default. This command will only emit attributes for tips of the tree.

python3 auspice_tree_to_table.py \
  --tree auspice/ncov_gisaid_europe.json \
  --output-metadata ncov_gisaid_europe.tsv

Alternately, convert the JSON to a table requesting values for internal nodes and only for specific attributes of each node.

python3 auspice_tree_to_table.py \
  --tree auspice/ncov_gisaid_europe.json \
  --output-metadata ncov_gisaid_europe.tsv \
  --attributes num_date S1_mutations \
  --include-internal-nodes  

The output of the above command looks like this:

name	num_date	S1_mutations
NODE_0000000	2019.97	0.00
hCoV-19/Wuhan/Hu-1/2019	2019.98	0.00
NODE_0000023	2019.97	0.00
hCoV-19/Wuhan/WH01/2019	2019.98	0.00
hCoV-19/mink/Netherlands/NB02_06KS/2020	2020.33	2.00
NODE_0000002	2020.05	0.00
hCoV-19/Hangzhou/HZ-1/2020	2020.05	0.00
NODE_0000003	2020.17	0.00
hCoV-19/USA/CA-CZB-1092/2020	2020.33	0.00
hCoV-19/France/10060KV/2020	2020.17	0.00
NODE_0000005	2020.00	0.00
hCoV-19/Greece/226_35576/2020	2020.21	0.00
NODE_0000006	2020.16	0.00
hCoV-19/Spain/Valencia22/2020	2020.19	0.00
hCoV-19/Spain/Valencia002/2020	2020.17	0.00
hCoV-19/Spain/Valencia003/2020	2020.18	0.00
hCoV-19/Spain/Valencia8/2020	2020.17	0.00
NODE_0000010	2020.05	1.00
NODE_0000007	2020.08	1.00
hCoV-19/Germany/BavPat3-ChVir1020/2020	2020.08	1.00
hCoV-19/Germany/BavPat1-ChVir929/2020	2020.08	1.00
NODE_0000012	2020.10	1.00
hCoV-19/mink/Netherlands/NB01_01KS/2020	2020.33	1.00
NODE_0000019	2020.17	1.00
hCoV-19/France/10006HC/2020	2020.17	1.00
hCoV-19/France/10078MA/2020	2020.17	1.00
NODE_0000020	2020.15	1.00
hCoV-19/USA/FL_5125/2020	2020.16	1.00
NODE_0000011	2020.17	1.00
hCoV-19/Greece/227_35969/2020	2020.22	1.00
NODE_0000013	2020.18	1.00
hCoV-19/USA/NY-PV09116/2020	2020.21	1.00
hCoV-19/Greece/41_36861/2020	2020.24	1.00
NODE_0000014	2020.12	1.00
NODE_0000015	2020.16	1.00
hCoV-19/Greece/246_32206/2020	2020.19	1.00
hCoV-19/Greece/248_32261/2020	2020.19	1.00
NODE_0000016	2020.14	1.00
hCoV-19/France/10023FD/2020	2020.17	2.00
NODE_0000017	2020.17	1.00
hCoV-19/USA/WI-UW-268/2020	2020.26	1.00
hCoV-19/France/40003KA/2020	2020.17	1.00

Convert a tree JSON from the ncov workflow into both a metadata table and Newick file using all attributes annotated on the root node of the tree by default.

python3 auspice_tree_to_table.py \
  --tree auspice/ncov_gisaid_europe.json \
  --output-metadata ncov_gisaid_europe.tsv \
  --output-tree ncov_gisaid_europe.nwk

You can view the resulting outputs in auspice.us by first dragging on the Newick file and then dragging on the metadata table.

import argparse
from augur.utils import annotate_parents_for_tree
import Bio.Phylo
import json
import pandas as pd
import sys
def json_to_tree(json_dict, root=True, parent_cumulative_branch_length=None):
"""Returns a Bio.Phylo tree corresponding to the given JSON dictionary exported
by `tree_to_json`.
Assigns links back to parent nodes for the root of the tree.
Test opening a JSON from augur export v1.
>>> import json
>>> json_fh = open("tests/data/json_tree_to_nexus/flu_h3n2_ha_3y_tree.json", "r")
>>> json_dict = json.load(json_fh)
>>> tree = json_to_tree(json_dict)
>>> tree.name
'NODE_0002020'
>>> len(tree.clades)
2
>>> tree.clades[0].name
'NODE_0001489'
>>> hasattr(tree, "attr")
True
>>> "dTiter" in tree.attr
True
>>> tree.clades[0].parent.name
'NODE_0002020'
>>> tree.clades[0].branch_length > 0
True
Test opening a JSON from augur export v2.
>>> json_fh = open("tests/data/zika.json", "r")
>>> json_dict = json.load(json_fh)
>>> tree = json_to_tree(json_dict)
>>> hasattr(tree, "name")
True
>>> len(tree.clades) > 0
True
>>> tree.clades[0].branch_length > 0
True
Branch lengths should be the length of the branch to each node and not the
length from the root. The cumulative branch length from the root gets its
own attribute.
>>> tip = [tip for tip in tree.find_clades(terminal=True) if tip.name == "USA/2016/FLWB042"][0]
>>> round(tip.cumulative_branch_length, 6)
0.004747
>>> round(tip.branch_length, 6)
0.000186
"""
# Check for v2 JSON which has combined metadata and tree data.
if root and "meta" in json_dict and "tree" in json_dict:
json_dict = json_dict["tree"]
node = Bio.Phylo.Newick.Clade()
# v1 and v2 JSONs use different keys for strain names.
if "name" in json_dict:
node.name = json_dict["name"]
else:
node.name = json_dict["strain"]
# Assign all non-children attributes.
for attr, value in json_dict.items():
if attr != "children":
setattr(node, attr, value)
# Only v1 JSONs support a single `attr` attribute.
if hasattr(node, "attr"):
node.numdate = node.attr.get("num_date")
node.cumulative_branch_length = node.attr.get("div")
if "translations" in node.attr:
node.translations = node.attr["translations"]
elif hasattr(node, "node_attrs"):
node.cumulative_branch_length = node.node_attrs.get("div")
node.branch_length = 0.0
if parent_cumulative_branch_length is not None and hasattr(node, "cumulative_branch_length"):
node.branch_length = node.cumulative_branch_length - parent_cumulative_branch_length
if "children" in json_dict:
# Recursively add children to the current node.
node.clades = [
json_to_tree(
child,
root=False,
parent_cumulative_branch_length=node.cumulative_branch_length
)
for child in json_dict["children"]
]
if root:
node = annotate_parents_for_tree(node)
return node
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--tree", required=True, help="auspice tree JSON")
parser.add_argument("--output-metadata", help="tab-delimited file of attributes per node of the given tree")
parser.add_argument("--output-tree", help="Newick version of the given tree")
parser.add_argument("--include-internal-nodes", action="store_true", help="include data from internal nodes in metadata output")
parser.add_argument("--attributes", nargs="+", help="names of attributes to export from the given tree in the metadata output")
args = parser.parse_args()
# Load tree from JSON.
with open(args.tree, "r", encoding="utf-8") as fh:
tree_json = json.load(fh)
tree = json_to_tree(tree_json)
# Output the tree, if requested.
if args.output_tree:
Bio.Phylo.write(
tree,
args.output_tree,
"newick",
)
if args.output_metadata:
# Collect attributes per node from the tree to export.
records = []
if args.attributes:
attributes = args.attributes
else:
attributes = sorted(
set(tree.root.node_attrs.keys()) |
set(tree.root.branch_attrs.keys())
)
for node in tree.find_clades():
if node.is_terminal() or args.include_internal_nodes:
record = {
"name": node.name
}
for attribute in attributes:
if attribute in node.node_attrs:
value = node.node_attrs[attribute]
elif attribute in node.branch_attrs:
value = node.branch_attrs[attribute]
else:
print(f"Could not find attribute '{attribute}' for node '{node.name}'.", file=sys.stderr)
value = None
if value is not None:
if isinstance(value, dict) and "value" in value:
value = value["value"]
record[attribute] = value
records.append(record)
# Convert records to a data frame and save as a tab-delimited file.
df = pd.DataFrame(records)
df.to_csv(
args.output_metadata,
sep="\t",
header=True,
index=False,
columns=["name"] + list(attributes),
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment