Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Automatic detection of branch length outliers on a gene tree
#!/usr/bin/env python
helptext='''This script identfies branch length outliers on a phylogeny. An outlier is a
branch with a length that exceeds a percentage of the maximum depth of the tree.
The input is a file containing one tree in newick format, and an optional file containing
a list of outgroup taxa (one per line).
The output will be an ASCII depiction of each branch with a length exceeding the threshold
(default 25% for ingroups, 75% for outgroups). A PNG file will also be generated for the
tree, with outgroup taxa in blue and branch length outliers in red.
Python > 2.7
ETE3 installed with all graphical dependencies
#Given a tree, determine if there are long branch lengths
import sys, argparse, os
from ete3 import Tree,TreeStyle,TextFace,NodeStyle
def get_bad_nodes(t,inlen,outlen,leaflen,outgroups=None):
bad_nodes = []
tree_depth = t.get_farthest_node()[1]
for node in t.traverse():
if node.dist > tree_depth * inlen:
isOutgroup = True
for leaf in node.get_leaves():
if not in outgroups:
isOutgroup = False
if isOutgroup:
if node.dist > tree_depth * outlen:
elif node.is_leaf():
if node.dist > tree_depth * leaflen:
return bad_nodes
def make_png(t,bad_nodes,png_name,outgroups=None):
for n in t.traverse():
if n in bad_nodes:
nstyle = NodeStyle()
nstyle["hz_line_color"] = "red"
nstyle["hz_line_width"] = 3
if n.is_leaf():
if in outgroups:
name_face = TextFace(, fgcolor="blue")
name_face = TextFace(, fgcolor="black")
if len(bad_nodes) > 0:
ts = TreeStyle()
ts.show_leaf_name = False
gene_name = "_".join(sys.argv[1].split('.')[1].split("_")[:2])
my_png = t.render(png_name,tree_style=ts)
def main():
parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("treefile",help="File containing one tree in newick format")
parser.add_argument("--outgroups",help="File containing list of outgroup taxa, one per line")
parser.add_argument("--png",help="Name of png file, default is same as tree file name",default=None)
parser.add_argument("--inlen",help="Percentage of max tree depth for ingroup outliers default = %(default)s",default=0.25,type=float)
parser.add_argument("--outlen",help="Percentage of max tree depth for outgroup outliers default = %(default)s",default=0.75,type=float)
parser.add_argument("--leaflen",help="Percentage of max tree depth for leaf outliers default = %(default)s",default=0.25,type=float)
if len(sys.argv) == 1:
args = parser.parse_args()
if os.path.isfile(args.treefile):
t = Tree(args.treefile)
print("Treefile {} not found!\n".format(args.treefile))
if args.png:
if args.png.endswith(".png"):
png_name = args.png
png_name = args.png + ".png"
png_name = os.path.basename(args.treefile).split(".")[0] + ".png"
if args.outgroups:
outgroups = set([x.rstrip() for x in open(args.outgroups)])
outgroups = None
bad_nodes = get_bad_nodes(t,args.inlen,args.outlen,args.leaflen,outgroups=outgroups)
if len(bad_nodes) > 0:
#print("No outliers found for {}\n".format(os.path.basename(args.treefile)))
if __name__ == "__main__":main()

This comment has been minimized.

Copy link
Owner Author

@mossmatters mossmatters commented Feb 28, 2017

Given a file outgroups.txtthat has one outgroup name per line, and a file gene_names.txt that has one gene name per line, the following GNU parallel command can process all gene trees in a directory:

parallel --tag python {}.tre --outgroups outgroups.txt :::: gene_names.txt > brlen_outliers.txt

The nodes flagged for being branch length outliers are printed to the screen, and are tagged with the gene name. A PNG file will be generated for each gene tree.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment