Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Automatic detection of branch length outliers on a gene tree
#!/usr/bin/env python
helptext='''This script identfies branch length outliers on a phylogeny. An outlier is a
branch with a length that exceeds a percentage of the maximum depth of the tree.
The input is a file containing one tree in newick format, and an optional file containing
a list of outgroup taxa (one per line).
The output will be an ASCII depiction of each branch with a length exceeding the threshold
(default 25% for ingroups, 75% for outgroups). A PNG file will also be generated for the
tree, with outgroup taxa in blue and branch length outliers in red.
Python > 2.7
ETE3 installed with all graphical dependencies
#Given a tree, determine if there are long branch lengths
import sys, argparse, os
from ete3 import Tree,TreeStyle,TextFace,NodeStyle
def get_bad_nodes(t,inlen,outlen,leaflen,outgroups=None):
bad_nodes = []
tree_depth = t.get_farthest_node()[1]
for node in t.traverse():
if node.dist > tree_depth * inlen:
isOutgroup = True
for leaf in node.get_leaves():
if not in outgroups:
isOutgroup = False
if isOutgroup:
if node.dist > tree_depth * outlen:
elif node.is_leaf():
if node.dist > tree_depth * leaflen:
return bad_nodes
def make_png(t,bad_nodes,png_name,outgroups=None):
for n in t.traverse():
if n in bad_nodes:
nstyle = NodeStyle()
nstyle["hz_line_color"] = "red"
nstyle["hz_line_width"] = 3
if n.is_leaf():
if in outgroups:
name_face = TextFace(, fgcolor="blue")
name_face = TextFace(, fgcolor="black")
if len(bad_nodes) > 0:
ts = TreeStyle()
ts.show_leaf_name = False
gene_name = "_".join(sys.argv[1].split('.')[1].split("_")[:2])
my_png = t.render(png_name,tree_style=ts)
def main():
parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("treefile",help="File containing one tree in newick format")
parser.add_argument("--outgroups",help="File containing list of outgroup taxa, one per line")
parser.add_argument("--png",help="Name of png file, default is same as tree file name",default=None)
parser.add_argument("--inlen",help="Percentage of max tree depth for ingroup outliers default = %(default)s",default=0.25,type=float)
parser.add_argument("--outlen",help="Percentage of max tree depth for outgroup outliers default = %(default)s",default=0.75,type=float)
parser.add_argument("--leaflen",help="Percentage of max tree depth for leaf outliers default = %(default)s",default=0.25,type=float)
if len(sys.argv) == 1:
args = parser.parse_args()
if os.path.isfile(args.treefile):
t = Tree(args.treefile)
print("Treefile {} not found!\n".format(args.treefile))
if args.png:
if args.png.endswith(".png"):
png_name = args.png
png_name = args.png + ".png"
png_name = os.path.basename(args.treefile).split(".")[0] + ".png"
if args.outgroups:
outgroups = set([x.rstrip() for x in open(args.outgroups)])
outgroups = None
bad_nodes = get_bad_nodes(t,args.inlen,args.outlen,args.leaflen,outgroups=outgroups)
if len(bad_nodes) > 0:
#print("No outliers found for {}\n".format(os.path.basename(args.treefile)))
if __name__ == "__main__":main()

This comment has been minimized.

Copy link
Owner Author

commented Feb 28, 2017

Given a file outgroups.txtthat has one outgroup name per line, and a file gene_names.txt that has one gene name per line, the following GNU parallel command can process all gene trees in a directory:

parallel --tag python {}.tre --outgroups outgroups.txt :::: gene_names.txt > brlen_outliers.txt

The nodes flagged for being branch length outliers are printed to the screen, and are tagged with the gene name. A PNG file will be generated for each gene tree.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.