Created
February 28, 2017 17:56
-
-
Save mossmatters/f00e675d471ce1b33a56455718b0a02e to your computer and use it in GitHub Desktop.
Automatic detection of branch length outliers on a gene tree
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
helptext='''This script identfies branch length outliers on a phylogeny. An outlier is a | |
branch with a length that exceeds a percentage of the maximum depth of the tree. | |
The input is a file containing one tree in newick format, and an optional file containing | |
a list of outgroup taxa (one per line). | |
The output will be an ASCII depiction of each branch with a length exceeding the threshold | |
(default 25% for ingroups, 75% for outgroups). A PNG file will also be generated for the | |
tree, with outgroup taxa in blue and branch length outliers in red. | |
Dependencies: | |
Python > 2.7 | |
ETE3 installed with all graphical dependencies | |
''' | |
#Given a tree, determine if there are long branch lengths | |
import sys, argparse, os | |
from ete3 import Tree,TreeStyle,TextFace,NodeStyle | |
def get_bad_nodes(t,inlen,outlen,leaflen,outgroups=None): | |
bad_nodes = [] | |
tree_depth = t.get_farthest_node()[1] | |
for node in t.traverse(): | |
if node.dist > tree_depth * inlen: | |
isOutgroup = True | |
for leaf in node.get_leaves(): | |
if leaf.name not in outgroups: | |
isOutgroup = False | |
if isOutgroup: | |
if node.dist > tree_depth * outlen: | |
print(node) | |
bad_nodes.append(node) | |
elif node.is_leaf(): | |
if node.dist > tree_depth * leaflen: | |
print(node) | |
bad_nodes.append(node) | |
else: | |
print(node) | |
bad_nodes.append(node) | |
return bad_nodes | |
def make_png(t,bad_nodes,png_name,outgroups=None): | |
for n in t.traverse(): | |
n.img_style["size"]=0 | |
if n in bad_nodes: | |
nstyle = NodeStyle() | |
nstyle["hz_line_color"] = "red" | |
nstyle["hz_line_width"] = 3 | |
n.set_style(nstyle) | |
if n.is_leaf(): | |
if n.name in outgroups: | |
name_face = TextFace(n.name, fgcolor="blue") | |
else: | |
name_face = TextFace(n.name, fgcolor="black") | |
n.add_face(name_face,0,"branch-right") | |
if len(bad_nodes) > 0: | |
ts = TreeStyle() | |
ts.show_leaf_name = False | |
gene_name = "_".join(sys.argv[1].split('.')[1].split("_")[:2]) | |
ts.title.add_face(TextFace(gene_name,fsize=15,bold=True),0) | |
my_png = t.render(png_name,tree_style=ts) | |
def main(): | |
parser = argparse.ArgumentParser(description=helptext,formatter_class=argparse.RawTextHelpFormatter) | |
parser.add_argument("treefile",help="File containing one tree in newick format") | |
parser.add_argument("--outgroups",help="File containing list of outgroup taxa, one per line") | |
parser.add_argument("--png",help="Name of png file, default is same as tree file name",default=None) | |
parser.add_argument("--inlen",help="Percentage of max tree depth for ingroup outliers default = %(default)s",default=0.25,type=float) | |
parser.add_argument("--outlen",help="Percentage of max tree depth for outgroup outliers default = %(default)s",default=0.75,type=float) | |
parser.add_argument("--leaflen",help="Percentage of max tree depth for leaf outliers default = %(default)s",default=0.25,type=float) | |
if len(sys.argv) == 1: | |
parser.print_help() | |
sys.exit(1) | |
args = parser.parse_args() | |
if os.path.isfile(args.treefile): | |
t = Tree(args.treefile) | |
else: | |
print("Treefile {} not found!\n".format(args.treefile)) | |
sys.exit(1) | |
if args.png: | |
if args.png.endswith(".png"): | |
png_name = args.png | |
else: | |
png_name = args.png + ".png" | |
else: | |
png_name = os.path.basename(args.treefile).split(".")[0] + ".png" | |
if args.outgroups: | |
outgroups = set([x.rstrip() for x in open(args.outgroups)]) | |
else: | |
outgroups = None | |
bad_nodes = get_bad_nodes(t,args.inlen,args.outlen,args.leaflen,outgroups=outgroups) | |
if len(bad_nodes) > 0: | |
make_png(t,bad_nodes,png_name,outgroups=outgroups) | |
#else: | |
#print("No outliers found for {}\n".format(os.path.basename(args.treefile))) | |
if __name__ == "__main__":main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Given a file
outgroups.txt
that has one outgroup name per line, and a filegene_names.txt
that has one gene name per line, the following GNU parallel command can process all gene trees in a directory:parallel --tag python brlen_outliers.py {}.tre --outgroups outgroups.txt :::: gene_names.txt > brlen_outliers.txt
The nodes flagged for being branch length outliers are printed to the screen, and are tagged with the gene name. A PNG file will be generated for each gene tree.