Created
March 1, 2013 16:35
-
-
Save avrilcoghlan/5065876 to your computer and use it in GitHub Desktop.
Perl script that connects to the TreeFam mysql database, and prints out of a list of the TreeFam trees that contain the different possible topologies with respect to the relationship between man, dog, and mouse.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script treefam_dog_man_mouse.pl | |
# Written by Avril Coghlan (alc@sanger.ac.uk). | |
# 5-Apr-06. | |
# | |
# For the TreeFam project. | |
# | |
# This perl script connects to the MYSQL database of TreeFam families and | |
# prints out a list of the TreeFam trees that contain different topologies | |
# with respect to dog, man and mouse. | |
# | |
# The command-line format is: | |
# % perl <treefam_dog_man_mouse.pl> tree_type db | |
# where tree_type is the type of tree to look at (CLEAN/FULL/SEED/all), | |
# db is the TreeFam database to look at (A/B/all). | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 2) | |
{ | |
print "Usage of treefam_dog_man_mouse.pl <tree_type> <db>\n\n"; | |
print "where <tree_type> is the type of tree to look at (CLEAN/FULL/SEED/all),\n"; | |
print " <db> is the TreeFam database to look at (A/B/all).\n"; | |
print "perl -w treefam_dog_man_mouse.pl\n"; | |
print "For example, >perl -w treefam_dog_man_mouse.pl SEED A\n"; | |
exit; | |
} | |
# DECLARE MYSQL USERNAME AND HOST: | |
use DBI; | |
use lib "/nfs/disk100/wormpub2/Avril/downloadnov05/bioperl/bioperl-1.5.1/"; | |
use Bio::TreeIO; | |
use IO::String; | |
# FIND THE TYPE OF TREE TO LOOK AT: | |
$tree_type = $ARGV[0]; | |
if ($tree_type ne 'CLEAN' && $tree_type ne 'FULL' && $tree_type ne 'SEED' && $tree_type ne 'all') | |
{ | |
print STDERR "ERROR: tree_type should be CLEAN/FULL/SEED/all.\n"; | |
exit; | |
} | |
# FIND THE TREEFAM DATABASE TO LOOK AT: | |
$db = $ARGV[1]; | |
if ($db ne 'A' && $db ne 'B') | |
{ | |
print STDERR "ERROR: db should be A/B/all.\n"; | |
exit; | |
} | |
#------------------------------------------------------------------# | |
# GET THE TREEFAM-A TREES FROM THE MYSQL DATABASE: | |
$dbh = DBI->connect("dbi:mysql:treefam_2:db.treefam.org:3308", 'anonymous', '') || return; | |
$table_w = 'trees'; | |
# THIS TABLE HAS THE ACCESSION NUMBER, TYPE OF TREE (SEED/FULL/WHOLE/CLEAN) and the TREE ITSELF (NEWICK FORMAT). | |
$st = "SELECT AC, TYPE, TREE from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) { | |
$AC = $array[0]; # eg., TF101001 | |
$TYPE = $array[1]; # eg., SEED | |
$TREE = $array[2]; # THE TREE IN NEWICK FORMAT. | |
# NOW READ IN THE NEWICK FORMAT TREE AND SEE IF IT CONTAINS INTERNAL PLANT/YEAST SEQUENCES: | |
if ($AC eq '') { print STDERR "ERROR: AC is $AC.\n"; exit;} | |
if ($TYPE eq '') { print STDERR "ERROR: TYPE is $TYPE.\n"; exit;} | |
# CHECK THAT THIS IS THE TYPE OF TREE (CLEAN/FULL/SEED/all) THAT WE WANT TO LOOK AT: | |
if ($TYPE eq $tree_type || $tree_type eq 'all') | |
{ | |
# CHECK THAT THIS TREE COMES FROM THE TREEFAM DATABASE THAT WE WANT TO LOOK AT (A/B/all): | |
if ((substr($AC,0,3) eq 'TF1' && $db eq 'A') || (substr($AC,0,3) eq 'TF3' && $db eq 'B') || $db eq 'all') | |
{ | |
# CHECK THE NHX FORMAT OF THE TREE IS FINE: | |
$TREE = &check_nhx_format($TREE,$AC,$TYPE); | |
# PARSE THE TREE USING THE BIOPERL TREE PARSER: | |
if ($TREE ne 'none') # IF THE TREE'S FORMAT IS OK. | |
{ | |
&read_tree($TREE,$AC,$TYPE); | |
} | |
} | |
} | |
} | |
} | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# SUBROUTINE TO CHECK THE NHX FORMAT OF THE TREE, TO SEE IF IT WILL | |
# BE ABLE TO BE READ BY THE BIOPERL TREE PARSER. SPECIFICALLY, THIS | |
# CHECKS TO SEE IF THE TREE CONTAINS ANY INCORRECT SQUARE BRACKETS, ie. | |
# [&&NHX] (WITHOUT ANY TAG INFORMATION). | |
# 23-FEB-06. | |
sub check_nhx_format | |
{ | |
my $tree = $_[0]; #> THE PHYLOGENETIC TREE. | |
my $AC = $_[1]; #> THE TREEFAM ACCESSION NUMBER. | |
my $TYPE = $_[2]; #> THE TYPE OF TREE. | |
my $length = length($tree); #> LENGTH OF $tree. | |
my $i; #> | |
my $take = 1; #> | |
my $char; #> A CHARACTER IN $tree. | |
my $bracket = ""; #> | |
my $new_tree = ""; #> | |
my $seen_semicolon = 0; #> SAYS WHETHER WE HAVE SEEN ; IN THE TREE. | |
# LOOK AT THE TREE $tree, CHARACTER-BY-CHARACTER: | |
for ($i = 0; $i < $length; $i++) | |
{ | |
$char = substr($tree,$i,1); | |
# CHECK IF THE CHARACTER IS A SEMICOLON: | |
if ($char eq ';') { $seen_semicolon = 1;} | |
# RECORD THE CHARACTERS WITHIN SQUARE BRACKETS: | |
if ($take == 0 || $char eq '[' || $char eq ']') { $bracket = $bracket.$char;} | |
# CHECK IF THIS IS THE START OR END OF A REGION WITHIN SQUARE BRACKETS: | |
if ($char eq '[') # THE START OF A NHX TAG. | |
{ | |
$take = 0; | |
} | |
elsif ($char eq ']') # THE END OF A NHX TAG. | |
{ | |
$take = 1; | |
# CHECK IF THERE IS TAG INFORMATION WITHIN THE BRACKETED REGION: | |
if ($bracket eq '[&&NHX]') | |
{ | |
$bracket = '[&&NHX:S=Hello]'; | |
} | |
$new_tree = $new_tree.$bracket; | |
$bracket = ""; | |
} | |
# CHECK IF THIS IS A REGION OUTSIDE SQUARE BRACKETS: | |
if ($take == 1 && $char ne ']') | |
{ | |
$new_tree = $new_tree.$char; | |
} | |
} | |
# CHECK WHETHER THE TREE ENDED EARLY: | |
if ($seen_semicolon == 0) | |
{ | |
print STDERR "ERROR: tree $AC $TYPE is truncated.\n"; | |
print "ERROR: tree $AC $TYPE is truncated.\n"; | |
return "none"; | |
} | |
return $new_tree; | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE TO READ IN THE NEWICK FORMAT TREE AND SEE IF IT CONTAINS | |
# INTERNAL PLANT/YEAST SEQUENCES. THIS USES BIOPERL TO PARSE THE TREE. | |
# 22-FEB-06. | |
sub read_tree | |
{ | |
my $tree = $_[0]; #> THE TREE IN NEWICK FORMAT. | |
my $AC = $_[1]; #> THE ACCESSION NUMBER FOR THE TREEFAM FAMILY. | |
my $TYPE = $_[2]; #> THE TYPE OF TREE (SEED/FULL/CLEAN). | |
my $treefh; #> | |
my $input; #> | |
my $new_tree = ''; #> | |
my $rootnode; #> | |
my $parent; #> | |
my %PARENT = (); #> HASH TABLE OF THE PARENTS OF NODES. | |
my %SPLITS_INTO = (); #> HASH TABLE OF THE DAUGHTERS OF NODES. | |
my %SEQ = (); #> HASH TABLE OF SEQUENCE NAMES FOR LEAVES OF THE TREE. | |
my $node; #> | |
my @nodes; #> | |
my $seq; #> | |
my $no_outgroups = 0; #> THE NUMBER OF OUTGROUP SEQUENCES IN THE TREE. | |
my @outgroups; #> ARRAY OF OUTGROUP SEQUENCES IN THE TREE. | |
my @non_outgroups; #> ARRAY OF NON-OUTGROUP SEQUENCES IN THE TREE. | |
my $outgroups_together; #> SAYS WHETHER THE OUTGROUPS GROUP TOGETHER IN THE TREE. | |
my $PRINT_FORK_DETAILS = 0; #> IF 1, PRINT DETAILS OF THE BRANCHING OF THE TREE. | |
my $nodes; #> | |
my $i; #> | |
my $no_dog = 0; #> | |
my $no_chicken = 0; #> | |
my $no_mouse = 0; #> | |
my $no_human = 0; #> | |
my @dogmouse; # xxx | |
my @doghuman; # xxx | |
my @mousehuman; # xxx | |
my @mouse; # xxx | |
my @dog; # xxx | |
my @human; # xxx | |
my @chicken | |
# PARSE THE TREE USING BIOPERL. THE TREE IS IN EXTENDED NEWICK FORMAT. | |
$treefh = new IO::String($tree); | |
$input = new Bio::TreeIO(-fh => $treefh, -format => "nhx"); | |
eval { $new_tree = $input->next_tree }; | |
if ($@) | |
{ | |
print STDERR "ERROR: not able to parse tree $AC $TYPE.\n"; | |
print "ERROR: not able to parse tree $AC $TYPE.\n"; | |
return; | |
} | |
if ($new_tree eq '') | |
{ | |
print STDERR "ERROR: not able to parse tree $AC $TYPE.\n"; | |
print "ERROR: not able to parse tree $AC $TYPE.\n"; | |
return; | |
} | |
# GET THE ROOT NODE OF THE TREE: | |
$rootnode = $new_tree->get_root_node; | |
# GET ALL THE INTERNAL NODES IN THE TREE, AND RECORD THEIR PARENT NODES IN THE HASH | |
# TABLE %PARENT, AND THEIR DESCENDANTS IN THE HASH TABLE %SPLITS_INTO: | |
@nodes = $new_tree->get_nodes(); | |
foreach $node(@nodes) | |
{ | |
if ($node ne $rootnode) | |
{ | |
$parent = $node->ancestor; | |
# RECORD THAT NODE $parent IS THE PARENT NODE OF $node: | |
$PARENT{$node} = $parent; | |
# RECORD THAT $parent HAS $node AS A DESCENDANT: | |
if (!($SPLITS_INTO{$parent})) { $SPLITS_INTO{$parent} = $node; } | |
else { $SPLITS_INTO{$parent} = $SPLITS_INTO{$parent}.",".$node;} | |
# IF THIS IS A LEAF, CHECK WHETHER THIS IS AN OUTGROUP SEQUENCE: | |
if ($node->is_Leaf) | |
{ | |
$seq = $node->id(); | |
$SEQ{$node} = $seq; | |
if ($seq =~ /_CANFA/) | |
{ | |
@dog = (@dog,$node); | |
@doghuman = (@doghuman,$node); | |
@dogmouse = (@dogmouse,$node); | |
} | |
elsif ($seq =~ /_MOUSE/) | |
{ | |
@mouse = (@mouse,$node); | |
@dogmouse = (@dogmouse,$node); | |
@mousehuman = (@mousehuman,$node); | |
} | |
elsif ($seq =~ /_HUMAN/) | |
{ | |
@human = (@human,$node); | |
@mousehuman = (@mousehuman,$node); | |
@doghuman = (@doghuman,$node); | |
} | |
#xxxif ($seq =~ /_ARATH/ || $seq =~ /_YEAST/ || $seq =~ /_SCHPO/) | |
#{ | |
# $no_outgroups++; | |
# @outgroups = (@outgroups,$node); | |
#} | |
#else | |
#{ | |
# @non_outgroups = (@non_outgroups,$node); | |
#} | |
} | |
} | |
} | |
# SEE IF WE WANT TO PRINT OUT DETAILS OF THE BRANCHING OF THE TREE: | |
if ($PRINT_FORK_DETAILS == 1) | |
{ | |
foreach $parent (keys %SPLITS_INTO) | |
{ | |
print "$parent splits into "; | |
$nodes = $SPLITS_INTO{$parent}; | |
@nodes = split(/\,/,$nodes); | |
for ($i = 0; $i <= $#nodes; $i++) | |
{ | |
$node = $nodes[$i]; | |
if ($SEQ{$node}) { print "$SEQ{$node}, ";} | |
else { print "$node, "; } | |
} | |
print "\n"; | |
} | |
} | |
# NOW GO THROUGH EACH OF THE NODES IN THE TREE, AND FIND OUT WHETHER THERE | |
# IS ARE OUTGROUPS IN THE TREE, AND IF SO, WHETHER THE OUTGROUP SEQUENCES | |
# GROUP TOGETHER IN THE TREE: | |
#xxx if ($no_outgroups > 1) | |
if ($#dog >= 0 && $#human >= 0 && $#mouse >= 0) | |
{ | |
$dogmouse_together = &check_if_outgroups_group_together(\%PARENT,\%SPLITS_INTO,\@dogmouse,\@human,$AC,$TYPE,\%SEQ,"dogmouse"); | |
$doghuman_together = &check_if_outgroups_group_together(\%PARENT,\%SPLITS_INTO,\@doghuman,\@mouse,$AC,$TYPE,\%SEQ,"doghuman"); | |
$mousehuman_together = &check_if_outgroups_group_together(\%PARENT,\%SPLITS_INTO,\@mousehuman,\@dog,$AC,$TYPE,\%SEQ,"mousehuman"); | |
print "Tree $AC $TYPE dogmouse $dogmouse_together doghuman $doghuman_together mousehuman $mousehuman_together\n"; | |
print STDERR "Tree $AC $TYPE dogmouse $dogmouse_together doghuman $doghuman_together mousehuman $mousehuman_together\n"; | |
#xxx $outgroups_together = &check_if_outgroups_group_together(\%PARENT,\%SPLITS_INTO,\@outgroups,\@non_outgroups,$AC,$TYPE,\%SEQ); | |
#if ($outgroups_together == 1) | |
#{ | |
# print "Tree $AC $TYPE has the outgroups grouped together correctly.\n"; | |
# print STDERR "Tree $AC $TYPE has the outgroups grouped together correctly.\n"; | |
#} | |
#else | |
#{ | |
# print "*** Tree $AC $TYPE has internal outgroups in the tree!!! ***\n"; | |
# print STDERR "*** Tree $AC $TYPE has internal outgroups in the tree!!! ***\n"; | |
#} | |
} | |
else | |
{ | |
print "Tree $AC $TYPE has not got dog and mouse and human.\n"; | |
print STDERR "Tree $AC $TYPE has not got dog and mouse and human.\n"; | |
#xxxprint "Tree $AC $TYPE has $no_outgroups outgroups.\n"; | |
#print STDERR "Tree $AC $TYPE has $no_outgroups outgroups.\n"; | |
} | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE TO CHECK WHETHER THE OUTGROUPS IN THE TREE (IF THERE ARE | |
# ANY) ARE GROUPED TOGETHER: | |
# 22-FEB-06. | |
sub check_if_outgroups_group_together | |
{ | |
my $PARENT = $_[0]; #> POINTER TO HASH TABLE OF THE PARENTS OF NODES. | |
my $SPLITS_INTO = $_[1]; #> POINTER TO HASH TABLE OF THE DAUGHTERS OF NODES. | |
my $outgroups = $_[2]; #> POINTER TO ARRAY OF OUTGROUP SEQUENCES IN THE TREE. | |
my $non_outgroups = $_[3]; #> POINTER TO ARRAY OF NON-OUTGROUP SEQUENCES IN THE TREE. | |
my $AC = $_[4]; #> THE ACCESSION NUMBER OF THE TREEFAM TREE. | |
my $TYPE = $_[5]; #> THE TYPE OF THE TREEFAM TREE. | |
my $SEQ = $_[6]; #> POINTER TO HASH TABLE OF NAMES FOR LEAVES OF THE TREE. | |
my $taxa = $_[7]; # xxx dogmouse/doghuman/mousehuman | |
my $i; #> | |
my $non_outgroup; #> | |
my $parent; #> | |
my @parents; #> | |
my $lca; #> THE LAST COMMON ANCESTOR OF THE NON-OUTGROUP SEQUENCES IN THE TREE. | |
my $descendants; #> | |
my @descendants; #> | |
my %SEEN = (); #> | |
my $no_descendants; #> | |
my $added_new_descendant; #> | |
my $descendant; #> | |
my $daughters; #> | |
my @daughters; #> | |
my $j; #> | |
my $outgroups_together = 1; #> | |
my @temp; #> | |
my $seq; #> | |
# FIND THE LAST COMMON ANCESTOR OF THE NON-OUTGROUP SEQUENCES. | |
for ($i = 0; $i < @$non_outgroups; $i++) | |
{ | |
$non_outgroup = $non_outgroups->[$i]; | |
# FIND THE ANCESTORS OF $non_outgroup, UP TO THE ROOT OF THE TREE: | |
$parent = &find_ancestors($non_outgroup,$PARENT); | |
@parents = (@parents,$parent); | |
} | |
$lca = &find_lca(\@parents); | |
# NOW CHECK WHETHER THE LAST COMMON ANCESTOR OF THE ANIMAL SEQUENCES HAS | |
# ANY YEAST/PLANT DESCENDANTS: | |
if (!($SPLITS_INTO->{$lca})) { print "ERROR: do not know descendants of $lca.\n"; exit;} | |
$descendants = $SPLITS_INTO->{$lca}; | |
@descendants = split(/\,/,$descendants); | |
$SEEN{$lca} = 1; | |
LOOP_AGAIN_NOW: | |
$no_descendants = $#descendants + 1; | |
$added_new_descendant = 0; | |
for ($i = 0; $i < $no_descendants; $i++) | |
{ | |
$descendant = $descendants[$i]; | |
if ($SEQ->{$descendant}) | |
{ | |
$seq = $SEQ->{$descendant}; | |
if (($taxa eq 'doghuman' && ($seq =~ /_CANFA/ || $seq =~ /_HUMAN/)) || | |
($taxa eq 'dogmouse' && ($seq =~ /_CANFA/ || $seq =~ /_MOUSE/)) || | |
($taxa eq 'mousehuman' && ($seq =~ /_MOUSE/ || $seq =~ /_HUMAN/)) ) | |
# xxx if ($seq =~ /_YEAST/ || $seq =~ /_SCHPO/ || $seq =~ /_ARATH/) | |
{ | |
# THIS IS AN OUTGROUP SEQUENCE, SO THE OUTGROUP SEQUENCES MUST NOT GROUP TOGETHER: | |
print "Tree $AC $TYPE: sequence $seq of the non-$taxa LCA $lca is a $taxa sequence ($taxa don't group)!\n"; | |
print STDERR "Tree $AC $TYPE: sequence $seq of the non-$taxa LCA $lca is a $taxa sequence ($taxa don't group)!\n"; | |
$outgroups_together = 0; | |
return $outgroups_together; | |
} | |
} | |
if ($SEEN{$descendant}) { goto NEXT_DESCENDANT;} | |
$SEEN{$descendant} = 1; | |
if (!($SEQ->{$descendant})) # THE DESCENDANT IS A NODE. | |
{ | |
if (!($SPLITS_INTO->{$descendant})) { print "ERROR: do not know descendants of $descendant.\n"; exit;} | |
$daughters = $SPLITS_INTO->{$descendant}; | |
@daughters = split(/\,/,$daughters); | |
for ($j = 0; $j <= $#daughters; $j++) | |
{ | |
if (!($SEEN{$daughters[$j]})) | |
{ | |
@descendants= (@descendants,$daughters[$j]); | |
$added_new_descendant = 1; | |
} | |
} | |
} | |
NEXT_DESCENDANT: | |
if ($i == $no_descendants - 1) { goto OUT_OF_LOOP;} | |
} | |
OUT_OF_LOOP: | |
if ($added_new_descendant == 1) { goto LOOP_AGAIN_NOW;} | |
return $outgroups_together; | |
} | |
#------------------------------------------------------------------# | |
# | |
# SUBROUTINE TO FIND THE LAST COMMON ANCESTOR OF A GROUP OF SEQUENCES. | |
# 22-FEB-06. | |
sub find_lca | |
{ | |
my $parents = $_[0]; #> POINTER TO ARRAY OF ANCESTORS OF SEQUENCES. | |
my $i; #> | |
my $parent; #> | |
my $node; #> | |
my @nodes; #> | |
my $j; #> | |
my @temp; #> | |
my @all_nodes; #> | |
my %SEEN = (); #> | |
my $common_ancestor = 0; #> | |
my $lca = "none";#> THE LAST COMMON ANCESTOR OF THE SEQUENCES. | |
# RECORD ALL THE ANCESTORS OF ALL THE SEQUENCES: | |
for ($i = 0; $i < @$parents; $i++) | |
{ | |
$parent = $parents->[$i]; # eg. -NODE13-NODE60-NODE73-NODE74 | |
@nodes = split(/-/,$parent); | |
for ($j = 1; $j <= $#nodes; $j++) | |
{ | |
$node = $nodes[$j]; | |
if (!($SEEN{$node})) { $SEEN{$node} = 1; @all_nodes = (@all_nodes,$node);} | |
# RECORD THAT $parent CONTAINS $node: | |
$SEEN{$parent."*".$node} = 1; | |
} | |
} | |
# GO THROUGH EACH OF THE NODES AND CHECK WHETHER THERE IS ANY NODE THAT | |
# IS ANCESTRAL TO ALL THE SEQUENCES: | |
for ($i = 0; $i <= $#all_nodes; $i++) | |
{ | |
$node = $all_nodes[$i]; | |
$common_ancestor = 1; | |
for ($j = 0; $j < @$parents; $j++) | |
{ | |
$parent = $parents->[$j]; | |
if (!($SEEN{$parent."*".$node})) { $common_ancestor = 0;} | |
} | |
if ($common_ancestor == 1) | |
{ | |
# NODE $node IS THE LAST COMMON ANCESTOR OF ALL THE SEQUENCES: | |
$lca = $node; | |
return $lca; | |
} | |
} | |
if ($lca eq 'none') { print STDERR "ERROR: lca is $lca.\n"; exit;} | |
return($lca); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE TO FIND THE ANCESTORS OF A SEQUENCE, UP TO THE ROOT OF THE TREE. | |
# 22-FEB-06. | |
sub find_ancestors | |
{ | |
my $seq = $_[0]; #> THE SEQUENCE OF INTEREST. | |
my $PARENT = $_[1]; #> POINTER TO HASH TABLE OF PARENTS OF NODES. | |
my $parent = ""; #> | |
my $done = 0; #> | |
while ($done == 0) | |
{ | |
if ($PARENT->{$seq}) | |
{ | |
$parent = $parent."-".$PARENT->{$seq}; | |
$seq = $PARENT->{$seq}; | |
} | |
else | |
{ | |
$done = 1; | |
} | |
} | |
if ($parent eq '') { print STDERR "ERROR: parent of $seq is $parent.\n"; exit;} | |
return($parent); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment