Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 16:35
Show Gist options
  • Save avrilcoghlan/5065876 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5065876 to your computer and use it in GitHub Desktop.
Perl script that connects to the TreeFam mysql database, and prints out of a list of the TreeFam trees that contain the different possible topologies with respect to the relationship between man, dog, and mouse.
#!/usr/local/bin/perl
#
# Perl script treefam_dog_man_mouse.pl
# Written by Avril Coghlan (alc@sanger.ac.uk).
# 5-Apr-06.
#
# For the TreeFam project.
#
# This perl script connects to the MYSQL database of TreeFam families and
# prints out a list of the TreeFam trees that contain different topologies
# with respect to dog, man and mouse.
#
# The command-line format is:
# % perl <treefam_dog_man_mouse.pl> tree_type db
# where tree_type is the type of tree to look at (CLEAN/FULL/SEED/all),
# db is the TreeFam database to look at (A/B/all).
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 2)
{
print "Usage of treefam_dog_man_mouse.pl <tree_type> <db>\n\n";
print "where <tree_type> is the type of tree to look at (CLEAN/FULL/SEED/all),\n";
print " <db> is the TreeFam database to look at (A/B/all).\n";
print "perl -w treefam_dog_man_mouse.pl\n";
print "For example, >perl -w treefam_dog_man_mouse.pl SEED A\n";
exit;
}
# DECLARE MYSQL USERNAME AND HOST:
use DBI;
use lib "/nfs/disk100/wormpub2/Avril/downloadnov05/bioperl/bioperl-1.5.1/";
use Bio::TreeIO;
use IO::String;
# FIND THE TYPE OF TREE TO LOOK AT:
$tree_type = $ARGV[0];
if ($tree_type ne 'CLEAN' && $tree_type ne 'FULL' && $tree_type ne 'SEED' && $tree_type ne 'all')
{
print STDERR "ERROR: tree_type should be CLEAN/FULL/SEED/all.\n";
exit;
}
# FIND THE TREEFAM DATABASE TO LOOK AT:
$db = $ARGV[1];
if ($db ne 'A' && $db ne 'B')
{
print STDERR "ERROR: db should be A/B/all.\n";
exit;
}
#------------------------------------------------------------------#
# GET THE TREEFAM-A TREES FROM THE MYSQL DATABASE:
$dbh = DBI->connect("dbi:mysql:treefam_2:db.treefam.org:3308", 'anonymous', '') || return;
$table_w = 'trees';
# THIS TABLE HAS THE ACCESSION NUMBER, TYPE OF TREE (SEED/FULL/WHOLE/CLEAN) and the TREE ITSELF (NEWICK FORMAT).
$st = "SELECT AC, TYPE, TREE from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array) {
$AC = $array[0]; # eg., TF101001
$TYPE = $array[1]; # eg., SEED
$TREE = $array[2]; # THE TREE IN NEWICK FORMAT.
# NOW READ IN THE NEWICK FORMAT TREE AND SEE IF IT CONTAINS INTERNAL PLANT/YEAST SEQUENCES:
if ($AC eq '') { print STDERR "ERROR: AC is $AC.\n"; exit;}
if ($TYPE eq '') { print STDERR "ERROR: TYPE is $TYPE.\n"; exit;}
# CHECK THAT THIS IS THE TYPE OF TREE (CLEAN/FULL/SEED/all) THAT WE WANT TO LOOK AT:
if ($TYPE eq $tree_type || $tree_type eq 'all')
{
# CHECK THAT THIS TREE COMES FROM THE TREEFAM DATABASE THAT WE WANT TO LOOK AT (A/B/all):
if ((substr($AC,0,3) eq 'TF1' && $db eq 'A') || (substr($AC,0,3) eq 'TF3' && $db eq 'B') || $db eq 'all')
{
# CHECK THE NHX FORMAT OF THE TREE IS FINE:
$TREE = &check_nhx_format($TREE,$AC,$TYPE);
# PARSE THE TREE USING THE BIOPERL TREE PARSER:
if ($TREE ne 'none') # IF THE TREE'S FORMAT IS OK.
{
&read_tree($TREE,$AC,$TYPE);
}
}
}
}
}
$rc = $dbh->disconnect();
$rc = "";
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# SUBROUTINE TO CHECK THE NHX FORMAT OF THE TREE, TO SEE IF IT WILL
# BE ABLE TO BE READ BY THE BIOPERL TREE PARSER. SPECIFICALLY, THIS
# CHECKS TO SEE IF THE TREE CONTAINS ANY INCORRECT SQUARE BRACKETS, ie.
# [&&NHX] (WITHOUT ANY TAG INFORMATION).
# 23-FEB-06.
sub check_nhx_format
{
my $tree = $_[0]; #> THE PHYLOGENETIC TREE.
my $AC = $_[1]; #> THE TREEFAM ACCESSION NUMBER.
my $TYPE = $_[2]; #> THE TYPE OF TREE.
my $length = length($tree); #> LENGTH OF $tree.
my $i; #>
my $take = 1; #>
my $char; #> A CHARACTER IN $tree.
my $bracket = ""; #>
my $new_tree = ""; #>
my $seen_semicolon = 0; #> SAYS WHETHER WE HAVE SEEN ; IN THE TREE.
# LOOK AT THE TREE $tree, CHARACTER-BY-CHARACTER:
for ($i = 0; $i < $length; $i++)
{
$char = substr($tree,$i,1);
# CHECK IF THE CHARACTER IS A SEMICOLON:
if ($char eq ';') { $seen_semicolon = 1;}
# RECORD THE CHARACTERS WITHIN SQUARE BRACKETS:
if ($take == 0 || $char eq '[' || $char eq ']') { $bracket = $bracket.$char;}
# CHECK IF THIS IS THE START OR END OF A REGION WITHIN SQUARE BRACKETS:
if ($char eq '[') # THE START OF A NHX TAG.
{
$take = 0;
}
elsif ($char eq ']') # THE END OF A NHX TAG.
{
$take = 1;
# CHECK IF THERE IS TAG INFORMATION WITHIN THE BRACKETED REGION:
if ($bracket eq '[&&NHX]')
{
$bracket = '[&&NHX:S=Hello]';
}
$new_tree = $new_tree.$bracket;
$bracket = "";
}
# CHECK IF THIS IS A REGION OUTSIDE SQUARE BRACKETS:
if ($take == 1 && $char ne ']')
{
$new_tree = $new_tree.$char;
}
}
# CHECK WHETHER THE TREE ENDED EARLY:
if ($seen_semicolon == 0)
{
print STDERR "ERROR: tree $AC $TYPE is truncated.\n";
print "ERROR: tree $AC $TYPE is truncated.\n";
return "none";
}
return $new_tree;
}
#------------------------------------------------------------------#
# SUBROUTINE TO READ IN THE NEWICK FORMAT TREE AND SEE IF IT CONTAINS
# INTERNAL PLANT/YEAST SEQUENCES. THIS USES BIOPERL TO PARSE THE TREE.
# 22-FEB-06.
sub read_tree
{
my $tree = $_[0]; #> THE TREE IN NEWICK FORMAT.
my $AC = $_[1]; #> THE ACCESSION NUMBER FOR THE TREEFAM FAMILY.
my $TYPE = $_[2]; #> THE TYPE OF TREE (SEED/FULL/CLEAN).
my $treefh; #>
my $input; #>
my $new_tree = ''; #>
my $rootnode; #>
my $parent; #>
my %PARENT = (); #> HASH TABLE OF THE PARENTS OF NODES.
my %SPLITS_INTO = (); #> HASH TABLE OF THE DAUGHTERS OF NODES.
my %SEQ = (); #> HASH TABLE OF SEQUENCE NAMES FOR LEAVES OF THE TREE.
my $node; #>
my @nodes; #>
my $seq; #>
my $no_outgroups = 0; #> THE NUMBER OF OUTGROUP SEQUENCES IN THE TREE.
my @outgroups; #> ARRAY OF OUTGROUP SEQUENCES IN THE TREE.
my @non_outgroups; #> ARRAY OF NON-OUTGROUP SEQUENCES IN THE TREE.
my $outgroups_together; #> SAYS WHETHER THE OUTGROUPS GROUP TOGETHER IN THE TREE.
my $PRINT_FORK_DETAILS = 0; #> IF 1, PRINT DETAILS OF THE BRANCHING OF THE TREE.
my $nodes; #>
my $i; #>
my $no_dog = 0; #>
my $no_chicken = 0; #>
my $no_mouse = 0; #>
my $no_human = 0; #>
my @dogmouse; # xxx
my @doghuman; # xxx
my @mousehuman; # xxx
my @mouse; # xxx
my @dog; # xxx
my @human; # xxx
my @chicken
# PARSE THE TREE USING BIOPERL. THE TREE IS IN EXTENDED NEWICK FORMAT.
$treefh = new IO::String($tree);
$input = new Bio::TreeIO(-fh => $treefh, -format => "nhx");
eval { $new_tree = $input->next_tree };
if ($@)
{
print STDERR "ERROR: not able to parse tree $AC $TYPE.\n";
print "ERROR: not able to parse tree $AC $TYPE.\n";
return;
}
if ($new_tree eq '')
{
print STDERR "ERROR: not able to parse tree $AC $TYPE.\n";
print "ERROR: not able to parse tree $AC $TYPE.\n";
return;
}
# GET THE ROOT NODE OF THE TREE:
$rootnode = $new_tree->get_root_node;
# GET ALL THE INTERNAL NODES IN THE TREE, AND RECORD THEIR PARENT NODES IN THE HASH
# TABLE %PARENT, AND THEIR DESCENDANTS IN THE HASH TABLE %SPLITS_INTO:
@nodes = $new_tree->get_nodes();
foreach $node(@nodes)
{
if ($node ne $rootnode)
{
$parent = $node->ancestor;
# RECORD THAT NODE $parent IS THE PARENT NODE OF $node:
$PARENT{$node} = $parent;
# RECORD THAT $parent HAS $node AS A DESCENDANT:
if (!($SPLITS_INTO{$parent})) { $SPLITS_INTO{$parent} = $node; }
else { $SPLITS_INTO{$parent} = $SPLITS_INTO{$parent}.",".$node;}
# IF THIS IS A LEAF, CHECK WHETHER THIS IS AN OUTGROUP SEQUENCE:
if ($node->is_Leaf)
{
$seq = $node->id();
$SEQ{$node} = $seq;
if ($seq =~ /_CANFA/)
{
@dog = (@dog,$node);
@doghuman = (@doghuman,$node);
@dogmouse = (@dogmouse,$node);
}
elsif ($seq =~ /_MOUSE/)
{
@mouse = (@mouse,$node);
@dogmouse = (@dogmouse,$node);
@mousehuman = (@mousehuman,$node);
}
elsif ($seq =~ /_HUMAN/)
{
@human = (@human,$node);
@mousehuman = (@mousehuman,$node);
@doghuman = (@doghuman,$node);
}
#xxxif ($seq =~ /_ARATH/ || $seq =~ /_YEAST/ || $seq =~ /_SCHPO/)
#{
# $no_outgroups++;
# @outgroups = (@outgroups,$node);
#}
#else
#{
# @non_outgroups = (@non_outgroups,$node);
#}
}
}
}
# SEE IF WE WANT TO PRINT OUT DETAILS OF THE BRANCHING OF THE TREE:
if ($PRINT_FORK_DETAILS == 1)
{
foreach $parent (keys %SPLITS_INTO)
{
print "$parent splits into ";
$nodes = $SPLITS_INTO{$parent};
@nodes = split(/\,/,$nodes);
for ($i = 0; $i <= $#nodes; $i++)
{
$node = $nodes[$i];
if ($SEQ{$node}) { print "$SEQ{$node}, ";}
else { print "$node, "; }
}
print "\n";
}
}
# NOW GO THROUGH EACH OF THE NODES IN THE TREE, AND FIND OUT WHETHER THERE
# IS ARE OUTGROUPS IN THE TREE, AND IF SO, WHETHER THE OUTGROUP SEQUENCES
# GROUP TOGETHER IN THE TREE:
#xxx if ($no_outgroups > 1)
if ($#dog >= 0 && $#human >= 0 && $#mouse >= 0)
{
$dogmouse_together = &check_if_outgroups_group_together(\%PARENT,\%SPLITS_INTO,\@dogmouse,\@human,$AC,$TYPE,\%SEQ,"dogmouse");
$doghuman_together = &check_if_outgroups_group_together(\%PARENT,\%SPLITS_INTO,\@doghuman,\@mouse,$AC,$TYPE,\%SEQ,"doghuman");
$mousehuman_together = &check_if_outgroups_group_together(\%PARENT,\%SPLITS_INTO,\@mousehuman,\@dog,$AC,$TYPE,\%SEQ,"mousehuman");
print "Tree $AC $TYPE dogmouse $dogmouse_together doghuman $doghuman_together mousehuman $mousehuman_together\n";
print STDERR "Tree $AC $TYPE dogmouse $dogmouse_together doghuman $doghuman_together mousehuman $mousehuman_together\n";
#xxx $outgroups_together = &check_if_outgroups_group_together(\%PARENT,\%SPLITS_INTO,\@outgroups,\@non_outgroups,$AC,$TYPE,\%SEQ);
#if ($outgroups_together == 1)
#{
# print "Tree $AC $TYPE has the outgroups grouped together correctly.\n";
# print STDERR "Tree $AC $TYPE has the outgroups grouped together correctly.\n";
#}
#else
#{
# print "*** Tree $AC $TYPE has internal outgroups in the tree!!! ***\n";
# print STDERR "*** Tree $AC $TYPE has internal outgroups in the tree!!! ***\n";
#}
}
else
{
print "Tree $AC $TYPE has not got dog and mouse and human.\n";
print STDERR "Tree $AC $TYPE has not got dog and mouse and human.\n";
#xxxprint "Tree $AC $TYPE has $no_outgroups outgroups.\n";
#print STDERR "Tree $AC $TYPE has $no_outgroups outgroups.\n";
}
}
#------------------------------------------------------------------#
# SUBROUTINE TO CHECK WHETHER THE OUTGROUPS IN THE TREE (IF THERE ARE
# ANY) ARE GROUPED TOGETHER:
# 22-FEB-06.
sub check_if_outgroups_group_together
{
my $PARENT = $_[0]; #> POINTER TO HASH TABLE OF THE PARENTS OF NODES.
my $SPLITS_INTO = $_[1]; #> POINTER TO HASH TABLE OF THE DAUGHTERS OF NODES.
my $outgroups = $_[2]; #> POINTER TO ARRAY OF OUTGROUP SEQUENCES IN THE TREE.
my $non_outgroups = $_[3]; #> POINTER TO ARRAY OF NON-OUTGROUP SEQUENCES IN THE TREE.
my $AC = $_[4]; #> THE ACCESSION NUMBER OF THE TREEFAM TREE.
my $TYPE = $_[5]; #> THE TYPE OF THE TREEFAM TREE.
my $SEQ = $_[6]; #> POINTER TO HASH TABLE OF NAMES FOR LEAVES OF THE TREE.
my $taxa = $_[7]; # xxx dogmouse/doghuman/mousehuman
my $i; #>
my $non_outgroup; #>
my $parent; #>
my @parents; #>
my $lca; #> THE LAST COMMON ANCESTOR OF THE NON-OUTGROUP SEQUENCES IN THE TREE.
my $descendants; #>
my @descendants; #>
my %SEEN = (); #>
my $no_descendants; #>
my $added_new_descendant; #>
my $descendant; #>
my $daughters; #>
my @daughters; #>
my $j; #>
my $outgroups_together = 1; #>
my @temp; #>
my $seq; #>
# FIND THE LAST COMMON ANCESTOR OF THE NON-OUTGROUP SEQUENCES.
for ($i = 0; $i < @$non_outgroups; $i++)
{
$non_outgroup = $non_outgroups->[$i];
# FIND THE ANCESTORS OF $non_outgroup, UP TO THE ROOT OF THE TREE:
$parent = &find_ancestors($non_outgroup,$PARENT);
@parents = (@parents,$parent);
}
$lca = &find_lca(\@parents);
# NOW CHECK WHETHER THE LAST COMMON ANCESTOR OF THE ANIMAL SEQUENCES HAS
# ANY YEAST/PLANT DESCENDANTS:
if (!($SPLITS_INTO->{$lca})) { print "ERROR: do not know descendants of $lca.\n"; exit;}
$descendants = $SPLITS_INTO->{$lca};
@descendants = split(/\,/,$descendants);
$SEEN{$lca} = 1;
LOOP_AGAIN_NOW:
$no_descendants = $#descendants + 1;
$added_new_descendant = 0;
for ($i = 0; $i < $no_descendants; $i++)
{
$descendant = $descendants[$i];
if ($SEQ->{$descendant})
{
$seq = $SEQ->{$descendant};
if (($taxa eq 'doghuman' && ($seq =~ /_CANFA/ || $seq =~ /_HUMAN/)) ||
($taxa eq 'dogmouse' && ($seq =~ /_CANFA/ || $seq =~ /_MOUSE/)) ||
($taxa eq 'mousehuman' && ($seq =~ /_MOUSE/ || $seq =~ /_HUMAN/)) )
# xxx if ($seq =~ /_YEAST/ || $seq =~ /_SCHPO/ || $seq =~ /_ARATH/)
{
# THIS IS AN OUTGROUP SEQUENCE, SO THE OUTGROUP SEQUENCES MUST NOT GROUP TOGETHER:
print "Tree $AC $TYPE: sequence $seq of the non-$taxa LCA $lca is a $taxa sequence ($taxa don't group)!\n";
print STDERR "Tree $AC $TYPE: sequence $seq of the non-$taxa LCA $lca is a $taxa sequence ($taxa don't group)!\n";
$outgroups_together = 0;
return $outgroups_together;
}
}
if ($SEEN{$descendant}) { goto NEXT_DESCENDANT;}
$SEEN{$descendant} = 1;
if (!($SEQ->{$descendant})) # THE DESCENDANT IS A NODE.
{
if (!($SPLITS_INTO->{$descendant})) { print "ERROR: do not know descendants of $descendant.\n"; exit;}
$daughters = $SPLITS_INTO->{$descendant};
@daughters = split(/\,/,$daughters);
for ($j = 0; $j <= $#daughters; $j++)
{
if (!($SEEN{$daughters[$j]}))
{
@descendants= (@descendants,$daughters[$j]);
$added_new_descendant = 1;
}
}
}
NEXT_DESCENDANT:
if ($i == $no_descendants - 1) { goto OUT_OF_LOOP;}
}
OUT_OF_LOOP:
if ($added_new_descendant == 1) { goto LOOP_AGAIN_NOW;}
return $outgroups_together;
}
#------------------------------------------------------------------#
#
# SUBROUTINE TO FIND THE LAST COMMON ANCESTOR OF A GROUP OF SEQUENCES.
# 22-FEB-06.
sub find_lca
{
my $parents = $_[0]; #> POINTER TO ARRAY OF ANCESTORS OF SEQUENCES.
my $i; #>
my $parent; #>
my $node; #>
my @nodes; #>
my $j; #>
my @temp; #>
my @all_nodes; #>
my %SEEN = (); #>
my $common_ancestor = 0; #>
my $lca = "none";#> THE LAST COMMON ANCESTOR OF THE SEQUENCES.
# RECORD ALL THE ANCESTORS OF ALL THE SEQUENCES:
for ($i = 0; $i < @$parents; $i++)
{
$parent = $parents->[$i]; # eg. -NODE13-NODE60-NODE73-NODE74
@nodes = split(/-/,$parent);
for ($j = 1; $j <= $#nodes; $j++)
{
$node = $nodes[$j];
if (!($SEEN{$node})) { $SEEN{$node} = 1; @all_nodes = (@all_nodes,$node);}
# RECORD THAT $parent CONTAINS $node:
$SEEN{$parent."*".$node} = 1;
}
}
# GO THROUGH EACH OF THE NODES AND CHECK WHETHER THERE IS ANY NODE THAT
# IS ANCESTRAL TO ALL THE SEQUENCES:
for ($i = 0; $i <= $#all_nodes; $i++)
{
$node = $all_nodes[$i];
$common_ancestor = 1;
for ($j = 0; $j < @$parents; $j++)
{
$parent = $parents->[$j];
if (!($SEEN{$parent."*".$node})) { $common_ancestor = 0;}
}
if ($common_ancestor == 1)
{
# NODE $node IS THE LAST COMMON ANCESTOR OF ALL THE SEQUENCES:
$lca = $node;
return $lca;
}
}
if ($lca eq 'none') { print STDERR "ERROR: lca is $lca.\n"; exit;}
return($lca);
}
#------------------------------------------------------------------#
# SUBROUTINE TO FIND THE ANCESTORS OF A SEQUENCE, UP TO THE ROOT OF THE TREE.
# 22-FEB-06.
sub find_ancestors
{
my $seq = $_[0]; #> THE SEQUENCE OF INTEREST.
my $PARENT = $_[1]; #> POINTER TO HASH TABLE OF PARENTS OF NODES.
my $parent = ""; #>
my $done = 0; #>
while ($done == 0)
{
if ($PARENT->{$seq})
{
$parent = $parent."-".$PARENT->{$seq};
$seq = $PARENT->{$seq};
}
else
{
$done = 1;
}
}
if ($parent eq '') { print STDERR "ERROR: parent of $seq is $parent.\n"; exit;}
return($parent);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment