avrilcoghlan/treefam_dog_man_mouse.pl

## treefam_dog_man_mouse.pl
#!/usr/local/bin/perl

#
# Perl script treefam_dog_man_mouse.pl
# Written by Avril Coghlan (alc@sanger.ac.uk).
# 5-Apr-06.
#
# For the TreeFam project.
#
# This perl script connects to the MYSQL database of TreeFam families and
# prints out a list of the TreeFam trees that contain different topologies
# with respect to dog, man and mouse.
#
# The command-line format is:
# % perl <treefam_dog_man_mouse.pl> tree_type db
#   where tree_type is the type of tree to look at (CLEAN/FULL/SEED/all),
#         db is the TreeFam database to look at (A/B/all).
#
#------------------------------------------------------------------#

# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

$num_args                  = $#ARGV + 1;
if ($num_args != 2)
{
   print "Usage of treefam_dog_man_mouse.pl <tree_type> <db>\n\n";
   print "where <tree_type> is the type of tree to look at (CLEAN/FULL/SEED/all),\n";
   print "      <db> is the TreeFam database to look at (A/B/all).\n";
   print "perl -w treefam_dog_man_mouse.pl\n";
   print "For example, >perl -w treefam_dog_man_mouse.pl SEED A\n";
   exit;
}

# DECLARE MYSQL USERNAME AND HOST:

use DBI;
use lib "/nfs/disk100/wormpub2/Avril/downloadnov05/bioperl/bioperl-1.5.1/";
use Bio::TreeIO;
use IO::String;

# FIND THE TYPE OF TREE TO LOOK AT:
$tree_type                 = $ARGV[0];
if ($tree_type ne 'CLEAN' && $tree_type ne 'FULL' && $tree_type ne 'SEED' && $tree_type ne 'all')
{
   print STDERR "ERROR: tree_type should be CLEAN/FULL/SEED/all.\n";
   exit;
}

# FIND THE TREEFAM DATABASE TO LOOK AT:
$db                        = $ARGV[1];
if ($db ne 'A' && $db ne 'B')
{
   print STDERR "ERROR: db should be A/B/all.\n";
   exit;
}

#------------------------------------------------------------------#

# GET THE TREEFAM-A TREES FROM THE MYSQL DATABASE:

$dbh                       = DBI->connect("dbi:mysql:treefam_2:db.treefam.org:3308", 'anonymous', '') || return;
$table_w                   = 'trees';
# THIS TABLE HAS THE ACCESSION NUMBER, TYPE OF TREE (SEED/FULL/WHOLE/CLEAN) and the TREE ITSELF (NEWICK FORMAT).
$st                        = "SELECT AC, TYPE, TREE from $table_w";
$sth                       = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv                        = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
   while ((@array) = $sth->fetchrow_array)    {
      $AC                  = $array[0];  # eg., TF101001
      $TYPE                = $array[1];  # eg., SEED
      $TREE                = $array[2];  # THE TREE IN NEWICK FORMAT.
      # NOW READ IN THE NEWICK FORMAT TREE AND SEE IF IT CONTAINS INTERNAL PLANT/YEAST SEQUENCES:
      if ($AC   eq '') { print STDERR "ERROR: AC is $AC.\n";     exit;}
      if ($TYPE eq '') { print STDERR "ERROR: TYPE is $TYPE.\n"; exit;}
      # CHECK THAT THIS IS THE TYPE OF TREE (CLEAN/FULL/SEED/all) THAT WE WANT TO LOOK AT:
      if ($TYPE eq $tree_type || $tree_type eq 'all')
      {
         # CHECK THAT THIS TREE COMES FROM THE TREEFAM DATABASE THAT WE WANT TO LOOK AT (A/B/all):
         if ((substr($AC,0,3) eq 'TF1' && $db eq 'A') || (substr($AC,0,3) eq 'TF3' && $db eq 'B') || $db eq 'all')
         {
            # CHECK THE NHX FORMAT OF THE TREE IS FINE:
            $TREE          = &check_nhx_format($TREE,$AC,$TYPE);
            # PARSE THE TREE USING THE BIOPERL TREE PARSER:
            if ($TREE ne 'none') # IF THE TREE'S FORMAT IS OK.
            {
               &read_tree($TREE,$AC,$TYPE);
            }
         }
      }
   }
}
$rc                        = $dbh->disconnect();
$rc                        = "";

#------------------------------------------------------------------#

print STDERR "FINISHED.\n";

#------------------------------------------------------------------#

# SUBROUTINE TO CHECK THE NHX FORMAT OF THE TREE, TO SEE IF IT WILL
# BE ABLE TO BE READ BY THE BIOPERL TREE PARSER. SPECIFICALLY, THIS
# CHECKS TO SEE IF THE TREE CONTAINS ANY INCORRECT SQUARE BRACKETS, ie.
# [&&NHX] (WITHOUT ANY TAG INFORMATION).
# 23-FEB-06.

sub check_nhx_format
{
   my $tree                = $_[0];          #> THE PHYLOGENETIC TREE.
   my $AC                  = $_[1];          #> THE TREEFAM ACCESSION NUMBER.
   my $TYPE                = $_[2];          #> THE TYPE OF TREE.
   my $length              = length($tree);  #> LENGTH OF $tree.
   my $i;                                    #>
   my $take                = 1;              #>
   my $char;                                 #> A CHARACTER IN $tree.
   my $bracket             = "";             #>
   my $new_tree            = "";             #>
   my $seen_semicolon      = 0;              #> SAYS WHETHER WE HAVE SEEN ; IN THE TREE.

   # LOOK AT THE TREE $tree, CHARACTER-BY-CHARACTER:
   for ($i = 0; $i < $length; $i++)
   {
      $char                = substr($tree,$i,1);
      # CHECK IF THE CHARACTER IS A SEMICOLON:
      if ($char eq ';') { $seen_semicolon = 1;}
      # RECORD THE CHARACTERS WITHIN SQUARE BRACKETS:
      if ($take == 0 || $char eq '[' || $char eq ']') { $bracket = $bracket.$char;}
      # CHECK IF THIS IS THE START OR END OF A REGION WITHIN SQUARE BRACKETS:
      if    ($char eq '[') # THE START OF A NHX TAG.
      {
         $take             = 0;
      }
      elsif ($char eq ']') # THE END OF A NHX TAG.
      {
         $take             = 1;
         # CHECK IF THERE IS TAG INFORMATION WITHIN THE BRACKETED REGION:
         if ($bracket eq '[&&NHX]')
         {
            $bracket       = '[&&NHX:S=Hello]';
         }
         $new_tree         = $new_tree.$bracket;
         $bracket          = "";
      }
      # CHECK IF THIS IS A REGION OUTSIDE SQUARE BRACKETS:
      if ($take == 1 && $char ne ']')
      {
         $new_tree         = $new_tree.$char;
      }
   }


   # CHECK WHETHER THE TREE ENDED EARLY:
   if ($seen_semicolon == 0)
   {
      print STDERR "ERROR: tree $AC $TYPE is truncated.\n";
      print "ERROR: tree $AC $TYPE is truncated.\n";
      return "none";
   }

   return $new_tree;
}

#------------------------------------------------------------------#

# SUBROUTINE TO READ IN THE NEWICK FORMAT TREE AND SEE IF IT CONTAINS
# INTERNAL PLANT/YEAST SEQUENCES. THIS USES BIOPERL TO PARSE THE TREE.
# 22-FEB-06.

sub read_tree
{
   my $tree                = $_[0]; #> THE TREE IN NEWICK FORMAT.
   my $AC                  = $_[1]; #> THE ACCESSION NUMBER FOR THE TREEFAM FAMILY.
   my $TYPE                = $_[2]; #> THE TYPE OF TREE (SEED/FULL/CLEAN).
   my $treefh;                      #>
   my $input;                       #>
   my $new_tree            = '';    #>
   my $rootnode;                    #>
   my $parent;                      #>
   my %PARENT              = ();    #> HASH TABLE OF THE PARENTS OF NODES.
   my %SPLITS_INTO         = ();    #> HASH TABLE OF THE DAUGHTERS OF NODES.
   my %SEQ                 = ();    #> HASH TABLE OF SEQUENCE NAMES FOR LEAVES OF THE TREE.
   my $node;                        #>
   my @nodes;                       #>
   my $seq;                         #>
   my $no_outgroups        = 0;     #> THE NUMBER OF OUTGROUP SEQUENCES IN THE TREE.
   my @outgroups;                   #> ARRAY OF OUTGROUP SEQUENCES IN THE TREE.
   my @non_outgroups;               #> ARRAY OF NON-OUTGROUP SEQUENCES IN THE TREE.
   my $outgroups_together;          #> SAYS WHETHER THE OUTGROUPS GROUP TOGETHER IN THE TREE.
   my $PRINT_FORK_DETAILS  = 0;     #> IF 1, PRINT DETAILS OF THE BRANCHING OF THE TREE.
   my $nodes;                       #>
   my $i;                           #>
   my $no_dog              = 0;     #>
   my $no_chicken          = 0;     #>
   my $no_mouse            = 0;     #>
   my $no_human            = 0;     #>

   my @dogmouse;   # xxx
   my @doghuman;   # xxx
   my @mousehuman; # xxx
   my @mouse;      # xxx
   my @dog;        # xxx
   my @human;      # xxx
   my @chicken

   # PARSE THE TREE USING BIOPERL. THE TREE IS IN EXTENDED NEWICK FORMAT.
   $treefh                 = new IO::String($tree);
   $input                  = new Bio::TreeIO(-fh => $treefh, -format => "nhx");

   eval { $new_tree        = $input->next_tree };
   if ($@)
   {
      print STDERR "ERROR: not able to parse tree $AC $TYPE.\n";
      print "ERROR: not able to parse tree $AC $TYPE.\n";
      return;
   }
   if ($new_tree eq '')
   {
      print STDERR "ERROR: not able to parse tree $AC $TYPE.\n";
      print "ERROR: not able to parse tree $AC $TYPE.\n";
      return;
   }

   # GET THE ROOT NODE OF THE TREE:
   $rootnode               = $new_tree->get_root_node;

   # GET ALL THE INTERNAL NODES IN THE TREE, AND RECORD THEIR PARENT NODES IN THE HASH
   # TABLE %PARENT, AND THEIR DESCENDANTS IN THE HASH TABLE %SPLITS_INTO:
   @nodes                  = $new_tree->get_nodes();
   foreach $node(@nodes)
   {
     if ($node ne $rootnode)
     {
         $parent           = $node->ancestor;
         # RECORD THAT NODE $parent IS THE PARENT NODE OF $node:
         $PARENT{$node}    = $parent;
         # RECORD THAT $parent HAS $node AS A DESCENDANT:
         if (!($SPLITS_INTO{$parent})) { $SPLITS_INTO{$parent} = $node;                          }
         else                          { $SPLITS_INTO{$parent} = $SPLITS_INTO{$parent}.",".$node;}

         # IF THIS IS A LEAF, CHECK WHETHER THIS IS AN OUTGROUP SEQUENCE:
         if ($node->is_Leaf)
         {
            $seq           = $node->id();
            $SEQ{$node}    = $seq;
            if ($seq =~ /_CANFA/)
            {
               @dog        = (@dog,$node);
               @doghuman   = (@doghuman,$node);
               @dogmouse   = (@dogmouse,$node);
            }
            elsif ($seq =~ /_MOUSE/)
            {
               @mouse      = (@mouse,$node);
               @dogmouse   = (@dogmouse,$node);
               @mousehuman = (@mousehuman,$node);
            }
            elsif ($seq =~ /_HUMAN/)
            {
               @human      = (@human,$node);
               @mousehuman = (@mousehuman,$node);
               @doghuman   = (@doghuman,$node);
            }
            #xxxif ($seq =~ /_ARATH/ || $seq =~ /_YEAST/ || $seq =~ /_SCHPO/)
            #{
            #   $no_outgroups++;
            #   @outgroups  = (@outgroups,$node);
            #}
            #else
            #{
            #   @non_outgroups = (@non_outgroups,$node);
            #}
         }
      }
   }

   # SEE IF WE WANT TO PRINT OUT DETAILS OF THE BRANCHING OF THE TREE:
   if ($PRINT_FORK_DETAILS == 1)
   {
      foreach $parent (keys %SPLITS_INTO)
      {
         print "$parent splits into ";
         $nodes            = $SPLITS_INTO{$parent};
         @nodes            = split(/\,/,$nodes);
         for ($i = 0; $i <= $#nodes; $i++)
         {
            $node          = $nodes[$i];
            if ($SEQ{$node}) { print "$SEQ{$node}, ";}
            else             { print "$node, ";      }
         }
         print "\n";
      }
   }

   # NOW GO THROUGH EACH OF THE NODES IN THE TREE, AND FIND OUT WHETHER THERE
   # IS ARE OUTGROUPS IN THE TREE, AND IF SO, WHETHER THE OUTGROUP SEQUENCES
   # GROUP TOGETHER IN THE TREE:
   #xxx if ($no_outgroups > 1)
   if ($#dog >= 0 && $#human >= 0 && $#mouse >= 0)
   {
      $dogmouse_together   = &check_if_outgroups_group_together(\%PARENT,\%SPLITS_INTO,\@dogmouse,\@human,$AC,$TYPE,\%SEQ,"dogmouse");
      $doghuman_together   = &check_if_outgroups_group_together(\%PARENT,\%SPLITS_INTO,\@doghuman,\@mouse,$AC,$TYPE,\%SEQ,"doghuman");
      $mousehuman_together = &check_if_outgroups_group_together(\%PARENT,\%SPLITS_INTO,\@mousehuman,\@dog,$AC,$TYPE,\%SEQ,"mousehuman");

      print "Tree $AC $TYPE dogmouse $dogmouse_together doghuman $doghuman_together mousehuman $mousehuman_together\n";
      print STDERR "Tree $AC $TYPE dogmouse $dogmouse_together doghuman $doghuman_together mousehuman $mousehuman_together\n";
      #xxx $outgroups_together  = &check_if_outgroups_group_together(\%PARENT,\%SPLITS_INTO,\@outgroups,\@non_outgroups,$AC,$TYPE,\%SEQ);
      #if ($outgroups_together == 1)
      #{
      #   print "Tree $AC $TYPE has the outgroups grouped together correctly.\n";
      #   print STDERR "Tree $AC $TYPE has the outgroups grouped together correctly.\n";
      #}
      #else
      #{
      #   print "*** Tree $AC $TYPE has internal outgroups in the tree!!! ***\n";
      #   print STDERR "*** Tree $AC $TYPE has internal outgroups in the tree!!! ***\n";
      #}
   }
   else
   {
      print "Tree $AC $TYPE has not got dog and mouse and human.\n";
      print STDERR "Tree $AC $TYPE has not got dog and mouse and human.\n";
      #xxxprint "Tree $AC $TYPE has $no_outgroups outgroups.\n";
      #print STDERR "Tree $AC $TYPE has $no_outgroups outgroups.\n";
   }
}

#------------------------------------------------------------------#

# SUBROUTINE TO CHECK WHETHER THE OUTGROUPS IN THE TREE (IF THERE ARE
# ANY) ARE GROUPED TOGETHER:
# 22-FEB-06.

sub check_if_outgroups_group_together
{
   my $PARENT              = $_[0]; #> POINTER TO HASH TABLE OF THE PARENTS OF NODES.
   my $SPLITS_INTO         = $_[1]; #> POINTER TO HASH TABLE OF THE DAUGHTERS OF NODES.
   my $outgroups           = $_[2]; #> POINTER TO ARRAY OF OUTGROUP SEQUENCES IN THE TREE.
   my $non_outgroups       = $_[3]; #> POINTER TO ARRAY OF NON-OUTGROUP SEQUENCES IN THE TREE.
   my $AC                  = $_[4]; #> THE ACCESSION NUMBER OF THE TREEFAM TREE.
   my $TYPE                = $_[5]; #> THE TYPE OF THE TREEFAM TREE.
   my $SEQ                 = $_[6]; #> POINTER TO HASH TABLE OF NAMES FOR LEAVES OF THE TREE.
   my $taxa                = $_[7]; # xxx dogmouse/doghuman/mousehuman
   my $i;                           #>
   my $non_outgroup;                #>
   my $parent;                      #>
   my @parents;                     #>
   my $lca;                         #> THE LAST COMMON ANCESTOR OF THE NON-OUTGROUP SEQUENCES IN THE TREE.
   my $descendants;                 #>
   my @descendants;                 #>
   my %SEEN                = ();    #>
   my $no_descendants;              #>
   my $added_new_descendant;        #>
   my $descendant;                  #>
   my $daughters;                   #>
   my @daughters;                   #>
   my $j;                           #>
   my $outgroups_together  = 1;     #>
   my @temp;                        #>
   my $seq;                         #>

   # FIND THE LAST COMMON ANCESTOR OF THE NON-OUTGROUP SEQUENCES.
   for ($i = 0; $i < @$non_outgroups; $i++)
   {
      $non_outgroup        = $non_outgroups->[$i];
      # FIND THE ANCESTORS OF $non_outgroup, UP TO THE ROOT OF THE TREE:
      $parent              = &find_ancestors($non_outgroup,$PARENT);
      @parents             = (@parents,$parent);
   }
   $lca                    = &find_lca(\@parents);

   # NOW CHECK WHETHER THE LAST COMMON ANCESTOR OF THE ANIMAL SEQUENCES HAS
   # ANY YEAST/PLANT DESCENDANTS:
   if (!($SPLITS_INTO->{$lca})) { print "ERROR: do not know descendants of $lca.\n"; exit;}
   $descendants            = $SPLITS_INTO->{$lca};
   @descendants            = split(/\,/,$descendants);
   $SEEN{$lca}             = 1;

   LOOP_AGAIN_NOW:
   $no_descendants         = $#descendants + 1;
   $added_new_descendant   = 0;
   for ($i = 0; $i < $no_descendants; $i++)
   {
      $descendant          = $descendants[$i];
      if ($SEQ->{$descendant})
      {
         $seq              = $SEQ->{$descendant};
         if (($taxa eq 'doghuman'   && ($seq =~ /_CANFA/ || $seq =~ /_HUMAN/)) ||
             ($taxa eq 'dogmouse'   && ($seq =~ /_CANFA/ || $seq =~ /_MOUSE/)) ||
             ($taxa eq 'mousehuman' && ($seq =~ /_MOUSE/ || $seq =~ /_HUMAN/)) )
         # xxx if ($seq =~ /_YEAST/ || $seq =~ /_SCHPO/ || $seq =~ /_ARATH/)
         {
            # THIS IS AN OUTGROUP SEQUENCE, SO THE OUTGROUP SEQUENCES MUST NOT GROUP TOGETHER:
            print "Tree $AC $TYPE: sequence $seq of the non-$taxa LCA $lca is a $taxa sequence ($taxa don't group)!\n";
            print STDERR "Tree $AC $TYPE: sequence $seq of the non-$taxa LCA $lca is a $taxa sequence ($taxa don't group)!\n";
            $outgroups_together = 0;
            return $outgroups_together;
         }
      }
      if ($SEEN{$descendant}) { goto NEXT_DESCENDANT;}
      $SEEN{$descendant}   = 1;

      if (!($SEQ->{$descendant})) # THE DESCENDANT IS A NODE.
      {
         if (!($SPLITS_INTO->{$descendant})) { print "ERROR: do not know descendants of $descendant.\n"; exit;}
         $daughters        = $SPLITS_INTO->{$descendant};
         @daughters        = split(/\,/,$daughters);
         for ($j = 0; $j <= $#daughters; $j++)
         {
            if (!($SEEN{$daughters[$j]}))
            {
               @descendants= (@descendants,$daughters[$j]);
               $added_new_descendant = 1;
            }
         }
      }
      NEXT_DESCENDANT:
      if ($i == $no_descendants - 1) { goto OUT_OF_LOOP;}
   }
   OUT_OF_LOOP:
   if ($added_new_descendant == 1) { goto LOOP_AGAIN_NOW;}

   return $outgroups_together;
}

#------------------------------------------------------------------#
#
# SUBROUTINE TO FIND THE LAST COMMON ANCESTOR OF A GROUP OF SEQUENCES.
# 22-FEB-06.

sub find_lca
{
   my $parents             = $_[0]; #> POINTER TO ARRAY OF ANCESTORS OF SEQUENCES.
   my $i;                           #>
   my $parent;                      #>
   my $node;                        #>
   my @nodes;                       #>
   my $j;                           #>
   my @temp;                        #>
   my @all_nodes;                   #>
   my %SEEN                = ();    #>
   my $common_ancestor     = 0;     #>
   my $lca                 = "none";#> THE LAST COMMON ANCESTOR OF THE SEQUENCES.

   # RECORD ALL THE ANCESTORS OF ALL THE SEQUENCES:
   for ($i = 0; $i < @$parents; $i++)
   {
      $parent              = $parents->[$i]; # eg. -NODE13-NODE60-NODE73-NODE74
      @nodes               = split(/-/,$parent);
      for ($j = 1; $j <= $#nodes; $j++)
      {
         $node             = $nodes[$j];
         if (!($SEEN{$node})) { $SEEN{$node} = 1; @all_nodes = (@all_nodes,$node);}
         # RECORD THAT $parent CONTAINS $node:
         $SEEN{$parent."*".$node} = 1;
      }
   }

   # GO THROUGH EACH OF THE NODES AND CHECK WHETHER THERE IS ANY NODE THAT
   # IS ANCESTRAL TO ALL THE SEQUENCES:
   for ($i = 0; $i <= $#all_nodes; $i++)
   {
      $node                = $all_nodes[$i];
      $common_ancestor     = 1;
      for ($j = 0; $j < @$parents; $j++)
      {
         $parent           = $parents->[$j];
         if (!($SEEN{$parent."*".$node})) { $common_ancestor = 0;}
      }
      if ($common_ancestor == 1)
      {
         # NODE $node IS THE LAST COMMON ANCESTOR OF ALL THE SEQUENCES:
         $lca              = $node;
         return $lca;
      }
   }

   if ($lca eq 'none') { print STDERR "ERROR: lca is $lca.\n"; exit;}
   return($lca);
}

#------------------------------------------------------------------#

# SUBROUTINE TO FIND THE ANCESTORS OF A SEQUENCE, UP TO THE ROOT OF THE TREE.
# 22-FEB-06.

sub find_ancestors
{
   my $seq                 = $_[0]; #> THE SEQUENCE OF INTEREST.
   my $PARENT              = $_[1]; #> POINTER TO HASH TABLE OF PARENTS OF NODES.
   my $parent              = "";    #>
   my $done                = 0;     #>

   while ($done == 0)
   {
      if ($PARENT->{$seq})
      {
         $parent           = $parent."-".$PARENT->{$seq};
         $seq              = $PARENT->{$seq};
      }
      else
      {
         $done             = 1;
      }
   }
   if ($parent eq '') { print STDERR "ERROR: parent of $seq is $parent.\n"; exit;}

   return($parent);
}

#------------------------------------------------------------------#
	#!/usr/local/bin/perl

	#
	# Perl script treefam_dog_man_mouse.pl
	# Written by Avril Coghlan (alc@sanger.ac.uk).
	# 5-Apr-06.
	#
	# For the TreeFam project.
	#
	# This perl script connects to the MYSQL database of TreeFam families and
	# prints out a list of the TreeFam trees that contain different topologies
	# with respect to dog, man and mouse.
	#
	# The command-line format is:
	# % perl <treefam_dog_man_mouse.pl> tree_type db
	# where tree_type is the type of tree to look at (CLEAN/FULL/SEED/all),
	# db is the TreeFam database to look at (A/B/all).
	#
	#------------------------------------------------------------------#

	# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

	$num_args = $#ARGV + 1;
	if ($num_args != 2)
	{
	print "Usage of treefam_dog_man_mouse.pl <tree_type> <db>\n\n";
	print "where <tree_type> is the type of tree to look at (CLEAN/FULL/SEED/all),\n";
	print " <db> is the TreeFam database to look at (A/B/all).\n";
	print "perl -w treefam_dog_man_mouse.pl\n";
	print "For example, >perl -w treefam_dog_man_mouse.pl SEED A\n";
	exit;
	}

	# DECLARE MYSQL USERNAME AND HOST:

	use DBI;
	use lib "/nfs/disk100/wormpub2/Avril/downloadnov05/bioperl/bioperl-1.5.1/";
	use Bio::TreeIO;
	use IO::String;

	# FIND THE TYPE OF TREE TO LOOK AT:
	$tree_type = $ARGV[0];
	if ($tree_type ne 'CLEAN' && $tree_type ne 'FULL' && $tree_type ne 'SEED' && $tree_type ne 'all')
	{
	print STDERR "ERROR: tree_type should be CLEAN/FULL/SEED/all.\n";
	exit;
	}

	# FIND THE TREEFAM DATABASE TO LOOK AT:
	$db = $ARGV[1];
	if ($db ne 'A' && $db ne 'B')
	{
	print STDERR "ERROR: db should be A/B/all.\n";
	exit;
	}

	#------------------------------------------------------------------#

	# GET THE TREEFAM-A TREES FROM THE MYSQL DATABASE:

	$dbh = DBI->connect("dbi:mysql:treefam_2:db.treefam.org:3308", 'anonymous', '') \|\| return;
	$table_w = 'trees';
	# THIS TABLE HAS THE ACCESSION NUMBER, TYPE OF TREE (SEED/FULL/WHOLE/CLEAN) and the TREE ITSELF (NEWICK FORMAT).
	$st = "SELECT AC, TYPE, TREE from $table_w";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array) {
	$AC = $array[0]; # eg., TF101001
	$TYPE = $array[1]; # eg., SEED
	$TREE = $array[2]; # THE TREE IN NEWICK FORMAT.
	# NOW READ IN THE NEWICK FORMAT TREE AND SEE IF IT CONTAINS INTERNAL PLANT/YEAST SEQUENCES:
	if ($AC eq '') { print STDERR "ERROR: AC is $AC.\n"; exit;}
	if ($TYPE eq '') { print STDERR "ERROR: TYPE is $TYPE.\n"; exit;}
	# CHECK THAT THIS IS THE TYPE OF TREE (CLEAN/FULL/SEED/all) THAT WE WANT TO LOOK AT:
	if ($TYPE eq $tree_type \|\| $tree_type eq 'all')
	{
	# CHECK THAT THIS TREE COMES FROM THE TREEFAM DATABASE THAT WE WANT TO LOOK AT (A/B/all):
	if ((substr($AC,0,3) eq 'TF1' && $db eq 'A') \|\| (substr($AC,0,3) eq 'TF3' && $db eq 'B') \|\| $db eq 'all')
	{
	# CHECK THE NHX FORMAT OF THE TREE IS FINE:
	$TREE = &check_nhx_format($TREE,$AC,$TYPE);
	# PARSE THE TREE USING THE BIOPERL TREE PARSER:
	if ($TREE ne 'none') # IF THE TREE'S FORMAT IS OK.
	{
	&read_tree($TREE,$AC,$TYPE);
	}
	}
	}
	}
	}
	$rc = $dbh->disconnect();
	$rc = "";

	#------------------------------------------------------------------#

	print STDERR "FINISHED.\n";

	#------------------------------------------------------------------#

	# SUBROUTINE TO CHECK THE NHX FORMAT OF THE TREE, TO SEE IF IT WILL
	# BE ABLE TO BE READ BY THE BIOPERL TREE PARSER. SPECIFICALLY, THIS
	# CHECKS TO SEE IF THE TREE CONTAINS ANY INCORRECT SQUARE BRACKETS, ie.
	# [&&NHX] (WITHOUT ANY TAG INFORMATION).
	# 23-FEB-06.

	sub check_nhx_format
	{
	my $tree = $_[0]; #> THE PHYLOGENETIC TREE.
	my $AC = $_[1]; #> THE TREEFAM ACCESSION NUMBER.
	my $TYPE = $_[2]; #> THE TYPE OF TREE.
	my $length = length($tree); #> LENGTH OF $tree.
	my $i; #>
	my $take = 1; #>
	my $char; #> A CHARACTER IN $tree.
	my $bracket = ""; #>
	my $new_tree = ""; #>
	my $seen_semicolon = 0; #> SAYS WHETHER WE HAVE SEEN ; IN THE TREE.

	# LOOK AT THE TREE $tree, CHARACTER-BY-CHARACTER:
	for ($i = 0; $i < $length; $i++)
	{
	$char = substr($tree,$i,1);
	# CHECK IF THE CHARACTER IS A SEMICOLON:
	if ($char eq ';') { $seen_semicolon = 1;}
	# RECORD THE CHARACTERS WITHIN SQUARE BRACKETS:
	if ($take == 0 \|\| $char eq '[' \|\| $char eq ']') { $bracket = $bracket.$char;}
	# CHECK IF THIS IS THE START OR END OF A REGION WITHIN SQUARE BRACKETS:
	if ($char eq '[') # THE START OF A NHX TAG.
	{
	$take = 0;
	}
	elsif ($char eq ']') # THE END OF A NHX TAG.
	{
	$take = 1;
	# CHECK IF THERE IS TAG INFORMATION WITHIN THE BRACKETED REGION:
	if ($bracket eq '[&&NHX]')
	{
	$bracket = '[&&NHX:S=Hello]';
	}
	$new_tree = $new_tree.$bracket;
	$bracket = "";
	}
	# CHECK IF THIS IS A REGION OUTSIDE SQUARE BRACKETS:
	if ($take == 1 && $char ne ']')
	{
	$new_tree = $new_tree.$char;
	}
	}


	# CHECK WHETHER THE TREE ENDED EARLY:
	if ($seen_semicolon == 0)
	{
	print STDERR "ERROR: tree $AC $TYPE is truncated.\n";
	print "ERROR: tree $AC $TYPE is truncated.\n";
	return "none";
	}

	return $new_tree;
	}

	#------------------------------------------------------------------#

	# SUBROUTINE TO READ IN THE NEWICK FORMAT TREE AND SEE IF IT CONTAINS
	# INTERNAL PLANT/YEAST SEQUENCES. THIS USES BIOPERL TO PARSE THE TREE.
	# 22-FEB-06.

	sub read_tree
	{
	my $tree = $_[0]; #> THE TREE IN NEWICK FORMAT.
	my $AC = $_[1]; #> THE ACCESSION NUMBER FOR THE TREEFAM FAMILY.
	my $TYPE = $_[2]; #> THE TYPE OF TREE (SEED/FULL/CLEAN).
	my $treefh; #>
	my $input; #>
	my $new_tree = ''; #>
	my $rootnode; #>
	my $parent; #>
	my %PARENT = (); #> HASH TABLE OF THE PARENTS OF NODES.
	my %SPLITS_INTO = (); #> HASH TABLE OF THE DAUGHTERS OF NODES.
	my %SEQ = (); #> HASH TABLE OF SEQUENCE NAMES FOR LEAVES OF THE TREE.
	my $node; #>
	my @nodes; #>
	my $seq; #>
	my $no_outgroups = 0; #> THE NUMBER OF OUTGROUP SEQUENCES IN THE TREE.
	my @outgroups; #> ARRAY OF OUTGROUP SEQUENCES IN THE TREE.
	my @non_outgroups; #> ARRAY OF NON-OUTGROUP SEQUENCES IN THE TREE.
	my $outgroups_together; #> SAYS WHETHER THE OUTGROUPS GROUP TOGETHER IN THE TREE.
	my $PRINT_FORK_DETAILS = 0; #> IF 1, PRINT DETAILS OF THE BRANCHING OF THE TREE.
	my $nodes; #>
	my $i; #>
	my $no_dog = 0; #>
	my $no_chicken = 0; #>
	my $no_mouse = 0; #>
	my $no_human = 0; #>

	my @dogmouse; # xxx
	my @doghuman; # xxx
	my @mousehuman; # xxx
	my @mouse; # xxx
	my @dog; # xxx
	my @human; # xxx
	my @chicken

	# PARSE THE TREE USING BIOPERL. THE TREE IS IN EXTENDED NEWICK FORMAT.
	$treefh = new IO::String($tree);
	$input = new Bio::TreeIO(-fh => $treefh, -format => "nhx");

	eval { $new_tree = $input->next_tree };
	if ($@)
	{
	print STDERR "ERROR: not able to parse tree $AC $TYPE.\n";
	print "ERROR: not able to parse tree $AC $TYPE.\n";
	return;
	}
	if ($new_tree eq '')
	{
	print STDERR "ERROR: not able to parse tree $AC $TYPE.\n";
	print "ERROR: not able to parse tree $AC $TYPE.\n";
	return;
	}

	# GET THE ROOT NODE OF THE TREE:
	$rootnode = $new_tree->get_root_node;

	# GET ALL THE INTERNAL NODES IN THE TREE, AND RECORD THEIR PARENT NODES IN THE HASH
	# TABLE %PARENT, AND THEIR DESCENDANTS IN THE HASH TABLE %SPLITS_INTO:
	@nodes = $new_tree->get_nodes();
	foreach $node(@nodes)
	{
	if ($node ne $rootnode)
	{
	$parent = $node->ancestor;
	# RECORD THAT NODE $parent IS THE PARENT NODE OF $node:
	$PARENT{$node} = $parent;
	# RECORD THAT $parent HAS $node AS A DESCENDANT:
	if (!($SPLITS_INTO{$parent})) { $SPLITS_INTO{$parent} = $node; }
	else { $SPLITS_INTO{$parent} = $SPLITS_INTO{$parent}.",".$node;}

	# IF THIS IS A LEAF, CHECK WHETHER THIS IS AN OUTGROUP SEQUENCE:
	if ($node->is_Leaf)
	{
	$seq = $node->id();
	$SEQ{$node} = $seq;
	if ($seq =~ /_CANFA/)
	{
	@dog = (@dog,$node);
	@doghuman = (@doghuman,$node);
	@dogmouse = (@dogmouse,$node);
	}
	elsif ($seq =~ /_MOUSE/)
	{
	@mouse = (@mouse,$node);
	@dogmouse = (@dogmouse,$node);
	@mousehuman = (@mousehuman,$node);
	}
	elsif ($seq =~ /_HUMAN/)
	{
	@human = (@human,$node);
	@mousehuman = (@mousehuman,$node);
	@doghuman = (@doghuman,$node);
	}
	#xxxif ($seq =~ /_ARATH/ \|\| $seq =~ /_YEAST/ \|\| $seq =~ /_SCHPO/)
	#{
	# $no_outgroups++;
	# @outgroups = (@outgroups,$node);
	#}
	#else
	#{
	# @non_outgroups = (@non_outgroups,$node);
	#}
	}
	}
	}

	# SEE IF WE WANT TO PRINT OUT DETAILS OF THE BRANCHING OF THE TREE:
	if ($PRINT_FORK_DETAILS == 1)
	{
	foreach $parent (keys %SPLITS_INTO)
	{
	print "$parent splits into ";
	$nodes = $SPLITS_INTO{$parent};
	@nodes = split(/\,/,$nodes);
	for ($i = 0; $i <= $#nodes; $i++)
	{
	$node = $nodes[$i];
	if ($SEQ{$node}) { print "$SEQ{$node}, ";}
	else { print "$node, "; }
	}
	print "\n";
	}
	}

	# NOW GO THROUGH EACH OF THE NODES IN THE TREE, AND FIND OUT WHETHER THERE
	# IS ARE OUTGROUPS IN THE TREE, AND IF SO, WHETHER THE OUTGROUP SEQUENCES
	# GROUP TOGETHER IN THE TREE:
	#xxx if ($no_outgroups > 1)
	if ($#dog >= 0 && $#human >= 0 && $#mouse >= 0)
	{
	$dogmouse_together = &check_if_outgroups_group_together(\%PARENT,\%SPLITS_INTO,\@dogmouse,\@human,$AC,$TYPE,\%SEQ,"dogmouse");
	$doghuman_together = &check_if_outgroups_group_together(\%PARENT,\%SPLITS_INTO,\@doghuman,\@mouse,$AC,$TYPE,\%SEQ,"doghuman");
	$mousehuman_together = &check_if_outgroups_group_together(\%PARENT,\%SPLITS_INTO,\@mousehuman,\@dog,$AC,$TYPE,\%SEQ,"mousehuman");

	print "Tree $AC $TYPE dogmouse $dogmouse_together doghuman $doghuman_together mousehuman $mousehuman_together\n";
	print STDERR "Tree $AC $TYPE dogmouse $dogmouse_together doghuman $doghuman_together mousehuman $mousehuman_together\n";
	#xxx $outgroups_together = &check_if_outgroups_group_together(\%PARENT,\%SPLITS_INTO,\@outgroups,\@non_outgroups,$AC,$TYPE,\%SEQ);
	#if ($outgroups_together == 1)
	#{
	# print "Tree $AC $TYPE has the outgroups grouped together correctly.\n";
	# print STDERR "Tree $AC $TYPE has the outgroups grouped together correctly.\n";
	#}
	#else
	#{
	# print "* Tree $AC $TYPE has internal outgroups in the tree!!! *\n";
	# print STDERR "* Tree $AC $TYPE has internal outgroups in the tree!!! *\n";
	#}
	}
	else
	{
	print "Tree $AC $TYPE has not got dog and mouse and human.\n";
	print STDERR "Tree $AC $TYPE has not got dog and mouse and human.\n";
	#xxxprint "Tree $AC $TYPE has $no_outgroups outgroups.\n";
	#print STDERR "Tree $AC $TYPE has $no_outgroups outgroups.\n";
	}
	}

	#------------------------------------------------------------------#

	# SUBROUTINE TO CHECK WHETHER THE OUTGROUPS IN THE TREE (IF THERE ARE
	# ANY) ARE GROUPED TOGETHER:
	# 22-FEB-06.

	sub check_if_outgroups_group_together
	{
	my $PARENT = $_[0]; #> POINTER TO HASH TABLE OF THE PARENTS OF NODES.
	my $SPLITS_INTO = $_[1]; #> POINTER TO HASH TABLE OF THE DAUGHTERS OF NODES.
	my $outgroups = $_[2]; #> POINTER TO ARRAY OF OUTGROUP SEQUENCES IN THE TREE.
	my $non_outgroups = $_[3]; #> POINTER TO ARRAY OF NON-OUTGROUP SEQUENCES IN THE TREE.
	my $AC = $_[4]; #> THE ACCESSION NUMBER OF THE TREEFAM TREE.
	my $TYPE = $_[5]; #> THE TYPE OF THE TREEFAM TREE.
	my $SEQ = $_[6]; #> POINTER TO HASH TABLE OF NAMES FOR LEAVES OF THE TREE.
	my $taxa = $_[7]; # xxx dogmouse/doghuman/mousehuman
	my $i; #>
	my $non_outgroup; #>
	my $parent; #>
	my @parents; #>
	my $lca; #> THE LAST COMMON ANCESTOR OF THE NON-OUTGROUP SEQUENCES IN THE TREE.
	my $descendants; #>
	my @descendants; #>
	my %SEEN = (); #>
	my $no_descendants; #>
	my $added_new_descendant; #>
	my $descendant; #>
	my $daughters; #>
	my @daughters; #>
	my $j; #>
	my $outgroups_together = 1; #>
	my @temp; #>
	my $seq; #>

	# FIND THE LAST COMMON ANCESTOR OF THE NON-OUTGROUP SEQUENCES.
	for ($i = 0; $i < @$non_outgroups; $i++)
	{
	$non_outgroup = $non_outgroups->[$i];
	# FIND THE ANCESTORS OF $non_outgroup, UP TO THE ROOT OF THE TREE:
	$parent = &find_ancestors($non_outgroup,$PARENT);
	@parents = (@parents,$parent);
	}
	$lca = &find_lca(\@parents);

	# NOW CHECK WHETHER THE LAST COMMON ANCESTOR OF THE ANIMAL SEQUENCES HAS
	# ANY YEAST/PLANT DESCENDANTS:
	if (!($SPLITS_INTO->{$lca})) { print "ERROR: do not know descendants of $lca.\n"; exit;}
	$descendants = $SPLITS_INTO->{$lca};
	@descendants = split(/\,/,$descendants);
	$SEEN{$lca} = 1;

	LOOP_AGAIN_NOW:
	$no_descendants = $#descendants + 1;
	$added_new_descendant = 0;
	for ($i = 0; $i < $no_descendants; $i++)
	{
	$descendant = $descendants[$i];
	if ($SEQ->{$descendant})
	{
	$seq = $SEQ->{$descendant};
	if (($taxa eq 'doghuman' && ($seq =~ /_CANFA/ \|\| $seq =~ /_HUMAN/)) \|\|
	($taxa eq 'dogmouse' && ($seq =~ /_CANFA/ \|\| $seq =~ /_MOUSE/)) \|\|
	($taxa eq 'mousehuman' && ($seq =~ /_MOUSE/ \|\| $seq =~ /_HUMAN/)) )
	# xxx if ($seq =~ /_YEAST/ \|\| $seq =~ /_SCHPO/ \|\| $seq =~ /_ARATH/)
	{
	# THIS IS AN OUTGROUP SEQUENCE, SO THE OUTGROUP SEQUENCES MUST NOT GROUP TOGETHER:
	print "Tree $AC $TYPE: sequence $seq of the non-$taxa LCA $lca is a $taxa sequence ($taxa don't group)!\n";
	print STDERR "Tree $AC $TYPE: sequence $seq of the non-$taxa LCA $lca is a $taxa sequence ($taxa don't group)!\n";
	$outgroups_together = 0;
	return $outgroups_together;
	}
	}
	if ($SEEN{$descendant}) { goto NEXT_DESCENDANT;}
	$SEEN{$descendant} = 1;

	if (!($SEQ->{$descendant})) # THE DESCENDANT IS A NODE.
	{
	if (!($SPLITS_INTO->{$descendant})) { print "ERROR: do not know descendants of $descendant.\n"; exit;}
	$daughters = $SPLITS_INTO->{$descendant};
	@daughters = split(/\,/,$daughters);
	for ($j = 0; $j <= $#daughters; $j++)
	{
	if (!($SEEN{$daughters[$j]}))
	{
	@descendants= (@descendants,$daughters[$j]);
	$added_new_descendant = 1;
	}
	}
	}
	NEXT_DESCENDANT:
	if ($i == $no_descendants - 1) { goto OUT_OF_LOOP;}
	}
	OUT_OF_LOOP:
	if ($added_new_descendant == 1) { goto LOOP_AGAIN_NOW;}

	return $outgroups_together;
	}

	#------------------------------------------------------------------#
	#
	# SUBROUTINE TO FIND THE LAST COMMON ANCESTOR OF A GROUP OF SEQUENCES.
	# 22-FEB-06.

	sub find_lca
	{
	my $parents = $_[0]; #> POINTER TO ARRAY OF ANCESTORS OF SEQUENCES.
	my $i; #>
	my $parent; #>
	my $node; #>
	my @nodes; #>
	my $j; #>
	my @temp; #>
	my @all_nodes; #>
	my %SEEN = (); #>
	my $common_ancestor = 0; #>
	my $lca = "none";#> THE LAST COMMON ANCESTOR OF THE SEQUENCES.

	# RECORD ALL THE ANCESTORS OF ALL THE SEQUENCES:
	for ($i = 0; $i < @$parents; $i++)
	{
	$parent = $parents->[$i]; # eg. -NODE13-NODE60-NODE73-NODE74
	@nodes = split(/-/,$parent);
	for ($j = 1; $j <= $#nodes; $j++)
	{
	$node = $nodes[$j];
	if (!($SEEN{$node})) { $SEEN{$node} = 1; @all_nodes = (@all_nodes,$node);}
	# RECORD THAT $parent CONTAINS $node:
	$SEEN{$parent."*".$node} = 1;
	}
	}

	# GO THROUGH EACH OF THE NODES AND CHECK WHETHER THERE IS ANY NODE THAT
	# IS ANCESTRAL TO ALL THE SEQUENCES:
	for ($i = 0; $i <= $#all_nodes; $i++)
	{
	$node = $all_nodes[$i];
	$common_ancestor = 1;
	for ($j = 0; $j < @$parents; $j++)
	{
	$parent = $parents->[$j];
	if (!($SEEN{$parent."*".$node})) { $common_ancestor = 0;}
	}
	if ($common_ancestor == 1)
	{
	# NODE $node IS THE LAST COMMON ANCESTOR OF ALL THE SEQUENCES:
	$lca = $node;
	return $lca;
	}
	}

	if ($lca eq 'none') { print STDERR "ERROR: lca is $lca.\n"; exit;}
	return($lca);
	}

	#------------------------------------------------------------------#

	# SUBROUTINE TO FIND THE ANCESTORS OF A SEQUENCE, UP TO THE ROOT OF THE TREE.
	# 22-FEB-06.

	sub find_ancestors
	{
	my $seq = $_[0]; #> THE SEQUENCE OF INTEREST.
	my $PARENT = $_[1]; #> POINTER TO HASH TABLE OF PARENTS OF NODES.
	my $parent = ""; #>
	my $done = 0; #>

	while ($done == 0)
	{
	if ($PARENT->{$seq})
	{
	$parent = $parent."-".$PARENT->{$seq};
	$seq = $PARENT->{$seq};
	}
	else
	{
	$done = 1;
	}
	}
	if ($parent eq '') { print STDERR "ERROR: parent of $seq is $parent.\n"; exit;}

	return($parent);
	}

	#------------------------------------------------------------------#