Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5065978 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5065978 to your computer and use it in GitHub Desktop.
Perl script that, given a list of TreeFam families, and a file of features of the sequences in trees (eg. Pfam domains or GO terms), infers the likely features of the ancestral nodes in the trees for the families.
#!/usr/local/bin/perl
#
# Perl script treefam_infer_ancestral_features.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 30-May-07.
#
# This perl script infers the features of ancestral nodes
# in a list of TreeFam trees, given the features in the
# leaves.
#
# The command-line format is:
# % perl <treefam_infer_ancestral_features.pl> families alg feature_type GO_file
# where families is the list of TreeFam families,
# alg is the algorithm to use,
# feature_type is the type of feature to use (GO or Pfam),
# GO_file is the file containing the GO annotations for the leaves (if feature_type=GO; none otherwise).
# 'alg' can be 'rd1' = Richard's algorithm for GO annotations from the TreeFam meeting,
# 'rd2' = Traditional parsimony algorithm from Richard's book,
# 'rd3' = Weighted parsimony algorithm from Richard's book.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 4)
{
print "Usage of treefam_infer_ancestral_features.pl\n\n";
print "perl treefam_infer_ancestral_features.pl <families> <alg> <feature_type> <GO_file>\n";
print "where <families> is the list of TreeFam families,\n";
print " <alg> is the algorithm to use,\n";
print " <feature_type> is the type of feature to use (GO or Pfam),\n";
print " <GO_file> is the file containing the GO annotations for the leaves (if feature_type=GO; none otherwise).\n";
print "NOTE: alg can be: rd1 = Richard's algorithm for GO annotations,\n";
print " rd2 = traditional parsimony algorithm from Richard's book,\n";
print " rd3 = weighted parsimony algorithm from Richard's book.\n";
print "For example, >perl treefam_infer_ancestral_features.pl\n";
print "neural_genes_2may07 rd2 GO GO_slim_terms11_2may07\n";
exit;
}
# READ IN MY PERL MODULES:
BEGIN {
unshift (@INC, '/nfs/team71/phd/alc/perl/modules');
}
use lib "/nfs/team71/phd/alc/perl/treefam/Treefam_api_3may07/"; #xxx note that this is old, I've hacked it
#xxx to make it use TreeFam-4.
use Avril_modules;
use Treefam::DBConnection;
# FIND THE NAME OF THE LIST OF TREEFAM FAMILIES:
$fams = $ARGV[0];
# FIND THE ALGORITHM TO USE:
$alg = $ARGV[1];
if ($alg ne 'rd1' && $alg ne 'rd2' && $alg ne 'rd3')
{
print STDERR "ERROR: alg should be rd1 or rd2 or rd3!\n";
exit;
}
# FIND THE TYPE OF FEATURE TO USE:
$feature_type = $ARGV[2];
if ($feature_type ne 'GO' && $feature_type ne 'Pfam')
{
print STDERR "ERROR: feature_type should be GO or Pfam!\n";
exit;
}
# FIND THE NAME OF THE FILE CONTAINING THE GO ANNOTATIONS FOR THE LEAVES:
$GO_file = $ARGV[3];
srand;
$VERBOSE_PFAM = 0; # PRINTS PFAM DOMAINS IN GENES.
$VERBOSE_GOID = 0; # PRINTS GO ANNOTATIONS FOR GENES.
$VERBOSE3 = 1;
#------------------------------------------------------------------#
# READ IN THE LIST OF FAMILIES:
($families) = &read_family_list($fams);
print STDERR "Read in families...\n";
# READ IN THE INPUT DATA:
if ($feature_type eq 'Pfam')
{
# GET THE PFAM DOMAINS FOR THE GENES IN THE TREEFAM FAMILIES:
($FEATURES,$features,$SPECIES_WITH_DATA) = &get_pfam($families);
print STDERR "Got pfam domains from TreeFam...\n";
}
elsif ($feature_type eq 'GO')
{
# GET THE GO ANNOTATIONS FOR THE GENES IN THE TREEFAM FAMILIES:
($FEATURES,$features,$SPECIES_WITH_DATA) = &get_GO($GO_file,$families);
print STDERR "Got GO from $GO_file...\n";
}
else { print STDERR "ERROR: feature_type $feature_type.\n"; exit;}
# NOW INFER THE FEATURES OF ANCESTRAL NODES IN THESE FAMILIES:
&infer_ancestral_nodes($FEATURES,$families,$features,$SPECIES_WITH_DATA);
print STDERR "FINISHED.\n";
print "FINISHED\n";
#------------------------------------------------------------------#
# READ IN THE LIST OF FAMILIES FROM A FILE:
sub read_family_list
{
my $fams = $_[0]; #>
my $line; #>
my @temp; #>
my $family; #>
my @families; #>
my %SEEN = (); #>
open(FAMS,"$fams") || die "ERROR: cannot open $fams.\n";
while(<FAMS>)
{
$line = $_;
chomp $line; # eg. TF101101
@temp = split(/\s+/,$line);
if (substr($line,0,2) eq 'TF')
{
$family = $temp[0];
if (!($SEEN{$family}))
{
@families = (@families,$family);
$SEEN{$family} = 1;
}
}
}
close(FAMS);
return(\@families);
}
#------------------------------------------------------------------#
# IF $node IS A DUPLICATION NODE, ONLY CONFIRM IN THE CHILDREN THOSE
# CONFIRMED TERMS IN THE CURRENT NODE THAT ARE PROVISIONAL TERMS IN
# THE CHILD NODE.
sub add_confirmed_terms_to_children2
{
my $node = $_[0]; #
my $node_name = $_[1]; #
my $CONFIRMED_FEAT = $_[2]; #
my $VERBOSE_PREORDER = $_[3]; #
my $PROVISIONAL_FEAT = $_[4]; #
my $FEATURES = $_[5]; #
my $confirmed_feat; #
my @confirmed_feat; #
my $daughter; #
my $daughter_name; #
my $provisional_feat; #
my @provisional_feat; #
my %CONFIRMED_IN_NODE = (); #
my $i; #
my $j; #
my $daughter_confirmed_feat; #
my %SEEN = (); #
my $daughter_seqname; #
if ($VERBOSE_PREORDER == 1) { print "___ adding confirmed feats for duplication node $node_name to its chidren's confirmed lists\n";}
# FIND THE CONFIRMED FEATURESs OF $node:
if ($CONFIRMED_FEAT->{$node_name})
{
$confirmed_feat = $CONFIRMED_FEAT->{$node_name};
@confirmed_feat = split(/\,/,$confirmed_feat);
for ($i = 0; $i <= $#confirmed_feat; $i++) { $CONFIRMED_IN_NODE{$confirmed_feat[$i]} = 1;}
}
else # $node DOES NOT HAVE ANY CONFIRMED FEATURES.
{
return;
}
# FIND THE CHILDREN OF NODE $node:
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $node:
foreach $daughter ($node->children)
{
$daughter_confirmed_feat = "";
%SEEN = ();
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); }
else { $daughter_name = $daughter->internalID;}
# FIND THE PROVISIONAL FEATURES FOR $daughter_name:
if ($PROVISIONAL_FEAT->{$daughter_name})
{
$provisional_feat = $PROVISIONAL_FEAT->{$daughter_name};
@provisional_feat = split(/\,/,$provisional_feat);
for ($j = 0; $j <= $#provisional_feat; $j++)
{
if ($CONFIRMED_IN_NODE{$provisional_feat[$j]}) # IF THIS FEAT IS CONFIRMED IN $node.
{
if (!($SEEN{$provisional_feat[$j]}))
{
$SEEN{$provisional_feat[$j]} = 1;
$daughter_confirmed_feat = $daughter_confirmed_feat.",".$provisional_feat[$j];
}
}
}
}
else # THERE IS NO PROVISIONAL FEATURES FOR $daughter_name. $daughter_name MAY BE A LEAF.
{
if ($daughter->is_leaf)
{
$daughter_seqname = $daughter->get_tag_value('O');
if ($FEATURES->{$daughter_seqname})
{
$provisional_feat = $FEATURES->{$daughter_seqname};
@provisional_feat = split(/\,/,$provisional_feat);
for ($j = 0; $j <= $#provisional_feat; $j++)
{
if ($CONFIRMED_IN_NODE{$provisional_feat[$j]}) # IF THIS FEATURE IS CONFIRMED IN $node.
{
if (!($SEEN{$provisional_feat[$j]}))
{
$SEEN{$provisional_feat[$j]} = 1;
$daughter_confirmed_feat = $daughter_confirmed_feat.",".$provisional_feat[$j];
}
}
}
}
}
}
if ($daughter_confirmed_feat ne "")
{
$daughter_confirmed_feat = substr($daughter_confirmed_feat,1,length($daughter_confirmed_feat)-1);
if ($VERBOSE_PREORDER == 1) { print "___ adding $daughter_confirmed_feat to daughter $daughter_name of $node_name\n";}
if (!($CONFIRMED_FEAT->{$daughter_name})) { $CONFIRMED_FEAT->{$daughter_name} = $daughter_confirmed_feat;}
else { print STDERR "ERROR: daughter_name $daughter_name should not have a confirmed feat already.\n"; exit;}
}
}
}
}
#------------------------------------------------------------------#
# IF $node IS A SPECIATION NODE, ADD ALL ITS CONFIRMED TERMS TO THE
# CONFIRMED LISTS OF BOTH ITS CHILDREN:
sub add_confirmed_terms_to_children
{
my $node = $_[0];
my $node_name = $_[1];
my $CONFIRMED_FEAT = $_[2];
my $VERBOSE_PREORDER = $_[3];
my $daughter;
my $daughter_name;
my $confirmed_feat;
if ($VERBOSE_PREORDER == 1) { print "___ adding confirmed feats for speciation node $node_name to its chidren's confirmed lists\n";}
# FIND THE CONFIRMED FEATURES OF $node:
if ($CONFIRMED_FEAT->{$node_name})
{
$confirmed_feat = $CONFIRMED_FEAT->{$node_name};
}
else # $node DOES NOT HAVE ANY CONFIRMED FEATURES.
{
return;
}
# FIND THE CHILDREN OF NODE $node:
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $node:
foreach $daughter ($node->children)
{
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); }
else { $daughter_name = $daughter->internalID;}
if ($VERBOSE_PREORDER == 1) { print "___ adding $confirmed_feat to daughter $daughter_name of $node_name\n";}
if (!($CONFIRMED_FEAT->{$daughter_name})) { $CONFIRMED_FEAT->{$daughter_name} = $confirmed_feat;}
else { print STDERR "ERROR: daughter_name $daughter_name should not have a confirmed feat already.\n"; exit;}
}
}
}
#------------------------------------------------------------------#
# RECORD THE FEATURES FOR NODE $node. IN THE PRE-ORDER TRAVERSAL,
# WE CHOOSE A CHARACTER FOR NODE $node. WE TRY TO PICK THE SAME RESIDUE
# FOR THE DAUGHTER NODES IF POSSIBLE.
sub assign_confirmed_feats_rd2
{
my $PROVISIONAL_FEAT = $_[0]; #>
my $node = $_[1]; #>
my $node_name = $_[2]; #>
my $CONFIRMED_FEAT = $_[3]; #>
my $VERBOSE_PREORDER = $_[4]; #>
my $FEATURES = $_[5]; #>
my $myfeats = $_[6]; #>
my $root_name = $_[7]; #>
my $parent; #>
my $parent_name; #>
my $i; #>
my $myfeat; #>
my $key; #>
my $provisional; #>
my $confirmed; #>
my $rand; #>
my $node_seqname; #>
my $parent_key; #>
my $parent_confirmed; #>
my $confirmed_feat = ""; #>
if ($VERBOSE_PREORDER == 1) { print "___ finding confirmed feats for node $node_name\n";}
# IF THIS IS NOT THE ROOT:
if ($node_name ne $root_name)
{
# FIND THE PARENT OF NODE $node:
if (defined($node->parent)) { $parent = $node->parent; }
else { print STDERR "ERROR: do not know parent of node $node_name.\n"; exit;}
$parent_name = $parent->internalID;
# FIND THE PROVISIONAL FEATURES FOR THE NODE $node_name:
for ($i = 0; $i < @$myfeats; $i++)
{
$myfeat = $myfeats->[$i];
$key = $node_name."_".$myfeat;
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
if (!($PROVISIONAL_FEAT->{$key})) { print STDERR "ERROR: do not know provisional feat for $key.\n"; exit;}
$provisional = $PROVISIONAL_FEAT->{$key};
}
else # THIS IS A SEQUENCE.
{
$node_seqname = $node->get_tag_value('O');
$key = $node_seqname."_".$myfeat;
if (!($FEATURES->{$key})) { print STDERR "ERROR: do not know feat for $key.\n"; exit;}
$provisional = $FEATURES->{$key};
}
$parent_key = $parent_name."_".$myfeat;
if (!($CONFIRMED_FEAT->{$parent_key})) { print STDERR "ERROR: do not know confirmed feat for $parent_key.\n"; exit;}
$parent_confirmed = $CONFIRMED_FEAT->{$parent_key};
if ($provisional eq 'yes') { $confirmed = 'yes';}
elsif ($provisional eq 'no') { $confirmed = 'no'; }
elsif ($provisional eq 'yes,no')
{
$confirmed = $parent_confirmed;
}
else { print STDERR "ERROR: provisional $provisional\n"; exit;}
$CONFIRMED_FEAT->{$key} = $confirmed;
$confirmed_feat = $confirmed_feat."_".$confirmed;
}
}
else # THIS IS THE ROOT:
{
# FIND THE PROVISIONAL FEATURES FOR THE ROOT NODE:
for ($i = 0; $i < @$myfeats; $i++)
{
$myfeat = $myfeats->[$i];
$key = $node_name."_".$myfeat;
if (!($PROVISIONAL_FEAT->{$key})) { print STDERR "ERROR: do not know provisional feat for $key.\n"; exit;}
$provisional = $PROVISIONAL_FEAT->{$key};
if ($provisional eq 'yes') { $confirmed = 'yes';}
elsif ($provisional eq 'no') { $confirmed = 'no'; }
elsif ($provisional eq 'yes,no')
{
$rand = int rand 2; # xxx we might want to print out all equally parsimonious trees
if ($rand == 0)
{
$confirmed = 'yes';
}
elsif ($rand == 1)
{
$confirmed = 'no';
}
else { print STDERR "ERROR: rand $rand.\n"; exit;}
print "Node $node_name : taking confirmed as 'no' for $myfeat, as it could be yes or no\n"; #xxx
$confirmed = 'no'; # xxx # FORCE IT TO CHOOSE GAINS OVER LOSSES.
}
else { print STDERR "ERROR: provisional $provisional\n"; exit;}
$CONFIRMED_FEAT->{$key} = $confirmed;
$confirmed_feat = $confirmed_feat."_".$confirmed;
}
}
if ($confirmed_feat eq '') { print STDERR "ERROR: confirmed_feat $confirmed_feat\n"; exit;}
$confirmed_feat = substr($confirmed_feat,1,length($confirmed_feat)-1);
if ($VERBOSE_PREORDER == 1) { print "___ node $node_name has confirmed feat $confirmed_feat\n";}
}
#------------------------------------------------------------------#
# RECORD THE FEATURES FOR NODE $node. IN THE PRE-ORDER TRAVERSAL,
# WE ADD TO THE CONFIRMED TERM LIST OF A NODE THOSE PROVISIONAL
# TERMS PRESENT IN BOTH CHILDREN OF THE CURRENT NODE:
sub assign_confirmed_feats_rd1
{
my $PROVISIONAL_FEAT = $_[0]; #
my $node = $_[1]; #
my $node_name = $_[2]; #
my $CONFIRMED_FEAT = $_[3]; #
my $VERBOSE_PREORDER = $_[4]; #
my $FEATURES = $_[5]; #
my $daughter; #
my $daughter_name; #
my $daughter_seqname; #
my $daughter_feats; #
my @daughter_feats; #
my $i; #
my $no_daughters = 0; #
my %DAUGHTER_FEAT = (); #
my %SEEN_FEAT = (); #
my @all_feats; #
my $feat; #
my $confirmed_feat = ""; #
my $shared_feat; #
my $j; #
if ($VERBOSE_PREORDER == 1) { print "___ finding confirmed feats for node $node_name\n";}
# FIND THE CHILDREN OF NODE $node:
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $node:
foreach $daughter ($node->children)
{
$no_daughters++;
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); }
else { $daughter_name = $daughter->internalID;}
if ($VERBOSE_PREORDER == 1) { print "___ looking at daughter $daughter_name of parent $node_name\n";}
# FIND THE FEATURES OF NODE $daughter:
if ($daughter->is_leaf)
{
$daughter_seqname = $daughter->get_tag_value('O');
if ($FEATURES->{$daughter_seqname})
{
$daughter_feats = $FEATURES->{$daughter_seqname};
if ($VERBOSE_PREORDER == 1) { print "___ daughter $daughter_name ($daughter_seqname) has feats $daughter_feats\n";}
@daughter_feats = split(/\,/,$daughter_feats);
for ($i = 0; $i <= $#daughter_feats; $i++)
{
$DAUGHTER_FEAT{$no_daughters."_".$daughter_feats[$i]} = 1;
if (!($SEEN_FEAT{$daughter_feats[$i]}))
{
$SEEN_FEAT{$daughter_feats[$i]} = 1;
@all_feats = (@all_feats,$daughter_feats[$i]);
}
}
}
}
else
{
if ($PROVISIONAL_FEAT->{$daughter_name})
{
$daughter_feats = $PROVISIONAL_FEAT->{$daughter_name};
if ($VERBOSE_PREORDER == 1) { print "___ daughter $daughter_name has feats $daughter_feats\n";}
@daughter_feats = split(/\,/,$daughter_feats);
for ($i = 0; $i <= $#daughter_feats; $i++)
{
$DAUGHTER_FEAT{$no_daughters."_".$daughter_feats[$i]} = 1;
if (!($SEEN_FEAT{$daughter_feats[$i]}))
{
$SEEN_FEAT{$daughter_feats[$i]} = 1;
@all_feats = (@all_feats,$daughter_feats[$i]);
}
}
}
}
}
}
# FIND THE FEATURES THAT ARE SHARED BY ALL THE DAUGHTERS OF NODE $node:
$confirmed_feat = "";
for ($i = 0; $i <= $#all_feats; $i++)
{
$feat = $all_feats[$i];
$shared_feat = 1;
for ($j = 1; $j <= $no_daughters; $j++)
{
if (!($DAUGHTER_FEAT{$j."_".$feat})) { $shared_feat = 0;}
}
if ($shared_feat == 1)
{
$confirmed_feat = $confirmed_feat.",".$feat;
}
}
if ($confirmed_feat ne "")
{
$confirmed_feat = substr($confirmed_feat,1,length($confirmed_feat)-1);
if ($VERBOSE_PREORDER == 1) { print "___ node $node_name has confirmed feat $confirmed_feat\n";}
$CONFIRMED_FEAT->{$node_name} = $confirmed_feat;
}
}
#------------------------------------------------------------------#
# GET THE PROVISIONAL FEATURES IN A NODE:
sub get_daughter_feats
{
my $daughter_seqname = $_[0]; #>
my $PROVISIONAL_FEAT = $_[1]; #>
my $myfeats = $_[2]; #>
my $daughter_feats = ""; #>
my $i; #>
my $key; #>
my $feats; #>
my $success = 1; #>
for ($i = 0; $i < @$myfeats; $i++)
{
$key = $daughter_seqname."_".$myfeats->[$i];
if (!($PROVISIONAL_FEAT->{$key}))
{
$success = 0;
# WE DO NOT HAVE THE PROVISIONAL FEATURES FOR THIS NODE YET:
return($success,$daughter_feats);
}
$feats = $PROVISIONAL_FEAT->{$key}; # yes OR no OR yes,no
if ($feats ne 'yes' && $feats ne 'no' && $feats ne 'yes,no') { print "ERROR: feats $feats.\n"; exit;}
$daughter_feats = $daughter_feats."_".$feats;
}
$daughter_feats = substr($daughter_feats,1,length($daughter_feats)-1);
return($success,$daughter_feats);
}
#------------------------------------------------------------------#
sub get_intersection
{
my $myfeats = $_[0]; #>
my $i; #>
my $j; #>
my $myfeat; #>
my $intersection = ""; #>
my @temp; #>
my $no_feats = 0; #>
my %CNT = (); #>
for ($i = 0; $i < @$myfeats; $i++)
{
$no_feats++;
$myfeat = $myfeats->[$i];
@temp = split(/\,/,$myfeat);
for ($j = 0; $j <= $#temp; $j++)
{
if (!($CNT{$temp[$j]})) { $CNT{$temp[$j]} = 1;}
else { $CNT{$temp[$j]}++; }
}
}
foreach $myfeat (keys %CNT)
{
if ($CNT{$myfeat} == $no_feats)
{
$intersection = $intersection.",".$myfeat;
}
}
if ($intersection ne "")
{
$intersection = substr($intersection,1,length($intersection)-1);
if ($intersection eq 'no,yes') { $intersection = "yes,no";}
if ($intersection ne "no" && $intersection ne "yes" && $intersection ne "yes,no")
{
print "ERROR: intersection $intersection\n";
exit;
}
}
return $intersection;
}
#------------------------------------------------------------------#
sub get_union
{
my $myfeats = $_[0];
my $i;
my $j;
my $myfeat;
my $union = "";
my @temp;
my %SEEN = ();
for ($i = 0; $i < @$myfeats; $i++)
{
$myfeat = $myfeats->[$i];
@temp = split(/\,/,$myfeat);
for ($j = 0; $j <= $#temp; $j++)
{
if (!($SEEN{$temp[$j]}))
{
$SEEN{$temp[$j]} = 1;
$union = $union.",".$temp[$j];
}
}
}
if ($union eq "") { print "ERROR: union is $union !"; exit;}
$union = substr($union,1,length($union)-1);
if ($union eq 'no,yes') { $union = "yes,no";}
if ($union ne "no" && $union ne "yes" && $union ne "yes,no")
{
print "ERROR: union $union\n";
exit;
}
return $union;
}
#------------------------------------------------------------------#
# GET THE PROVISIONAL FEATURES FOR A PARENT NODE $parent_name:
sub get_provisional_feat
{
my $provisional_feats = $_[0]; #> THE PROVISIONAL FEATURES FOR THE DAUGHTER NODES.
my $parent_name = $_[1]; #>
my $myfeats = $_[2]; #>
my $PROVISIONAL_FEAT = $_[3]; #>
my $VERBOSE_POSTORDER = $_[4]; #>
my $i; #>
my $provisional_feat; #>
my @provisional_feat; #>
my $j; #>
my @daughter_feats; #>
my $daughter_feat; #>
my $intersection = ""; #>
my $union = ""; #>
my $parent_provisional_feat = ""; #>
my $myfeat; #>
my $key; #>
my $cost = 0; #>
for ($j = 0; $j < @$myfeats; $j++)
{
$myfeat = $myfeats->[$j];
@daughter_feats = &Avril_modules::empty_array(@daughter_feats);
$intersection = "";
$union = "";
# GET THE PROVISIONAL FEATURES FOR $parent_name FOR $myfeats->[$j]:
for ($i = 0; $i < @$provisional_feats; $i++)
{
# GET THE PROVISIONAL FEATURES FOR DAUGHTER $i FOR $j:
$provisional_feat = $provisional_feats->[$i]; # eg yes_yes_no_yes,no_no
@provisional_feat = split(/\_/,$provisional_feat);
$daughter_feat = $provisional_feat[$j];
@daughter_feats = (@daughter_feats,$daughter_feat);
}
# SEE IF THE DAUGHTER NODES SHARE ANY FEATURES:
$union = &get_union(\@daughter_feats);
$intersection = &get_intersection(\@daughter_feats);
# IF THE INTESECTION {S} OF THE DAUGHTER NODES' FEATURES IS NON-EMPTY,
# THEN THE PARENT'S PROVISIONAL FEATURE SET IS SET EQUAL TO THAT INTERSECTION {S}.
if ($intersection ne "")
{
$key = $parent_name."_".$myfeat;
$PROVISIONAL_FEAT->{$key} = $intersection;
$parent_provisional_feat = $parent_provisional_feat."_".$intersection;
}
# IF THE INTERSECTION S OF THE DAUGHTER NODES' FEATURES IS EMPTY,
# THEN SET THE PARENT'S PROVISIONAL FEATURE SET TO BE EQUAL TO THE UNION
# OF THE DAUGHTER NODES' FEATURES:
else
{
$key = $parent_name."_".$myfeat;
$PROVISIONAL_FEAT->{$key} = $union;
$parent_provisional_feat = $parent_provisional_feat."_".$union;
# INCREMENT COST BY 1:
$cost++;
}
}
$parent_provisional_feat = substr($parent_provisional_feat,1,length($parent_provisional_feat)-1);
if ($VERBOSE_POSTORDER == 1)
{
print "Parent $parent_name has provisional feat $parent_provisional_feat : cost $cost\n";
}
return($cost);
}
#------------------------------------------------------------------#
# RECORD THE FEATURES FOR PARENT $parent.
# IN THE POST-ORDER TRAVERSAL, WE ASSIGN TO EACH INTERNAL NODE
# A SET OF PROVISIONAL TERMS, SET S:
# (i) IF THE INTERSECTION OF THE TERMS ASSIGNED TO ITS CHILDREN IS NOT EMPTY,
# THEN SET S IS THAT INTERSECTION.
# (ii) IF THE INTERSECTION OF THE TERMS ASSIGNED TO ITS CHILDREN IS EMPTY,
# THEN SET S IS THE UNION OF THE TERMS ASSIGNED TO ITS CHILDREN.
sub assign_provisional_feats_rd2
{
my $FEATURES = $_[0]; #> THE FEATURES.
my $parent = $_[1]; #>
my $parent_name = $_[2]; #>
my $PROVISIONAL_FEAT = $_[3]; #>
my $VERBOSE_POSTORDER = $_[4]; #>
my $myfeats = $_[5]; #> ARRAY OF ALL THE FEATURES IN THE TREE.
my $daughter; #>
my $daughter_name; #>
my $daughter_seqname; #>
my $daughter_feats; #>
my @daughter_feats; #>
my $cost = 0; #>
my $success = 1; #>
if ($VERBOSE_POSTORDER == 1) { print "___ finding provisional feats for parent $parent_name\n";}
# FIND THE CHILDREN OF NODE $parent:
if (!($parent->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $parent:
foreach $daughter ($parent->children)
{
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); }
else { $daughter_name = $daughter->internalID;}
if ($VERBOSE_POSTORDER == 1) { print "___ looking at daughter $daughter_name of parent $parent_name\n";}
# FIND THE FEATURES OF NODE $daughter:
if ($daughter->is_leaf)
{
$daughter_seqname = $daughter->get_tag_value('O');
($success,$daughter_feats) = &get_daughter_feats($daughter_seqname,$FEATURES,$myfeats);
if ($success == 0) { print STDERR "ERROR: could not get feats for $daughter_seqname\n"; exit;}
@daughter_feats = (@daughter_feats,$daughter_feats);
if ($VERBOSE_POSTORDER == 1) { print "___ daughter $daughter_name ($daughter_seqname) has feats $daughter_feats\n";}
}
else # THE DAUGHTER $daughter IS AN INTERNAL NODE:
{
($success,$daughter_feats) = &get_daughter_feats($daughter_name,$PROVISIONAL_FEAT,$myfeats);
if ($success == 0)
{
if ($VERBOSE_POSTORDER == 1) { print "Do not know provisional feats for $daughter_name yet ...\n";}
return($cost,$success);
}
@daughter_feats = (@daughter_feats,$daughter_feats);
if ($VERBOSE_POSTORDER == 1) { print "___ daughter $daughter_name (internal) has feats $daughter_feats\n";}
}
}
}
else { print STDERR "ERROR: parent $parent_name is a sequence.\n"; exit;}
# GET THE PROVISIONAL FEATURES FOR $parent_name:
$cost = &get_provisional_feat(\@daughter_feats,$parent_name,$myfeats,$PROVISIONAL_FEAT,
$VERBOSE_POSTORDER);
return($cost,$success);
}
#------------------------------------------------------------------#
# RECORD THE FEATURES FOR PARENT $parent. IN THE POST-ORDER
# TRAVERSAL, WE ASSIGN TO EACH INTERNAL NODE A SET OF PROVISIONAL
# TERMS THAT IS THE UNION OF THE TERMS ASSIGNED TO ITS CHILDREN:
sub assign_provisional_feats_rd1
{
my $FEATURES = $_[0]; # THE FEATURES.
my $parent = $_[1]; #
my $parent_name = $_[2]; #
my $PROVISIONAL_FEAT = $_[3]; #
my $VERBOSE_POSTORDER = $_[4]; #
my $daughter; #
my $daughter_name; #
my $daughter_seqname; #
my $daughter_feats; #
my @daughter_feats; #
my $provisional_feat = ""; #
my $i; #
my %SEEN = (); #
if ($VERBOSE_POSTORDER == 1) { print "___ finding provisional feats for parent $parent_name\n";}
# FIND THE CHILDREN OF NODE $parent:
if (!($parent->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $parent:
foreach $daughter ($parent->children)
{
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); }
else { $daughter_name = $daughter->internalID;}
if ($VERBOSE_POSTORDER == 1) { print "___ looking at daughter $daughter_name of parent $parent_name\n";}
# FIND THE FEATURES OF NODE $daughter:
if ($daughter->is_leaf)
{
$daughter_seqname = $daughter->get_tag_value('O');
if ($FEATURES->{$daughter_seqname})
{
$daughter_feats = $FEATURES->{$daughter_seqname};
if ($VERBOSE_POSTORDER == 1) { print "___ daughter $daughter_name ($daughter_seqname) has feats $daughter_feats\n";}
@daughter_feats = split(/\,/,$daughter_feats);
for ($i = 0; $i <= $#daughter_feats; $i++)
{
if (!($SEEN{$daughter_feats[$i]}))
{
$SEEN{$daughter_feats[$i]} = 1;
$provisional_feat = $provisional_feat.",".$daughter_feats[$i];
}
}
}
}
else
{
if ($PROVISIONAL_FEAT->{$daughter_name})
{
$daughter_feats = $PROVISIONAL_FEAT->{$daughter_name};
if ($VERBOSE_POSTORDER == 1) { print "___ daughter $daughter_name has feats $daughter_feats\n";}
@daughter_feats = split(/\,/,$daughter_feats);
for ($i = 0; $i <= $#daughter_feats; $i++)
{
if (!($SEEN{$daughter_feats[$i]}))
{
$SEEN{$daughter_feats[$i]} = 1;
$provisional_feat = $provisional_feat.",".$daughter_feats[$i];
}
}
}
}
}
}
else { print STDERR "ERROR: parent $parent_name is a sequence.\n"; exit;}
if ($provisional_feat ne "")
{
$provisional_feat = substr($provisional_feat,1,length($provisional_feat)-1);
if ($VERBOSE_POSTORDER == 1) { print "___ parent $parent_name has provisional feat $provisional_feat\n";}
$PROVISIONAL_FEAT->{$parent_name} = $provisional_feat;
}
}
#------------------------------------------------------------------#
# DO A POSTORDER TRAVERSAL OF THE TREE:
sub do_postorder_traversal_rd2
{
my $tree = $_[0]; #> THE TREE OBJECT.
my $FEATURES = $_[1]; #> THE FEATURES.
my $myfeats = $_[2]; #> ARRAY OF ALL THE FEATURES IN THE WHOLE FAMILY.
my $root; #>
my $root_name; #>
my @nodes; #>
my @parents; #>
my $no_rounds; #>
my $no_new_nodes; #>
my %SEEN_NODE = (); #>
my %SEEN_PARENT = (); #>
my $node; #>
my $node_name; #>
my $VERBOSE_POSTORDER = 0; #>
my $parent; #>
my $parent_name; #>
my $daughter; #>
my $daughter_name; #>
my $i; #>
my %PROVISIONAL_FEAT = (); #>
my $total_cost = 0; #>
my $cost = 0; #>
my $success = 0; #>
# FIND THE ROOT OF THE TREE:
$root = $tree->root;
$root_name = $root->internalID;
# LIST ALL THE LEAVES FROM THE TREE:
@nodes = &Avril_modules::empty_array(@nodes);
@parents = &Avril_modules::empty_array(@parents);
@nodes = $tree->get_leaves;
$no_rounds = 0;
NEXT_ROUND1_rd2:
$no_rounds++;
$no_new_nodes = 0;
%SEEN_NODE = ();
%SEEN_PARENT = ();
# LOOP THROUGH EACH NODE IN THE ARRAY @nodes. WHEN $no_rounds=1,
# THIS ARRAY CONTAINS THE LEAVES. WHEN $no_rounds=2, THIS ARRAY
# CONTAINS THEIR PARENTS. WHEN $no_rounds=3, THIS ARRAY CONTAINS
# THEIR GRANDPARENTS, AND SO ON.
foreach $node(@nodes)
{
if ($node->is_leaf) { $node_name = $node->id(); }
else { $node_name = $node->internalID(); }
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited node $node_name\n";}
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;}
# FIND THE PARENT OF NODE $node:
if (defined($node->parent)) { $parent = $node->parent; }
else { print STDERR "ERROR: do not know parent of node $node_name.\n"; exit;}
$parent_name = $parent->internalID;
# FIND THE OTHER CHILD OF NODE $parent:
if (!($parent->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $parent:
foreach $daughter ($parent->children)
{
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); }
else { $daughter_name = $daughter->internalID;}
if ($daughter_name ne $node_name)
{
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited sibling $daughter_name\n";}
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;}
}
}
}
else { print STDERR "ERROR: parent $parent_name is a sequence.\n"; exit;}
# RECORD THAT WE VISITED THE PARENT $parent:
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: saw parent $parent_name\n";}
if (!($SEEN_PARENT{$parent_name}) && $parent_name ne $root_name)
{
# RECORD THE FEATURES FOR PARENT $parent.
# IN THE POST-ORDER TRAVERSAL, WE ASSIGN TO EACH INTERNAL NODE
# A SET OF PROVISIONAL TERMS, SET S:
# (i) IF THE INTERSECTION OF THE TERMS ASSIGNED TO ITS CHILDREN IS NOT EMPTY,
# THEN SET S IS THAT INTERSECTION.
# (ii) IF THE INTERSECTION OF THE TERMS ASSIGNED TO ITS CHILDREN IS EMPTY,
# THEN SET S IS THE UNION OF THE TERMS ASSIGNED TO ITS CHILDREN.
($cost,$success) = &assign_provisional_feats_rd2($FEATURES,$parent,$parent_name,\%PROVISIONAL_FEAT,
$VERBOSE_POSTORDER,$myfeats);
if ($success == 1)
{
@parents = (@parents,$parent);
$SEEN_PARENT{$parent_name} = 1;
$total_cost = $total_cost + $cost;
}
else # xxx need to fix for rd1 alg
{
@parents = (@parents,$node);
$SEEN_PARENT{$node_name} = 1;
}
}
}
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds rounds: number new = $no_new_nodes\n";}
# IF WE DID SEE SOME NEW NODES ON THIS TRAVERSAL:
if ($no_new_nodes > 0)
{
@nodes = &Avril_modules::empty_array(@nodes);
for ($i = 0; $i <= $#parents; $i++) { @nodes = (@nodes,$parents[$i]);}
@parents = &Avril_modules::empty_array(@parents);
goto NEXT_ROUND1_rd2;
}
# VISIT ROOT:
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited root $root_name\n";}
($cost,$success) = &assign_provisional_feats_rd2($FEATURES,$root,$root_name,\%PROVISIONAL_FEAT,
$VERBOSE_POSTORDER,$myfeats);
if ($success == 0) { print STDERR "ERROR: success $success at root node.\n"; exit;}
$total_cost = $total_cost + $cost;
if ($VERBOSE_POSTORDER == 1) { print "Finished post-order traversal, total cost $total_cost\n";}
return(\%PROVISIONAL_FEAT);
}
#------------------------------------------------------------------#
# DO A POSTORDER TRAVERSAL OF THE TREE:
sub do_postorder_traversal_rd1
{
my $tree = $_[0]; # THE TREE OBJECT.
my $FEATURES = $_[1]; # THE FEATURES.
my $root; #
my $root_name; #
my @nodes; #
my @parents; #
my $no_rounds; #
my $no_new_nodes; #
my %SEEN_NODE = (); #
my %SEEN_PARENT = (); #
my $node; #
my $node_name; #
my $VERBOSE_POSTORDER = 0; #
my $parent; #
my $parent_name; #
my $daughter; #
my $daughter_name; #
my $i; #
my %PROVISIONAL_FEAT = (); #
# FIND THE ROOT OF THE TREE:
$root = $tree->root;
$root_name = $root->internalID;
# LIST ALL THE LEAVES FROM THE TREE:
@nodes = &Avril_modules::empty_array(@nodes);
@parents = &Avril_modules::empty_array(@parents);
@nodes = $tree->get_leaves;
$no_rounds = 0;
NEXT_ROUND1_rd1:
$no_rounds++;
$no_new_nodes = 0;
%SEEN_NODE = ();
%SEEN_PARENT = ();
# LOOP THROUGH EACH NODE IN THE ARRAY @nodes. WHEN $no_rounds=1,
# THIS ARRAY CONTAINS THE LEAVES. WHEN $no_rounds=2, THIS ARRAY
# CONTAINS THEIR PARENTS. WHEN $no_rounds=3, THIS ARRAY CONTAINS
# THEIR GRANDPARENTS, AND SO ON.
foreach $node(@nodes)
{
if ($node->is_leaf) { $node_name = $node->id(); }
else { $node_name = $node->internalID(); }
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited node $node_name\n";}
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;}
# FIND THE PARENT OF NODE $node:
if (defined($node->parent)) { $parent = $node->parent; }
else { print STDERR "ERROR: do not know parent of node $node_name.\n"; exit;}
$parent_name = $parent->internalID;
# FIND THE OTHER CHILD OF NODE $parent:
if (!($parent->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $parent:
foreach $daughter ($parent->children)
{
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); }
else { $daughter_name = $daughter->internalID;}
if ($daughter_name ne $node_name)
{
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited sibling $daughter_name\n";}
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;}
}
}
}
else { print STDERR "ERROR: parent $parent_name is a sequence.\n"; exit;}
# RECORD THAT WE VISITED THE PARENT $parent:
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: saw parent $parent_name\n";}
if (!($SEEN_PARENT{$parent_name}) && $parent_name ne $root_name)
{
@parents = (@parents,$parent);
$SEEN_PARENT{$parent_name} = 1;
# RECORD THE FEATURES FOR PARENT $parent. IN THE POST-ORDER
# TRAVERSAL, WE ASSIGN TO EACH INTERNAL NODE A SET OF PROVISIONAL
# TERMS THAT IS THE UNION OF THE TERMS ASSIGNED TO ITS CHILDREN:
&assign_provisional_feats_rd1($FEATURES,$parent,$parent_name,\%PROVISIONAL_FEAT,$VERBOSE_POSTORDER);
}
}
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds rounds: number new = $no_new_nodes\n";}
# IF WE DID SEE SOME NEW NODES ON THIS TRAVERSAL:
if ($no_new_nodes > 0)
{
@nodes = &Avril_modules::empty_array(@nodes);
for ($i = 0; $i <= $#parents; $i++) { @nodes = (@nodes,$parents[$i]);}
@parents = &Avril_modules::empty_array(@parents);
goto NEXT_ROUND1_rd1;
}
# VISIT ROOT:
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited root $root_name\n";}
&assign_provisional_feats_rd1($FEATURES,$root,$root_name,\%PROVISIONAL_FEAT,$VERBOSE_POSTORDER);
if ($VERBOSE_POSTORDER == 1) { print "Finished post-order traversal\n";}
return(\%PROVISIONAL_FEAT);
}
#------------------------------------------------------------------#
# GET THE DESCENDANTS OF A NODE:
sub get_descendants
{
my $this_node = $_[0]; #
my $SPECIES_WITH_DATA = $_[1]; #
my @nodes; #
my @children; #
my $no_rounds; #
my $no_new_nodes; #
my %SEEN_NODE = (); #
my %SEEN_CHILD = (); #
my $node; #
my $node_name; #
my $child; #
my $child_name; #
my $i; #
my $descendants = ""; #
my $no_descendants = 0; #
# ARRAY @nodes STARTS BY CONTAINING JUST THE NODE $this_node:
@nodes = &Avril_modules::empty_array(@nodes);
@children = &Avril_modules::empty_array(@children);
@nodes = (@nodes,$this_node);
$no_rounds = 0;
GET_MORE_DESCENDANTS:
$no_rounds++;
$no_new_nodes = 0;
%SEEN_NODE = ();
%SEEN_CHILD = ();
# LOOP THROUGH EACH NODE IN THE ARRAY @nodes. WHEN $no_rounds=1,
# THIS ARRAY CONTAINS THE ROOT. WHEN $no_rounds=2, THIS ARRAY
# CONTAINS ITS CHILDREN. WHEN $no_rounds=3, THIS ARRAY CONTAINS
# ITS GRANDCHILDREN, AND SO ON.
foreach $node(@nodes)
{
if ($node->is_leaf) { $node_name = $node->id(); }
else { $node_name = $node->internalID(); }
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;}
# FIND THE CHILDREN OF NODE $node:
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $node:
foreach $child ($node->children)
{
if ($child->is_leaf) { $child_name = $child->id(); }
else { $child_name = $child->internalID;}
# RECORD THAT WE SAW THE CHILD $child:
if (!($SEEN_CHILD{$child_name}))
{
@children = (@children,$child);
$SEEN_CHILD{$child_name} = 1;
}
}
}
else
{
$node_seqname = $node->get_tag_value('O');
# ONLY TAKE THIS SEQUENCE IF IT IS FROM A SPECIES WITH GO/PFAM/OTHER FEATURE ANNOTATION DATA:
if ($SPECIES_WITH_DATA->{$node_seqname})
{
$descendants = $descendants.",".$node_seqname;
$no_descendants++;
}
}
# ELSE, THIS NODE IS A SEQUENCE, SO DOES NOT HAVE ANY CHILDREN.
}
# IF WE DID SEE SOME NEW NODES ON THIS TRAVERSAL:
if ($no_new_nodes > 0)
{
@nodes = &Avril_modules::empty_array(@nodes);
for ($i = 0; $i <= $#children; $i++) { @nodes = (@nodes,$children[$i]);}
@children = &Avril_modules::empty_array(@children);
goto GET_MORE_DESCENDANTS;
}
if ($descendants ne '')
{
$descendants = substr($descendants,1,length($descendants)-1);
}
return($descendants,$no_descendants);
}
#------------------------------------------------------------------#
# CHECK WHETHER THE CONFIRMED FEATURES FOR NODE $node ARE
# DIFFERENT FROM THOSE OF ITS CHILDREN.
sub compare_parent_to_children_rd2
{
my $CONFIRMED_FEAT = $_[0]; #>
my $node = $_[1]; #>
my $node_name = $_[2]; #>
my $VERBOSE_CHANGES = $_[3]; #>
my $myfeats = $_[4]; #>
my $SPECIES_WITH_DATA = $_[5]; #>
my $daughter; #>
my $daughter_name; #>
my $daughter_seqname; #>
my $daughter_key; #>
my $j; #>
my $confirmed1; #>
my $confirmed2; #>
my $myfeat; #>
my $key; #>
my $leaf = 0; #>
my $descendants; #>
my $no_descendants; #>
my $node_type; #>
# COMPARE $node TO EACH OF ITS CHILDREN FOR EACH FEATURE:
if ($VERBOSE_CHANGES == 1) { print "___ comparing parent $node_name to its children\n";}
# CHECK IF $node IS A SPECIATION NODE OR A DUPLICATION NODE:
# $node_type HAS VALUE Y FOR DUPLICATION, N FOR SPECIATION.
$node_type = $node->get_tag_value('D');
if ($node_type eq 'Y') { $node_type = "duplication"; }
elsif ($node_type eq 'N') { $node_type = "speciation"; }
else { print STDERR "ERROR: node_type $node_type.\n"; exit;}
for ($j = 0; $j < @$myfeats; $j++)
{
$myfeat = $myfeats->[$j];
$key = $node_name."_".$myfeat;
if (!($CONFIRMED_FEAT->{$key})) { print STDERR "ERROR: do not know confirmed feat for $key\n"; exit;}
$confirmed1 = $CONFIRMED_FEAT->{$key};
# FIND THE TWO DAUGHTER NODES OF $node:
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $node:
foreach $daughter ($node->children)
{
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); }
else { $daughter_name = $daughter->internalID;}
# FIND THE CONFIRMED FEATURES OF NODE $daughter:
if ($daughter->is_leaf)
{
$leaf = 1;
$daughter_seqname = $daughter->get_tag_value('O');
$daughter_key = $daughter_seqname."_".$myfeat;
}
else
{
$leaf = 0;
$daughter_key = $daughter_name."_".$myfeat;
}
if (!($CONFIRMED_FEAT->{$daughter_key})) { print STDERR "ERROR: do not know confirmed feat for $daughter_key\n"; exit;}
$confirmed2 = $CONFIRMED_FEAT->{$daughter_key};
if ($confirmed1 ne $confirmed2)
{
if ($confirmed1 eq 'yes' && $confirmed2 eq 'no') # APPARENT FEATURE LOSS.
{
($descendants,$no_descendants) = &get_descendants($daughter,$SPECIES_WITH_DATA);
if ($leaf == 1)
{
print "*** FEATURE_LOSS_LEAF: $node_type node $node_name has feature $myfeat, that daughter $daughter_name does not have\n";
print " FEATURE_LOSS_LEAF: $daughter_name has descendants $descendants\n\n";
}
else
{
if ($no_descendants > 1)
{
print "*** FEATURE_LOSS_INTERNAL: $node_type node $node_name has feature $myfeat, that daughter $daughter_name does not have\n";
print " FEATURE_LOSS_INTERNAL: $daughter_name has descendants $descendants\n\n";
}
else
{
print "*** FEATURE_LOSS_LEAF: $node_type node $node_name has feature $myfeat, that daughter $daughter_name does not have\n";
print " FEATURE_LOSS_LEAF: $daughter_name has descendants $descendants\n\n";
}
}
}
elsif ($confirmed1 eq 'no' && $confirmed2 eq 'yes') # APPARENT FEATURE GAIN.
{
($descendants,$no_descendants) = &get_descendants($daughter,$SPECIES_WITH_DATA);
if ($leaf == 1)
{
print "*** FEATURE_GAIN_LEAF: $node_type node $node_name doesn't have feature $myfeat, but daughter $daughter_name does have it\n";
print "*** FEATURE_GAIN_LEAF: $daughter_name has descendants $descendants\n\n";
}
else
{
if ($no_descendants > 1)
{
print "*** FEATURE_GAIN_INTERNAL: $node_type node $node_name doesn't have feature $myfeat, but daughter $daughter_name does have it\n";
print " FEATURE_GAIN_INTERNAL: $daughter_name has descendants $descendants\n\n";
}
else
{
print "*** FEATURE_GAIN_LEAF: $node_type node $node_name doesn't have feature $myfeat, but daughter $daughter_name does have it\n";
print " FEATURE_GAIN_LEAF: $daughter_name has descendants $descendants\n\n";
}
}
}
}
}
}
else
{
print STDERR "ERROR: parent $node_name is a sequence\n"; exit;
}
}
}
#------------------------------------------------------------------#
# CHECK WHETHER THE CONFIRMED FEATURES FOR NODE $node ARE
# DIFFERENT FROM THOSE OF ITS CHILDREN.
sub compare_parent_to_children_rd1
{
my $CONFIRMED_FEAT = $_[0]; #
my $node = $_[1]; #
my $node_name = $_[2]; #
my $VERBOSE_CHANGES = $_[3]; #
my $FEATURES = $_[4]; #
my $SPECIES_WITH_DATA = $_[5]; #
my $daughter; #
my $daughter_name; #
my $daughter_seqname; #
my $daughter_feats; #
my $node_feats; #
my @daughter_feats; #
my @node_feats; #
my %NODE_FEAT = (); #
my %DAUGHTER_FEAT = (); #
my $i; #
my $feat; #
my %SEEN = (); #
my $change1; #
my $change2; #
my $descendants; #
my $no_descendants; #
# FIND THE FEATURES FOR $node:
if (!($CONFIRMED_FEAT->{$node_name})) { $node_feats = "none"; }
else { $node_feats = $CONFIRMED_FEAT->{$node_name};}
if ($VERBOSE_CHANGES == 1) { print "___ comparing parent $node_name ($node_feats) to its children\n";}
# FIND THE CHILDREN OF NODE $node:
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $node:
foreach $daughter ($node->children)
{
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); }
else { $daughter_name = $daughter->internalID;}
if ($VERBOSE_CHANGES == 1) { print "___ comparing parent $node_name to daughter $daughter_name\n";}
# FIND THE CONFIRMED FEATURES OF NODE $daughter:
$daughter_feats = "";
if ($daughter->is_leaf)
{
$daughter_seqname = $daughter->get_tag_value('O');
if ($FEATURES->{$daughter_seqname})
{
$daughter_feats = $FEATURES->{$daughter_seqname};
}
else { $daughter_feats = "none";}
if ($VERBOSE_CHANGES == 1) { print "___ daughter $daughter_name ($daughter_seqname) has feats $daughter_feats\n";}
}
else
{
if ($CONFIRMED_FEAT->{$daughter_name}) { $daughter_feats = $CONFIRMED_FEAT->{$daughter_name};}
else { $daughter_feats = "none"; }
if ($VERBOSE_CHANGES == 1) { print "___ daughter $daughter_name has feats $daughter_feats\n";}
}
# CHECK IF $daughter_feats IS DIFFERENT FROM $node_feats:
@node_feats = split(/\,/,$node_feats);
@daughter_feats = split(/\,/,$daughter_feats);
%NODE_FEAT = ();
%DAUGHTER_FEAT = ();
# RECORD ALL THE FEATURES FOR $node:
for ($i = 0; $i <= $#node_feats; $i++)
{
if ($node_feats[$i] ne 'none')
{
$NODE_FEAT{$node_feats[$i]} = 1;
}
}
# RECORD ALL THE FEATURES FOR $daughter:
for ($i = 0; $i <= $#daughter_feats; $i++)
{
if ($daughter_feats[$i] ne 'none')
{
$DAUGHTER_FEAT{$daughter_feats[$i]} = 1;
}
}
# CHECK IF $node HAS ANY FEATURES THAT $daughter DOES NOT HAVE:
foreach $feat (keys %NODE_FEAT)
{
if (!($DAUGHTER_FEAT{$feat}))
{
$change1 = $node_name."_".$daughter_name."_".$feat;
$change2 = $daughter_name."_".$node_name."_".$feat;
if (!($SEEN{$change1}) && !($SEEN{$change2}))
{
$SEEN{$change1} = 1;
$SEEN{$change2} = 1;
# xxx only print changes at internal nodes?
if (!($daughter->is_leaf())) # ONLY PRINT OUT APPARENT LOSSES IF $daughter IS AN INTERNAL NODE,
# SO THE LOSS HAS OCCURRED IN MORE THAN ONE SEQUENCE.
{
($descendants,$no_descendants) = &get_descendants($daughter,$SPECIES_WITH_DATA);
if ($no_descendants > 1)
{
print "*** INTERNAL FEATURE LOSS: node $node_name has feature $feat, that daughter $daughter_name does not have\n";
print " INTERNAL FEATURE LOSS: $daughter_name has descendants $descendants\n\n";
}
}
}
}
}
# CHECK IF $daughter HAS ANY FEATURES THAT $node DOES NOT HAVE:
foreach $feat (keys %DAUGHTER_FEAT)
{
if (!($NODE_FEAT{$feat}))
{
$change1 = $node_name."_".$daughter_name."_".$feat;
$change2 = $daughter_name."_".$node_name."_".$feat;
if (!($SEEN{$change1}) && !($SEEN{$change2}))
{
$SEEN{$change1} = 1;
$SEEN{$change2} = 1;
# xxx only print changes at internal nodes?
if (!($daughter->is_leaf())) # ONLY PRINT OUT APPARENT GAINS IF THE $daughter NODE IS AN INTERNAL NODE,
# SO THE GAIN HAS OCCURRED IN MORE THAN ONE SEQUENCE.
{
($descendants,$no_descendants) = &get_descendants($daughter,$SPECIES_WITH_DATA);
if ($no_descendants > 1)
{
print "*** INTERNAL FEATURE GAIN: daughter $daughter_name has feature $feat, that parent $node_name does not have\n";
print " INTERNAL FEATURE GAIN: $daughter_name has descendants $descendants\n";
}
}
}
}
}
}
}
else { print STDERR "ERROR: parent $node_name is a sequence.\n"; exit;}
}
#------------------------------------------------------------------#
# FIND NODES IN THE TREE WHERE CONFIRMED FEATURES OF A PARENT NODE
# DIFFERS FROM THOSE OF ITS DAUGHTER NODES:
sub find_changes_in_feat_rd2
{
my $tree = $_[0]; #>
my $CONFIRMED_FEAT = $_[1]; #>
my $FEATURES = $_[2]; #>
my $myfeats = $_[3]; #>
my $SPECIES_WITH_DATA = $_[4]; #>
my $family_id = $_[5]; #>
my $root; #>
my $root_name; #>
my @nodes; #>
my @children; #>
my $no_rounds; #>
my $no_new_nodes; #>
my %SEEN_NODE = (); #>
my %SEEN_CHILD = (); #>
my $node; #>
my $node_name; #>
my $VERBOSE_CHANGES = 0; #>
my $child; #>
my $child_name; #>
my $i; #>
my $PRINT_ROOT = 1; #>
my $myfeat; #>
my $key; #>
my $confirmed; #>
my $root_confirmed = ""; #>
# FIND THE ROOT OF THE TREE:
$root = $tree->root;
$root_name = $root->internalID;
# PRINT THE CONFIRMED FEATURES IN THE ROOT:
print "\nRoot for $family_id: ";
for ($i = 0; $i < @$myfeats; $i++)
{
$myfeat = $myfeats->[$i];
$key = $root_name."_".$myfeat;
if (!($CONFIRMED_FEAT->{$key})) { print STDERR "ERROR: do not know confirmed feat for $key\n"; exit;}
$confirmed = $CONFIRMED_FEAT->{$key};
if ($confirmed eq 'yes') { $root_confirmed = $root_confirmed." $myfeat=$confirmed";}
}
if ($root_confirmed eq '') { $root_confirmed = "none"; }
else { $root_confirmed = substr($root_confirmed,1,length($root_confirmed)-1);}
print "$root_confirmed\n\n";
# ARRAY @nodes STARTS BY CONTAINING JUST THE ROOT NODE:
@nodes = &Avril_modules::empty_array(@nodes);
@children = &Avril_modules::empty_array(@children);
@nodes = (@nodes,$root);
$no_rounds = 0;
NEXT_ROUND3_rd2:
$no_rounds++;
$no_new_nodes = 0;
%SEEN_NODE = ();
%SEEN_CHILD = ();
# LOOP THROUGH EACH NODE IN THE ARRAY @nodes. WHEN $no_rounds=1,
# THIS ARRAY CONTAINS THE ROOT. WHEN $no_rounds=2, THIS ARRAY
# CONTAINS ITS CHILDREN. WHEN $no_rounds=3, THIS ARRAY CONTAINS
# ITS GRANDCHILDREN, AND SO ON.
foreach $node(@nodes)
{
if ($node->is_leaf) { $node_name = $node->id(); }
else { $node_name = $node->internalID(); }
if ($VERBOSE_CHANGES == 1) { print "$no_rounds: visited node $node_name\n";}
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;}
# CHECK WHETHER THE CONFIRMED FEATURES FOR NODE $node ARE
# DIFFERENT FROM THOSE OF ITS CHILDREN.
if (!($node->is_leaf))
{
&compare_parent_to_children_rd2($CONFIRMED_FEAT,$node,$node_name,$VERBOSE_CHANGES,$myfeats,$SPECIES_WITH_DATA);
}
# FIND THE CHILDREN OF NODE $node:
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $node:
foreach $child ($node->children)
{
if ($child->is_leaf) { $child_name = $child->id(); }
else { $child_name = $child->internalID;}
if ($VERBOSE_CHANGES == 1) { print "$no_rounds: saw child $child_name\n"; }
# RECORD THAT WE SAW THE CHILD $child:
if (!($SEEN_CHILD{$child_name}))
{
@children = (@children,$child);
$SEEN_CHILD{$child_name} = 1;
}
}
} # ELSE, THIS NODE IS A SEQUENCE, SO DOES NOT HAVE ANY CHILDREN.
}
if ($VERBOSE_CHANGES == 1) { print "$no_rounds rounds: number new = $no_new_nodes\n";}
# IF WE DID SEE SOME NEW NODES ON THIS TRAVERSAL:
if ($no_new_nodes > 0)
{
@nodes = &Avril_modules::empty_array(@nodes);
for ($i = 0; $i <= $#children; $i++) { @nodes = (@nodes,$children[$i]);}
@children = &Avril_modules::empty_array(@children);
goto NEXT_ROUND3_rd2;
}
if ($VERBOSE_CHANGES == 1) { print "Finished searching for changes in features\n";}
}
#------------------------------------------------------------------#
# FIND NODES IN THE TREE WHERE CONFIRMED FEATURES OF A PARENT NODE
# DIFFERS FROM THOSE OF ITS DAUGHTER NODES:
sub find_changes_in_feat_rd1
{
my $tree = $_[0]; #
my $CONFIRMED_FEAT = $_[1]; #
my $FEATURES = $_[2]; #
my $SPECIES_WITH_DATA = $_[3]; #
my $root; #
my $root_name; #
my @nodes; #
my @children; #
my $no_rounds; #
my $no_new_nodes; #
my %SEEN_NODE = (); #
my %SEEN_CHILD = (); #
my $node; #
my $node_name; #
my $VERBOSE_CHANGES = 0; #
my $child; #
my $child_name; #
my $i; #
# FIND THE ROOT OF THE TREE:
$root = $tree->root;
$root_name = $root->internalID;
# ARRAY @nodes STARTS BY CONTAINING JUST THE ROOT NODE:
@nodes = &Avril_modules::empty_array(@nodes);
@children = &Avril_modules::empty_array(@children);
@nodes = (@nodes,$root);
$no_rounds = 0;
NEXT_ROUND3_rd1:
$no_rounds++;
$no_new_nodes = 0;
%SEEN_NODE = ();
%SEEN_CHILD = ();
# LOOP THROUGH EACH NODE IN THE ARRAY @nodes. WHEN $no_rounds=1,
# THIS ARRAY CONTAINS THE ROOT. WHEN $no_rounds=2, THIS ARRAY
# CONTAINS ITS CHILDREN. WHEN $no_rounds=3, THIS ARRAY CONTAINS
# ITS GRANDCHILDREN, AND SO ON.
foreach $node(@nodes)
{
if ($node->is_leaf) { $node_name = $node->id(); }
else { $node_name = $node->internalID(); }
if ($VERBOSE_CHANGES == 1) { print "$no_rounds: visited node $node_name\n";}
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;}
# CHECK WHETHER THE CONFIRMED FEATURES FOR NODE $node ARE
# DIFFERENT FROM THOSE OF ITS CHILDREN.
if (!($node->is_leaf))
{
&compare_parent_to_children_rd1($CONFIRMED_FEAT,$node,$node_name,$VERBOSE_CHANGES,$FEATURES,$SPECIES_WITH_DATA);
}
# FIND THE CHILDREN OF NODE $node:
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $node:
foreach $child ($node->children)
{
if ($child->is_leaf) { $child_name = $child->id(); }
else { $child_name = $child->internalID;}
if ($VERBOSE_CHANGES == 1) { print "$no_rounds: saw child $child_name\n"; }
# RECORD THAT WE SAW THE CHILD $child:
if (!($SEEN_CHILD{$child_name}))
{
@children = (@children,$child);
$SEEN_CHILD{$child_name} = 1;
}
}
} # ELSE, THIS NODE IS A SEQUENCE, SO DOES NOT HAVE ANY CHILDREN.
}
if ($VERBOSE_CHANGES == 1) { print "$no_rounds rounds: number new = $no_new_nodes\n";}
# IF WE DID SEE SOME NEW NODES ON THIS TRAVERSAL:
if ($no_new_nodes > 0)
{
@nodes = &Avril_modules::empty_array(@nodes);
for ($i = 0; $i <= $#children; $i++) { @nodes = (@nodes,$children[$i]);}
@children = &Avril_modules::empty_array(@children);
goto NEXT_ROUND3_rd1;
}
if ($VERBOSE_CHANGES == 1) { print "Finished searching for changes in features\n";}
}
#------------------------------------------------------------------#
# DO A PREORDER TRAVERSAL OF THE TREE TO ASSIGN CONFIRMED FEATURES:
sub do_preorder_traversal_rd2
{
my $tree = $_[0]; #>
my $PROVISIONAL_FEAT = $_[1]; #>
my $FEATURES = $_[2]; #>
my $myfeats = $_[3]; #>
my %CONFIRMED_FEAT = (); #>
my $root; #>
my $root_name; #>
my @nodes; #>
my @children; #>
my $no_rounds = 0; #>
my $no_new_nodes; #>
my %SEEN_NODE = (); #>
my %SEEN_CHILD = (); #>
my $node; #>
my $node_name; #>
my $VERBOSE_PREORDER = 0; #>
my $child; #>
my $child_name; #>
my $i; #>
# FIND THE ROOT OF THE TREE:
$root = $tree->root;
$root_name = $root->internalID;
# ARRAY @nodes STARTS BY CONTAINING JUST THE ROOT NODE:
@nodes = &Avril_modules::empty_array(@nodes);
@children = &Avril_modules::empty_array(@children);
@nodes = (@nodes,$root);
$no_rounds = 0;
NEXT_ROUND4_rd2:
$no_rounds++;
$no_new_nodes = 0;
%SEEN_NODE = ();
%SEEN_CHILD = ();
# LOOP THROUGH EACH NODE IN THE ARRAY @nodes. WHEN $no_rounds=1,
# THIS ARRAY CONTAINS THE ROOT. WHEN $no_rounds=2, THIS ARRAY
# CONTAINS ITS CHILDREN. WHEN $no_rounds=3, THIS ARRAY CONTAINS
# ITS GRANDCHILDREN, AND SO ON.
foreach $node(@nodes)
{
if ($node->is_leaf) { $node_name = $node->id(); }
else { $node_name = $node->internalID(); }
if ($VERBOSE_PREORDER == 1) { print "$no_rounds: visited node $node_name\n";}
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;}
# RECORD THE FEATURES FOR NODE $node. IN THE PRE-ORDER TRAVERSAL,
# WE CHOOSE A CHARACTER FOR NODE $node. WE TRY TO PICK THE SAME RESIDUE
# FOR THE DAUGHTER NODES IF POSSIBLE.
&assign_confirmed_feats_rd2($PROVISIONAL_FEAT,$node,$node_name,\%CONFIRMED_FEAT,$VERBOSE_PREORDER,$FEATURES,
$myfeats,$root_name);
# FIND THE CHILDREN OF NODE $node:
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $node:
foreach $child ($node->children)
{
if ($child->is_leaf) { $child_name = $child->id(); }
else { $child_name = $child->internalID;}
if ($VERBOSE_PREORDER == 1) { print "$no_rounds: saw child $child_name\n"; }
# RECORD THAT WE SAW THE CHILD $child:
if (!($SEEN_CHILD{$child_name}))
{
@children = (@children,$child);
$SEEN_CHILD{$child_name} = 1;
}
}
} # ELSE, THIS NODE IS A SEQUENCE, SO DOES NOT HAVE ANY CHILDREN.
}
if ($VERBOSE_PREORDER == 1) { print "$no_rounds rounds: number new = $no_new_nodes\n";}
# IF WE DID SEE SOME NEW NODES ON THIS TRAVERSAL:
if ($no_new_nodes > 0)
{
@nodes = &Avril_modules::empty_array(@nodes);
for ($i = 0; $i <= $#children; $i++) { @nodes = (@nodes,$children[$i]);}
@children = &Avril_modules::empty_array(@children);
goto NEXT_ROUND4_rd2;
}
if ($VERBOSE_PREORDER == 1) { print "Finished pre-order traversal\n";}
return(\%CONFIRMED_FEAT);
}
#------------------------------------------------------------------#
# DO A PREORDER TRAVERSAL OF THE TREE TO ASSIGN CONFIRMED FEATURES:
sub do_preorder_traversal_rd1
{
my $tree = $_[0]; #
my $PROVISIONAL_FEAT = $_[1]; #
my $FEATURES = $_[2]; #
my %CONFIRMED_FEAT = (); #
my $root; #
my $root_name; #
my @nodes; #
my @children; #
my $no_rounds = 0; #
my $no_new_nodes; #
my %SEEN_NODE = (); #
my %SEEN_CHILD = (); #
my $node; #
my $node_name; #
my $VERBOSE_PREORDER = 0; #
my $child; #
my $child_name; #
my $i; #
my $node_type; #
# FIND THE ROOT OF THE TREE:
$root = $tree->root;
$root_name = $root->internalID;
# ARRAY @nodes STARTS BY CONTAINING JUST THE ROOT NODE:
@nodes = &Avril_modules::empty_array(@nodes);
@children = &Avril_modules::empty_array(@children);
@nodes = (@nodes,$root);
$no_rounds = 0;
NEXT_ROUND2:
$no_rounds++;
$no_new_nodes = 0;
%SEEN_NODE = ();
%SEEN_CHILD = ();
# LOOP THROUGH EACH NODE IN THE ARRAY @nodes. WHEN $no_rounds=1,
# THIS ARRAY CONTAINS THE ROOT. WHEN $no_rounds=2, THIS ARRAY
# CONTAINS ITS CHILDREN. WHEN $no_rounds=3, THIS ARRAY CONTAINS
# ITS GRANDCHILDREN, AND SO ON.
foreach $node(@nodes)
{
if ($node->is_leaf) { $node_name = $node->id(); }
else { $node_name = $node->internalID(); }
if ($VERBOSE_PREORDER == 1) { print "$no_rounds: visited node $node_name\n";}
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;}
# RECORD THE FEATURES FOR NODE $node. IN THE PRE-ORDER TRAVERSAL,
# WE ADD TO THE CONFIRMED TERM LIST OF A NODE THOSE PROVISIONAL
# TERMS PRESENT IN BOTH CHILDREN OF THE CURRENT NODE:
&assign_confirmed_feats_rd1($PROVISIONAL_FEAT,$node,$node_name,\%CONFIRMED_FEAT,$VERBOSE_PREORDER,$FEATURES);
# IF THIS NODE IS AN INTERNAL NODE:
if (!($node->is_leaf))
{
# CHECK IF $node IS A SPECIATION NODE OR A DUPLICATION NODE:
# $node_type HAS VALUE Y FOR DUPLICATION, N FOR SPECIATION.
$node_type = $node->get_tag_value('D');
# IF $node IS A SPECIATION NODE, ADD ALL ITS CONFIRMED TERMS TO THE
# CONFIRMED LISTS OF BOTH ITS CHILDREN:
if ($node_type eq 'N') { &add_confirmed_terms_to_children($node,$node_name,\%CONFIRMED_FEAT,$VERBOSE_PREORDER);}
# IF $node IS A DUPLICATION NODE, ONLY CONFIRM IN THE CHILDREN THOSE
# CONFIRMED TERMS IN THE CURRENT NODE THAT ARE PROVISIONAL TERMS IN
# THE CHILD NODE.
elsif ($node_type eq 'Y') { &add_confirmed_terms_to_children2($node,$node_name,\%CONFIRMED_FEAT,$VERBOSE_PREORDER,
$PROVISIONAL_FEAT,$FEATURES);}
else { print STDERR "ERROR: node_type $node_type.\n"; exit;}
}
# FIND THE CHILDREN OF NODE $node:
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $node:
foreach $child ($node->children)
{
if ($child->is_leaf) { $child_name = $child->id(); }
else { $child_name = $child->internalID;}
if ($VERBOSE_PREORDER == 1) { print "$no_rounds: saw child $child_name\n"; }
# RECORD THAT WE SAW THE CHILD $child:
if (!($SEEN_CHILD{$child_name}))
{
@children = (@children,$child);
$SEEN_CHILD{$child_name} = 1;
}
}
} # ELSE, THIS NODE IS A SEQUENCE, SO DOES NOT HAVE ANY CHILDREN.
}
if ($VERBOSE_PREORDER == 1) { print "$no_rounds rounds: number new = $no_new_nodes\n";}
# IF WE DID SEE SOME NEW NODES ON THIS TRAVERSAL:
if ($no_new_nodes > 0)
{
@nodes = &Avril_modules::empty_array(@nodes);
for ($i = 0; $i <= $#children; $i++) { @nodes = (@nodes,$children[$i]);}
@children = &Avril_modules::empty_array(@children);
goto NEXT_ROUND2;
}
if ($VERBOSE_PREORDER == 1) { print "Finished pre-order traversal\n";}
return(\%CONFIRMED_FEAT);
}
#------------------------------------------------------------------#
# INFER THE FEATURES OF ANCESTRAL NODES IN THESE FAMILIES:
sub infer_ancestral_nodes
{
my $FEATURES = $_[0]; #>
my $families = $_[1]; #>
my $myfeats = $_[2]; #>
my $SPECIES_WITH_DATA = $_[3]; #>
my $i; #>
my $family_id; #>
my $tree; #>
my $dbc; #>
my $family; #>
my $famh; #>
my $PROVISIONAL_FEAT; #>
my $CONFIRMED_FEAT; #>
# CONNECT TO THE TREEFAM DATABASE USING DEFAULT CONFIGURATION:
$dbc = Treefam::DBConnection->new();
for ($i = 0; $i < @$families; $i++)
{
$family_id = $families->[$i]; # eg. TF314159
if ($VERBOSE3 == 1) { print "Looking at family $family_id\n\n";}
# GET A FAMILY:
$famh = $dbc->get_FamilyHandle();
$family = $famh->get_by_id($family_id);
if (!(defined($family)))
{
print "Do not have $family_id in the database...\n";
goto NEXT_FAMILY1;
}
# GET THE CLEAN TREE FOR THIS FAMILY:
$tree = $family->get_tree('clean');
if ($alg eq 'rd1') # RICHARD'S PARSIMONY ALGORITHM FOR GO ANNOTATIONS.
{
# DO A POSTORDER TRAVERSAL OF THE TREE TO ASSIGN PROVISIONAL FEATURES:
$PROVISIONAL_FEAT = &do_postorder_traversal_rd1($tree,$FEATURES);
# DO A PREORDER TRAVERSAL OF THE TREE TO ASSIGN CONFIRMED FEATURES:
$CONFIRMED_FEAT = &do_preorder_traversal_rd1($tree,$PROVISIONAL_FEAT,$FEATURES);
}
elsif ($alg eq 'rd2') # THE TRADIATIONAL PARSIMONY ALGORITHM FROM RICHARD'S BOOK.
{
# DO A POSTORDER TRAVERSAL OF THE TREE TO ASSIGN PROVISIONAL FEATURES:
$PROVISIONAL_FEAT = &do_postorder_traversal_rd2($tree,$FEATURES,$myfeats);
# DO A PREORDER TRAVERSAL OF THE TREE TO ASSIGN CONFIRMED FEATURES:
$CONFIRMED_FEAT = &do_preorder_traversal_rd2($tree,$PROVISIONAL_FEAT,$FEATURES,$myfeats);
}
elsif ($alg eq 'rd3') # THE WEIGHTED PARSIMONY ALGORITHM FROM RICHARD'S BOOK.
{
# DO A POSTORDER TRAVERSAL OF THE TREE TO ASSIGN PROVISIONAL FEATURES:
$PROVISIONAL_FEAT = &do_postorder_traversal_rd3($tree,$FEATURES,$myfeats);
print "xxx stuck\n";
exit; #xxx
}
else
{
print STDERR "ERROR: alg $alg.\n";
exit;
}
# FIND NODES IN THE TREE WHERE CONFIRMED FEATURES OF A PARENT NODE
# DIFFERS FROM THOSE OF ITS DAUGHTER NODES:
if ($alg eq 'rd1')
{
&find_changes_in_feat_rd1($tree,$CONFIRMED_FEAT,$FEATURES,$SPECIES_WITH_DATA);
}
elsif ($alg eq 'rd2')
{
&find_changes_in_feat_rd2($tree,$CONFIRMED_FEAT,$FEATURES,$myfeats,$SPECIES_WITH_DATA,$family_id);
}
elsif ($alg eq 'rd3')
{
print "xxx stuck\n";
exit; #xxx
}
else { print STDERR "ERROR: alg $alg.\n"; exit;}
NEXT_FAMILY1:
}
}
#------------------------------------------------------------------#
# GET THE PFAM DOMAIN ANNOTATIONS FOR GENES FROM THE TREEFAM MYSQL DATABASE:
sub get_pfam
{
my $families = $_[0]; #>
my $dbc; #>
my $family_id; #>
my $i; #>
my $famh; #>
my $family; #>
my $tree; #>
my @nodes; #>
my $node; #>
my $transcript; #>
my %TRANSCRIPTS = (); #>
my $gh; #>
my $gene; #>
my %GENE = (); #>
my $dbh; #>
my $table_w; #>
my $st; #>
my $sth; #>
my $rv; #>
my @array; #>
my $transcript2; #>
my $transcripts_in_tree; #>
my @transcripts_in_tree; #>
my $transcript_in_tree; #>
my $j; #>
my $transcripts; #>
my @transcripts; #>
my $k; #>
my $domain; #>
my %PFAM = (); #>
my %SEEN = (); #>
my @domains; #>
my $the_domains; #>
my @the_domains; #>
my $the_domain; #>
my %SPECIES_WITH_DATA = (); #>
my $taxid; #>
# CONNECT TO THE TREEFAM DATABASE USING DEFAULT CONFIGURATION:
# GET A LIST OF ALL THE TRANSCRIPT IDS IN EACH FAMILY:
$dbc = Treefam::DBConnection->new();
for ($i = 0; $i < @$families; $i++)
{
$family_id = $families->[$i]; # eg. TF314159
# GET A FAMILY:
$famh = $dbc->get_FamilyHandle();
# GET THE OBJECT FOR THAT FAMILY:
$family = $famh->get_by_id($family_id);
if (!(defined($family)))
{
print "Do not have $family_id in the database...\n";
goto GET_PFAM_NEXT_FAMILY;
}
# GET THE CLEAN TREE FOR THIS FAMILY:
$tree = $family->get_tree('clean');
# LIST ALL THE LEAVES FROM THE TREE:
@nodes = $tree->get_leaves;
# GET ALL THE TRANSCRIPTS FOR IN THE TREE:
foreach $node(@nodes)
{
# GET THE TRANSCRIPT FOR THIS NODE $node:
if ($node->is_leaf) { $transcript = $node->sequence_id(); } # eg. ENSMUST00000023454.4
else { print STDERR "ERROR: node is not a leaf.\n"; exit;}
# RECORD THAT TRANSCRIPT $transcript IS IN THIS FAMILY:
if (!($TRANSCRIPTS{$family_id})) { $TRANSCRIPTS{$family_id} = $transcript;}
else {$TRANSCRIPTS{$family_id} = $TRANSCRIPTS{$family_id}.",".$transcript;}
# FIND THE NAME OF THE GENE THAT TRANSCRIPT $transcript COMES FROM:
$gh = $node->gene;
$gene = $gh->ID(); # eg. ENSMUSG00000022770
if ($GENE{$transcript}) { print STDERR "ERROR: already know gene for $transcript.\n"; exit;}
$GENE{$transcript} = $gene;
}
GET_PFAM_NEXT_FAMILY:
}
# GET THE TRANSCRIPTS FOR EACH GENE FROM THE TREEFAM MYSQL DATABASE, AND
# GET THE PFAM DOMAINS FOR ALL THE TRANSCRIPTS IN THE GENE. NOTE THAT WE
# DON'T USE THE TREEFAM PERL API FOR THIS, AS IT IS DIFFICULT TO DO WITH
# THE PERL API.
$dbh = DBI->connect("dbi:mysql:treefam_4:db.treefam.org:3308", 'anonymous', '') || return;
# GET ALL THE TRANSCRIPTS FOR EACH GENE:
foreach $transcript (keys %GENE)
{
# GET THE NAME OF THE GENE CORRESPONDING TO THIS TRANSCRIPT:
$gene = $GENE{$transcript}; # eg. ENSMUSG00000022770
# GET ALL THE TRANSCRIPT IDS. FOR THIS GENE FROM THE TREEFAM
# MYSQL DATABASE (IN ADDITION TO THE TRANSCRIPT $transcript IN THE TREE):
$table_w = "genes";
$st = "SELECT ID, TAX_ID from $table_w WHERE GID=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($gene) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$transcript2 = $array[0]; # eg. ENSMUST00000023454.4
$taxid = $array[1]; # eg. 7227
if (!($TRANSCRIPTS{$gene})) { $TRANSCRIPTS{$gene} = $transcript2;}
else {$TRANSCRIPTS{$gene} = $TRANSCRIPTS{$gene}.",".$transcript2;}
# RECORD WHETHER THIS TRANSCRIPT IS FROM A SPECIES WITH PFAM DATA:
if ($taxid eq '7227' || # DROSOPHILA MELANOGASTER
$taxid eq '7165' || # ANOPHELES GAMBIAE
$taxid eq '7719' || # CIONA INTESTINALIS
$taxid eq '51511'|| # CIONA SAVIGNYI
$taxid eq '69293'|| # GASTEROSTEUS ACULEATUS
$taxid eq '31033'|| # FUGU RUBRIPES
$taxid eq '13616'|| # MONODELPHIS DOMESTICA
$taxid eq '9615' || # CANIS FAMILIARIS
$taxid eq '10116'|| # RATTUS NORVEGICUS
$taxid eq '10090'|| # MUS MUSCULUS
$taxid eq '9598' || # PAN TROGLODYTES
$taxid eq '9913' || # BOS TAURUS
$taxid eq '9606' || # HOMO SAPIENS
$taxid eq '9544' || # MACACA MULATTA
$taxid eq '7955' || # BRACHYDANIO RERIO
$taxid eq '8090' || # ORYZIAS LATIPES
$taxid eq '99883'|| # TETRAODON NIGROVIRIDIS
$taxid eq '9031' || # GALLUS GALLUS
$taxid eq '8364' || # XENOPUS TROPICALIS
$taxid eq '7159') # AEDES AEGYPTI
{
$SPECIES_WITH_DATA{$transcript2} = 1;
}
}
}
}
# GET THE PFAM DOMAINS FOR THE GENES IN EACH FAMILY FROM THE TREEFAM MYSQL DATABASE:
for ($i = 0; $i < @$families; $i++)
{
$family_id = $families->[$i]; # eg. TF314159
# FIND THE TRANSCRIPTS FROM THIS FAMILY IN THE TREEFAM TREE:
if (!($TRANSCRIPTS{$family_id})) { print STDERR "ERROR: do not know genes in $family_id.\n"; exit;}
$transcripts_in_tree = $TRANSCRIPTS{$family_id};
@transcripts_in_tree = split(/\,/,$transcripts_in_tree);
for ($j = 0; $j <= $#transcripts_in_tree; $j++)
{
$transcript_in_tree = $transcripts_in_tree[$j]; # eg. ENSGACT00000026618.1
# FIND THE GENE THAT THIS TRANSCRIPT COMES FROM:
if (!($GENE{$transcript_in_tree})) { print STDERR "ERROR: do not know gene for $transcript_in_tree.\n"; exit;}
$gene = $GENE{$transcript_in_tree};
# FIND ALL THE TRANSCRIPTS FOR $gene:
if (!($TRANSCRIPTS{$gene})) { print STDERR "ERROR: do not know transcripts for $gene.\n"; exit;}
$transcripts = $TRANSCRIPTS{$gene};
@transcripts = split(/\,/,$transcripts);
for ($k = 0; $k <= $#transcripts; $k++)
{
$transcript = $transcripts[$k]; # eg. ENSMUST00000023454.4
# GET THE PFAM DOMAINS FOR TRANSCRIPT $transcript:
$table_w = "pfam";
$st = "SELECT PFAM_ID from $table_w WHERE ID=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($transcript) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$domain = $array[0];
# WE NEED TO CONSIDER DOMAINS IN THE SAME CLAN AS EQUIVALENT:
if ($domain eq 'PF07653') { $domain = 'PF00018';} #xxx
if (!($SEEN{$transcript_in_tree."=".$domain}))
{
if (!($PFAM{$transcript_in_tree})) { $PFAM{$transcript_in_tree} = $domain;}
else {$PFAM{$transcript_in_tree} = $PFAM{$transcript_in_tree}.",".$domain;}
$SEEN{$transcript_in_tree."=".$domain} = 1;
}
# RECORD THIS DOMAIN IN ARRAY DOMAINS, IF WE HAVEN'T SEEN IT BEFORE:
if (!($SEEN{$domain}))
{
@domains = (@domains,$domain);
$SEEN{$domain} = 1;
}
}
}
}
if ($VERBOSE_PFAM == 1)
{
if ($PFAM{$transcript_in_tree})
{
print "Family $family_id has gene $transcript_in_tree Pfam $PFAM{$transcript_in_tree}\n";
}
else
{
print "Family $family_id has gene $transcript_in_tree no Pfam\n";
}
}
}
}
# RECORD WHETHER EACH SEQUENCE HAS EACH DOMAIN IN THE ARRAY @domains. THIS
# IS NEEDED FOR THE rd2 ALGORITHM:
for ($j = 0; $j <= $#transcripts_in_tree; $j++)
{
$transcript_in_tree = $transcripts_in_tree[$j];
%SEEN = ();
if ($PFAM{$transcript_in_tree})
{
# FIND THE DOMAINS IN THIS TRANSCRIPT:
$the_domains = $PFAM{$transcript_in_tree};
@the_domains = split(/\,/,$the_domains);
for ($i = 0; $i <= $#the_domains; $i++)
{
$the_domain = $the_domains[$i];
$SEEN{$the_domain}= 1;
}
}
for ($i = 0; $i <= $#domains; $i++)
{
$domain = $domains[$i];
if (!($SPECIES_WITH_DATA{$transcript_in_tree}))
{
$PFAM{$transcript_in_tree."_".$domain} = "yes,no";
}
else
{
if ($SEEN{$domain})
{
$PFAM{$transcript_in_tree."_".$domain} = "yes";
}
else
{
$PFAM{$transcript_in_tree."_".$domain} = "no";
}
}
}
}
return(\%PFAM,\@domains,\%SPECIES_WITH_DATA);
}
#------------------------------------------------------------------#
# GET THE GO ANNOTATIONS FOR GENES:
sub get_GO
{
my $file = $_[0]; #> THE FILE CONTAINING THE GO IDs.
my $families = $_[1]; #> THE LIST OF FAMILIES UNDER STUDY.
my %TAKE = (); #>
my $i; #>
my $family; #>
my $line; #>
my @temp; #>
my $goid; #>
my %GOID = (); #>
my @goids; #>
my %SEEN = (); #>
my $j; #>
my $the_goids; #>
my @the_goids; #>
my $the_goid; #>
my $dbc; #>
my $family_id; #>
my $famh; #>
my $tree; #>
my @nodes; #>
my $node; #>
my $transcript; #>
my @transcripts; #>
my %SPECIES_WITH_DATA = (); #>
my $dbh; #>
my $table_w; #>
my $st; #>
my $sth; #>
my $rv; #>
my @array; #>
my $taxid; #>
# RECORD WHICH FAMILIES TO TAKE:
for ($i = 0; $i < @$families; $i++)
{
$family = $families->[$i];
$TAKE{$family} = 1;
}
print STDERR "Recorded which families to take...\n";
# CONNECT TO THE TREEFAM DATABASE USING DEFAULT CONFIGURATION:
# GET A LIST OF ALL THE TRANSCRIPT IDS IN EACH FAMILY:
$dbc = Treefam::DBConnection->new();
for ($i = 0; $i < @$families; $i++)
{
$family_id = $families->[$i]; # eg. TF314159
# GET A FAMILY:
$famh = $dbc->get_FamilyHandle();
# GET THE OBJECT FOR THAT FAMILY:
$family = $famh->get_by_id($family_id);
if (!(defined($family)))
{
print "Do not have $family_id in the database...\n";
goto GET_GO_NEXT_FAMILY;
}
# GET THE CLEAN TREE FOR THIS FAMILY:
$tree = $family->get_tree('clean');
# LIST ALL THE LEAVES FROM THE TREE:
@nodes = $tree->get_leaves;
# GET ALL THE TRANSCRIPTS FOR IN THE TREE:
foreach $node(@nodes)
{
# GET THE TRANSCRIPT FOR THIS NODE $node:
if ($node->is_leaf) { $transcript = $node->sequence_id(); } # eg. ENSMUST00000023454.4
else { print STDERR "ERROR: node is not a leaf.\n"; exit;}
if (!($SEEN{$transcript}))
{
$SEEN{$transcript} = 1;
@transcripts = (@transcripts,$transcript);
}
}
GET_GO_NEXT_FAMILY:
}
print STDERR "Got a list of transcript ids in each family...\n";
# READ IN THE GO ANNOTATIONS FOR THE FAMILIES UNDER STUDY:
open(FILE,"$file") || die "ERROR: cannot open $file.\n";
while(<FILE>)
{
$line = $_;
chomp $line;
@temp = split(/\s+/,$line);
if (substr($line,0,6) eq 'Family') # Family TF101001 : DM CYCB has GO id GO:0000910
{
$family = $temp[1]; # eg. TF101001
$goid = $temp[8]; # eg. GO:0000910
$transcript = $temp[9]; # eg. CG3510-RA.3
if ($TAKE{$family})
{
if (!($SEEN{$transcript}))
{
print "WARNING: have GO annotation for $transcript but haven't seen it in the tree (family $family)!\n";
print STDERR "WARNING: have GO annotation for $transcript but haven't seen it in the tree (family $family)!\n";
#xxx can happen if in seed tree but not clean tree -- exit;
}
# RECORD THE GO ANNOTATIONS FOR TRANSCRIPT $transcript:
if (!($GOID{$transcript})) { $GOID{$transcript} = $goid;}
else {$GOID{$transcript} = $GOID{$transcript}.",".$goid;}
if ($VERBOSE_GOID == 1)
{
print "Family $family gene $transcript GO $GOID{$transcript}\n";
}
if (!($SEEN{$goid}))
{
@goids = (@goids,$goid);
$SEEN{$goid}= 1;
}
}
}
}
close(FILE);
print STDERR "Read in the GO annotations for the families under study...\n";
# RECORD WHICH TRANSCRIPTS COME FROM SPECIES WITH GO DATA:
$dbh = DBI->connect("dbi:mysql:treefam_4:db.treefam.org:3308", 'anonymous', '') || return;
# GET ALL THE TRANSCRIPTS FOR EACH GENE:
for ($j = 0; $j <= $#transcripts; $j++)
{
$transcript = $transcripts[$j];
# GET THE SPECIES FOR THIS TRANSCRIPT FROM THE TREEFAM
# MYSQL DATABASE:
$table_w = "genes";
$st = "SELECT TAX_ID from $table_w WHERE ID=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($transcript) or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$taxid = $array[0]; # eg. 7227
# RECORD WHETHER THIS TRANSCRIPT IS FROM A SPECIES WITH GO DATA:
if ($taxid eq '7227' || # DROSOPHILA MELANOGASTER
$taxid eq '9031' || # GALLUS GALLUS
$taxid eq '9606' || # HOMO SAPIENS
$taxid eq '10090'|| # MUS MUSCULUS
$taxid eq '10116'|| # RATTUS NORVEGICUS
$taxid eq '6239' || # CAENORHABDITIS ELEGANS
$taxid eq '7955') # BRACHYDANIO RERIO
{
$SPECIES_WITH_DATA{$transcript} = 1;
}
}
}
}
print STDERR "Read in which transcripts come from families with GO data...\n";
# RECORD WHETHER EACH SEQUENCE HAS EACH GO ID. IN THE ARRAY @goids. THIS
# IS NEEDED FOR THE rd2 ALGORITHM:
for ($j = 0; $j <= $#transcripts; $j++)
{
$transcript = $transcripts[$j];
%SEEN = ();
if ($GOID{$transcript})
{
# FIND THE GO IDS. FOR THIS TRANSCRIPT:
$the_goids = $GOID{$transcript};
@the_goids = split(/\,/,$the_goids);
for ($i = 0; $i <= $#the_goids; $i++)
{
$the_goid = $the_goids[$i];
$SEEN{$the_goid} = 1;
}
}
else # THIS TRANSCRIPT HAS NO GO TERMS, SO WE TREAT IT AS MISSING DATA. xxx
{
$SPECIES_WITH_DATA{$transcript} = "";
}
for ($i = 0; $i <= $#goids; $i++)
{
$goid = $goids[$i];
if (!($SPECIES_WITH_DATA{$transcript}))
{
$GOID{$transcript."_".$goid} = "yes,no";
}
else
{
if ($SEEN{$goid})
{
$GOID{$transcript."_".$goid} = "yes";
}
else
{
$GOID{$transcript."_".$goid} = "no";
}
}
}
}
print STDERR "Recorded whether each transcript has each of the GO terms in that family...\n";
return(\%GOID,\@goids,\%SPECIES_WITH_DATA);
}
#------------------------------------------------------------------#
# DO A POSTORDER TRAVERSAL OF THE TREE:
sub do_postorder_traversal_rd3
{
my $tree = $_[0]; #xxx THE TREE OBJECT.
my $FEATURES = $_[1]; # THE FEATURES.
my $myfeats = $_[2]; # ARRAY OF ALL THE FEATURES IN THE WHOLE FAMILY.
my $root; #
my $root_name; #
my @nodes; #
my @parents; #
my $no_rounds; #
my $no_new_nodes; #
my %SEEN_NODE = (); #
my %SEEN_PARENT = (); #
my $node; #
my $node_name; #
my $VERBOSE_POSTORDER = 1; # xxx
my $parent; #
my $parent_name; #
my $daughter; #
my $daughter_name; #
my $i; #
my %PROVISIONAL_FEAT = (); #
my $total_cost = 0; #
my $cost = 0; #
my $success = 0; #
my %PROVISIONAL_SCORE = (); #
# FIND THE ROOT OF THE TREE:
$root = $tree->root;
$root_name = $root->internalID;
# LIST ALL THE LEAVES FROM THE TREE:
@nodes = &Avril_modules::empty_array(@nodes);
@parents = &Avril_modules::empty_array(@parents);
@nodes = $tree->get_leaves;
$no_rounds = 0;
NEXT_ROUND1_rd3:
$no_rounds++;
$no_new_nodes = 0;
%SEEN_NODE = ();
%SEEN_PARENT = ();
# LOOP THROUGH EACH NODE IN THE ARRAY @nodes. WHEN $no_rounds=1,
# THIS ARRAY CONTAINS THE LEAVES. WHEN $no_rounds=2, THIS ARRAY
# CONTAINS THEIR PARENTS. WHEN $no_rounds=3, THIS ARRAY CONTAINS
# THEIR GRANDPARENTS, AND SO ON.
foreach $node(@nodes)
{
if ($node->is_leaf) { $node_name = $node->id(); }
else { $node_name = $node->internalID(); }
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited node $node_name\n";}
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;}
# FIND THE PARENT OF NODE $node:
if (defined($node->parent)) { $parent = $node->parent; }
else { print STDERR "ERROR: do not know parent of node $node_name.\n"; exit;}
$parent_name = $parent->internalID;
# FIND THE OTHER CHILD OF NODE $parent:
if (!($parent->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $parent:
foreach $daughter ($parent->children)
{
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); }
else { $daughter_name = $daughter->internalID;}
if ($daughter_name ne $node_name)
{
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited sibling $daughter_name\n";}
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;}
}
}
}
else { print STDERR "ERROR: parent $parent_name is a sequence.\n"; exit;}
# RECORD THAT WE VISITED THE PARENT $parent:
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: saw parent $parent_name\n";}
if (!($SEEN_PARENT{$parent_name}) && $parent_name ne $root_name)
{
# IN THE POST-ORDER TRAVERSAL, FOR EACH FEATURE {A} WE RECORD THE COST OF HAVING OR
# LACKING FEATURE {A} FOR PARENT $parent. FEATURE {A} COULD BE A DOMAIN OR A GO TERM.
($success) = &assign_provisional_feats_rd3($FEATURES,$parent,$parent_name,\%PROVISIONAL_FEAT,
$VERBOSE_POSTORDER,$myfeats,\%PROVISIONAL_SCORE);
$cost = 0; #xxx
if ($success == 1)
{
@parents = (@parents,$parent);
$SEEN_PARENT{$parent_name} = 1;
$total_cost = $total_cost + $cost;
}
else
{
@parents = (@parents,$node);
$SEEN_PARENT{$node_name} = 1;
}
}
}
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds rounds: number new = $no_new_nodes\n";}
# IF WE DID SEE SOME NEW NODES ON THIS TRAVERSAL:
if ($no_new_nodes > 0)
{
@nodes = &Avril_modules::empty_array(@nodes);
for ($i = 0; $i <= $#parents; $i++) { @nodes = (@nodes,$parents[$i]);}
@parents = &Avril_modules::empty_array(@parents);
goto NEXT_ROUND1_rd3;
}
# VISIT ROOT:
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited root $root_name\n";}
($success) = &assign_provisional_feats_rd3($FEATURES,$root,$root_name,\%PROVISIONAL_FEAT,
$VERBOSE_POSTORDER,$myfeats,\%PROVISIONAL_SCORE);
$cost = 0;
if ($success == 0) { print STDERR "ERROR: success $success at root node.\n"; exit;}
$total_cost = $total_cost + $cost;
if ($VERBOSE_POSTORDER == 1) { print "Finished post-order traversal, total cost $total_cost\n";}
return(\%PROVISIONAL_FEAT);
}
#------------------------------------------------------------------#
# IN THE POST-ORDER TRAVERSAL, FOR EACH FEATURE {A} WE RECORD THE COST OF HAVING OR
# LACKING FEATURE {A} FOR PARENT $parent. FEATURE {A} COULD BE A DOMAIN OR A GO TERM.
# xxx
sub assign_provisional_feats_rd3
{
my $FEATURES = $_[0]; #xxx THE FEATURES.
my $parent = $_[1]; #
my $parent_name = $_[2]; #
my $PROVISIONAL_FEAT = $_[3]; #
my $VERBOSE_POSTORDER = $_[4]; #
my $myfeats = $_[5]; # ARRAY OF ALL THE FEATURES IN THE TREE.
my $PROVISIONAL_SCORE = $_[6]; #
my $daughter; #
my $daughter_name; #
my $daughter_seqname; #
my $daughter_feats; #
my @daughter_feats; #
my $cost = 0; #
my $success = 1; #
my @daughter_nodes; #
if ($VERBOSE_POSTORDER == 1) { print "___ finding provisional feats for parent $parent_name\n";}
# FIND THE CHILDREN OF NODE $parent:
if (!($parent->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $parent:
foreach $daughter ($parent->children)
{
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); }
else { $daughter_name = $daughter->internalID;}
if ($VERBOSE_POSTORDER == 1) { print "___ looking at daughter $daughter_name of parent $parent_name\n";}
# FIND THE SCORES OF NODE $daughter:
if ($daughter->is_leaf)
{
$daughter_seqname = $daughter->get_tag_value('O');
($success) = &get_daughter_score1($daughter_seqname,$FEATURES,$myfeats,$PROVISIONAL_SCORE);
if ($success == 0) { print STDERR "ERROR: could not get feats for $daughter_seqname\n"; exit;}
# xxx @daughter_feats = (@daughter_feats,$daughter_feats);
@daughter_nodes = (@daughter_nodes,$daughter_seqname);
if ($VERBOSE_POSTORDER == 1) { print "___ daughter $daughter_name ($daughter_seqname) has feats xxx\n";}
}
else # THE DAUGHTER $daughter IS AN INTERNAL NODE:
{
print "xxx aah\n";
exit; #xxx
($success,$daughter_feats) = &get_daughter_feats($daughter_name,$PROVISIONAL_FEAT,$myfeats);
if ($success == 0)
{
if ($VERBOSE_POSTORDER == 1) { print "Do not know provisional feats for $daughter_name yet ...\n";}
return($success);
}
@daughter_feats = (@daughter_feats,$daughter_feats);
@daughter_nodes = (@daughter_nodes,$daughter_name);
if ($VERBOSE_POSTORDER == 1) { print "___ daughter $daughter_name (internal) has feats $daughter_feats\n";}
}
}
}
else { print STDERR "ERROR: parent $parent_name is a sequence.\n"; exit;}
# GET THE PROVISIONAL SCORES FOR $parent_name:
&get_parent_score(\@daughter_nodes,$parent_name,$myfeats,$PROVISIONAL_FEAT,$VERBOSE_POSTORDER,$PROVISIONAL_SCORE);
return($success);
}
#------------------------------------------------------------------#
# GET THE PROVISIONAL SCORES FOR A NODE:
# xxx
sub get_daughter_score1
{
my $daughter_seqname = $_[0]; #
my $PROVISIONAL_FEAT = $_[1]; #
my $myfeats = $_[2]; #
my $PROVISIONAL_SCORE = $_[3]; #
my $daughter_feats = ""; #
my $i; #
my $key; #
my $feats; #
my $success = 1; #
my $key2; #
for ($i = 0; $i < @$myfeats; $i++)
{
$key = $daughter_seqname."_".$myfeats->[$i];
print "xxx looking at $daughter_seqname key $key\n";
if (!($PROVISIONAL_FEAT->{$key}))
{
print "xxx returning \n";
$success = 0;
# WE DO NOT HAVE THE PROVISIONAL FEATURES FOR THIS NODE YET:
return($success);
}
$feats = $PROVISIONAL_FEAT->{$key}; # yes OR no OR yes,no
print "xxx key $key feats $feats\n";
if ($feats ne 'yes' && $feats ne 'no' && $feats ne 'yes,no') { print "ERROR: feats $feats.\n"; exit;}
if ($feats eq 'yes')
{
$key2 = $key."_yes";
$PROVISIONAL_SCORE->{$key2} = 0;
print "xxx key2 $key2 feats $feats $PROVISIONAL_SCORE->{$key2}\n";
$key2 = $key."_no";
$PROVISIONAL_SCORE->{$key2} = 1000000;
print "xxx key2 $key2 feats $feats $PROVISIONAL_SCORE->{$key2}\n";
}
elsif ($feats eq 'no')
{
$key2 = $key."_yes";
$PROVISIONAL_SCORE->{$key2} = 1000000;
print "xxx key2 $key2 feats $feats $PROVISIONAL_SCORE->{$key2}\n";
$key2 = $key."_no";
$PROVISIONAL_SCORE->{$key2} = 0;
print "xxx key2 $key2 feats $feats $PROVISIONAL_SCORE->{$key2}\n";
}
elsif ($feats eq 'yes,no')
{
$key2 = $key."_yes";
$PROVISIONAL_SCORE->{$key2} = 0;
$key2 = $key."_no";
$PROVISIONAL_SCORE->{$key2} = 1000000;
print "xxx key2 $key2\n";
}
#xxx $daughter_feats = $daughter_feats."_".$feats;
}
#xxx $daughter_feats = substr($daughter_feats,1,length($daughter_feats)-1);
return($success);
}
#------------------------------------------------------------------#
# GET THE PROVISIONAL SCORES FOR $parent_name:
# xxx
sub get_parent_score
{
my $daughter_nodes = $_[0]; #xxx
my $parent_name = $_[1];
my $myfeats = $_[2];
my $PROVISIONAL_FEAT = $_[3];
my $VERBOSE_POSTORDER = $_[4];
my $PROVISIONAL_SCORE = $_[5];
my $i;
my $feat;
my $j;
my $state;
my $k;
my $daughter_name;
my $daughter_score_yes;
my $daughter_score_no;
my $score;
my $l;
my $parent_state;
my $daughter_state;
my $daughter_score;
# RECORD THE COST OF NODE $parent_name HAVING EACH STATE (A1,A2,etc.) FOR
# EACH FEATURE A:
print "xxx parent_name $parent_name\n";
for ($i = 0; $i < @$myfeats; $i++)
{
$feat = $myfeats->[$i];
print "xxx1 feat $feat i $i\n";
# FIND WHAT IS THE SCORE FOR FEATURE $feat HAVING EACH STATE:
for ($j = 1; $j <= 2; $j++)
{
if ($j == 1) { $parent_state = "yes";}
elsif ($j == 2) { $parent_state = "no"; }
print "xxx2 parent_state $parent_state\n";
#xxx $parent_score = 0; #xxx1
#xxx $score = 0;
$score = 0;
for ($k = 0; $k < @$daughter_nodes; $k++)
{
$daughter_name = $daughter_nodes->[$k];
# FIND WHAT IS THE MINIMUM SCORE FOR DAUGHTER $daughter:
$min_daughter_score = 1000000;
for ($l = 1; $l <= 2; $l++)
{
if ($l == 1) { $daughter_state = "yes";}
elsif ($l == 2) { $daughter_state = "no"; }
$key = $daughter_name."_".$feat."_".$daughter_state;
if (!($PROVISIONAL_SCORE->{$key})) { $daughter_score = 0;}
else { $daughter_score = $PROVISIONAL_SCORE->{$key}; }
print "xxx3 $daughter_name daughter_score $daughter_score daughter_state $daughter_state key $key\n";
# $score = $score + $daughter_score;
# CHECK IF THERE IS A SCORE FOR ANY CHANGE FROM $parent_state TO $daughter_state:
if ($parent_state ne $daughter_state)
{
if ($parent_state eq 'yes' && $daughter_state eq 'no') # LOSS.
{
$daughter_score = $daughter_score + 1; # xxx
}
elsif ($parent_state eq 'no' && $daughter_state eq 'yes') # GAIN.
{
$daughter_score = $daughter_score + 2; # xxx
}
}
if ($daughter_score < $min_daughter_score) { $min_daughter_score = $daughter_score; print "xxx3 min_daughter_score now $min_daughter_score\n";}
#xxx if ($score < $min_score) { $min_score = $score; print "xxx3 min_score now $min_score\n";}
#xxx print "xxx3 score $score min_score $min_score\n";
print "xxx3 daughter_score $daughter_score min_daughter_score $min_daughter_score\n";
}
$score = $score + $min_daughter_score;
print "xxx2 score $score\n";
}
$key = $parent_name."_".$feat."_".$parent_state;
$PROVISIONAL_SCORE->{$key} = $score;
print "xxx2 setting score for $key as $score\n";
#xxx print "xxx2 min_score $min_score\n";
}
}
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment