Created
March 1, 2013 16:48
-
-
Save avrilcoghlan/5065978 to your computer and use it in GitHub Desktop.
Perl script that, given a list of TreeFam families, and a file of features of the sequences in trees (eg. Pfam domains or GO terms), infers the likely features of the ancestral nodes in the trees for the families.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script treefam_infer_ancestral_features.pl | |
# Written by Avril Coghlan (alc@sanger.ac.uk) | |
# 30-May-07. | |
# | |
# This perl script infers the features of ancestral nodes | |
# in a list of TreeFam trees, given the features in the | |
# leaves. | |
# | |
# The command-line format is: | |
# % perl <treefam_infer_ancestral_features.pl> families alg feature_type GO_file | |
# where families is the list of TreeFam families, | |
# alg is the algorithm to use, | |
# feature_type is the type of feature to use (GO or Pfam), | |
# GO_file is the file containing the GO annotations for the leaves (if feature_type=GO; none otherwise). | |
# 'alg' can be 'rd1' = Richard's algorithm for GO annotations from the TreeFam meeting, | |
# 'rd2' = Traditional parsimony algorithm from Richard's book, | |
# 'rd3' = Weighted parsimony algorithm from Richard's book. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 4) | |
{ | |
print "Usage of treefam_infer_ancestral_features.pl\n\n"; | |
print "perl treefam_infer_ancestral_features.pl <families> <alg> <feature_type> <GO_file>\n"; | |
print "where <families> is the list of TreeFam families,\n"; | |
print " <alg> is the algorithm to use,\n"; | |
print " <feature_type> is the type of feature to use (GO or Pfam),\n"; | |
print " <GO_file> is the file containing the GO annotations for the leaves (if feature_type=GO; none otherwise).\n"; | |
print "NOTE: alg can be: rd1 = Richard's algorithm for GO annotations,\n"; | |
print " rd2 = traditional parsimony algorithm from Richard's book,\n"; | |
print " rd3 = weighted parsimony algorithm from Richard's book.\n"; | |
print "For example, >perl treefam_infer_ancestral_features.pl\n"; | |
print "neural_genes_2may07 rd2 GO GO_slim_terms11_2may07\n"; | |
exit; | |
} | |
# READ IN MY PERL MODULES: | |
BEGIN { | |
unshift (@INC, '/nfs/team71/phd/alc/perl/modules'); | |
} | |
use lib "/nfs/team71/phd/alc/perl/treefam/Treefam_api_3may07/"; #xxx note that this is old, I've hacked it | |
#xxx to make it use TreeFam-4. | |
use Avril_modules; | |
use Treefam::DBConnection; | |
# FIND THE NAME OF THE LIST OF TREEFAM FAMILIES: | |
$fams = $ARGV[0]; | |
# FIND THE ALGORITHM TO USE: | |
$alg = $ARGV[1]; | |
if ($alg ne 'rd1' && $alg ne 'rd2' && $alg ne 'rd3') | |
{ | |
print STDERR "ERROR: alg should be rd1 or rd2 or rd3!\n"; | |
exit; | |
} | |
# FIND THE TYPE OF FEATURE TO USE: | |
$feature_type = $ARGV[2]; | |
if ($feature_type ne 'GO' && $feature_type ne 'Pfam') | |
{ | |
print STDERR "ERROR: feature_type should be GO or Pfam!\n"; | |
exit; | |
} | |
# FIND THE NAME OF THE FILE CONTAINING THE GO ANNOTATIONS FOR THE LEAVES: | |
$GO_file = $ARGV[3]; | |
srand; | |
$VERBOSE_PFAM = 0; # PRINTS PFAM DOMAINS IN GENES. | |
$VERBOSE_GOID = 0; # PRINTS GO ANNOTATIONS FOR GENES. | |
$VERBOSE3 = 1; | |
#------------------------------------------------------------------# | |
# READ IN THE LIST OF FAMILIES: | |
($families) = &read_family_list($fams); | |
print STDERR "Read in families...\n"; | |
# READ IN THE INPUT DATA: | |
if ($feature_type eq 'Pfam') | |
{ | |
# GET THE PFAM DOMAINS FOR THE GENES IN THE TREEFAM FAMILIES: | |
($FEATURES,$features,$SPECIES_WITH_DATA) = &get_pfam($families); | |
print STDERR "Got pfam domains from TreeFam...\n"; | |
} | |
elsif ($feature_type eq 'GO') | |
{ | |
# GET THE GO ANNOTATIONS FOR THE GENES IN THE TREEFAM FAMILIES: | |
($FEATURES,$features,$SPECIES_WITH_DATA) = &get_GO($GO_file,$families); | |
print STDERR "Got GO from $GO_file...\n"; | |
} | |
else { print STDERR "ERROR: feature_type $feature_type.\n"; exit;} | |
# NOW INFER THE FEATURES OF ANCESTRAL NODES IN THESE FAMILIES: | |
&infer_ancestral_nodes($FEATURES,$families,$features,$SPECIES_WITH_DATA); | |
print STDERR "FINISHED.\n"; | |
print "FINISHED\n"; | |
#------------------------------------------------------------------# | |
# READ IN THE LIST OF FAMILIES FROM A FILE: | |
sub read_family_list | |
{ | |
my $fams = $_[0]; #> | |
my $line; #> | |
my @temp; #> | |
my $family; #> | |
my @families; #> | |
my %SEEN = (); #> | |
open(FAMS,"$fams") || die "ERROR: cannot open $fams.\n"; | |
while(<FAMS>) | |
{ | |
$line = $_; | |
chomp $line; # eg. TF101101 | |
@temp = split(/\s+/,$line); | |
if (substr($line,0,2) eq 'TF') | |
{ | |
$family = $temp[0]; | |
if (!($SEEN{$family})) | |
{ | |
@families = (@families,$family); | |
$SEEN{$family} = 1; | |
} | |
} | |
} | |
close(FAMS); | |
return(\@families); | |
} | |
#------------------------------------------------------------------# | |
# IF $node IS A DUPLICATION NODE, ONLY CONFIRM IN THE CHILDREN THOSE | |
# CONFIRMED TERMS IN THE CURRENT NODE THAT ARE PROVISIONAL TERMS IN | |
# THE CHILD NODE. | |
sub add_confirmed_terms_to_children2 | |
{ | |
my $node = $_[0]; # | |
my $node_name = $_[1]; # | |
my $CONFIRMED_FEAT = $_[2]; # | |
my $VERBOSE_PREORDER = $_[3]; # | |
my $PROVISIONAL_FEAT = $_[4]; # | |
my $FEATURES = $_[5]; # | |
my $confirmed_feat; # | |
my @confirmed_feat; # | |
my $daughter; # | |
my $daughter_name; # | |
my $provisional_feat; # | |
my @provisional_feat; # | |
my %CONFIRMED_IN_NODE = (); # | |
my $i; # | |
my $j; # | |
my $daughter_confirmed_feat; # | |
my %SEEN = (); # | |
my $daughter_seqname; # | |
if ($VERBOSE_PREORDER == 1) { print "___ adding confirmed feats for duplication node $node_name to its chidren's confirmed lists\n";} | |
# FIND THE CONFIRMED FEATURESs OF $node: | |
if ($CONFIRMED_FEAT->{$node_name}) | |
{ | |
$confirmed_feat = $CONFIRMED_FEAT->{$node_name}; | |
@confirmed_feat = split(/\,/,$confirmed_feat); | |
for ($i = 0; $i <= $#confirmed_feat; $i++) { $CONFIRMED_IN_NODE{$confirmed_feat[$i]} = 1;} | |
} | |
else # $node DOES NOT HAVE ANY CONFIRMED FEATURES. | |
{ | |
return; | |
} | |
# FIND THE CHILDREN OF NODE $node: | |
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $node: | |
foreach $daughter ($node->children) | |
{ | |
$daughter_confirmed_feat = ""; | |
%SEEN = (); | |
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); } | |
else { $daughter_name = $daughter->internalID;} | |
# FIND THE PROVISIONAL FEATURES FOR $daughter_name: | |
if ($PROVISIONAL_FEAT->{$daughter_name}) | |
{ | |
$provisional_feat = $PROVISIONAL_FEAT->{$daughter_name}; | |
@provisional_feat = split(/\,/,$provisional_feat); | |
for ($j = 0; $j <= $#provisional_feat; $j++) | |
{ | |
if ($CONFIRMED_IN_NODE{$provisional_feat[$j]}) # IF THIS FEAT IS CONFIRMED IN $node. | |
{ | |
if (!($SEEN{$provisional_feat[$j]})) | |
{ | |
$SEEN{$provisional_feat[$j]} = 1; | |
$daughter_confirmed_feat = $daughter_confirmed_feat.",".$provisional_feat[$j]; | |
} | |
} | |
} | |
} | |
else # THERE IS NO PROVISIONAL FEATURES FOR $daughter_name. $daughter_name MAY BE A LEAF. | |
{ | |
if ($daughter->is_leaf) | |
{ | |
$daughter_seqname = $daughter->get_tag_value('O'); | |
if ($FEATURES->{$daughter_seqname}) | |
{ | |
$provisional_feat = $FEATURES->{$daughter_seqname}; | |
@provisional_feat = split(/\,/,$provisional_feat); | |
for ($j = 0; $j <= $#provisional_feat; $j++) | |
{ | |
if ($CONFIRMED_IN_NODE{$provisional_feat[$j]}) # IF THIS FEATURE IS CONFIRMED IN $node. | |
{ | |
if (!($SEEN{$provisional_feat[$j]})) | |
{ | |
$SEEN{$provisional_feat[$j]} = 1; | |
$daughter_confirmed_feat = $daughter_confirmed_feat.",".$provisional_feat[$j]; | |
} | |
} | |
} | |
} | |
} | |
} | |
if ($daughter_confirmed_feat ne "") | |
{ | |
$daughter_confirmed_feat = substr($daughter_confirmed_feat,1,length($daughter_confirmed_feat)-1); | |
if ($VERBOSE_PREORDER == 1) { print "___ adding $daughter_confirmed_feat to daughter $daughter_name of $node_name\n";} | |
if (!($CONFIRMED_FEAT->{$daughter_name})) { $CONFIRMED_FEAT->{$daughter_name} = $daughter_confirmed_feat;} | |
else { print STDERR "ERROR: daughter_name $daughter_name should not have a confirmed feat already.\n"; exit;} | |
} | |
} | |
} | |
} | |
#------------------------------------------------------------------# | |
# IF $node IS A SPECIATION NODE, ADD ALL ITS CONFIRMED TERMS TO THE | |
# CONFIRMED LISTS OF BOTH ITS CHILDREN: | |
sub add_confirmed_terms_to_children | |
{ | |
my $node = $_[0]; | |
my $node_name = $_[1]; | |
my $CONFIRMED_FEAT = $_[2]; | |
my $VERBOSE_PREORDER = $_[3]; | |
my $daughter; | |
my $daughter_name; | |
my $confirmed_feat; | |
if ($VERBOSE_PREORDER == 1) { print "___ adding confirmed feats for speciation node $node_name to its chidren's confirmed lists\n";} | |
# FIND THE CONFIRMED FEATURES OF $node: | |
if ($CONFIRMED_FEAT->{$node_name}) | |
{ | |
$confirmed_feat = $CONFIRMED_FEAT->{$node_name}; | |
} | |
else # $node DOES NOT HAVE ANY CONFIRMED FEATURES. | |
{ | |
return; | |
} | |
# FIND THE CHILDREN OF NODE $node: | |
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $node: | |
foreach $daughter ($node->children) | |
{ | |
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); } | |
else { $daughter_name = $daughter->internalID;} | |
if ($VERBOSE_PREORDER == 1) { print "___ adding $confirmed_feat to daughter $daughter_name of $node_name\n";} | |
if (!($CONFIRMED_FEAT->{$daughter_name})) { $CONFIRMED_FEAT->{$daughter_name} = $confirmed_feat;} | |
else { print STDERR "ERROR: daughter_name $daughter_name should not have a confirmed feat already.\n"; exit;} | |
} | |
} | |
} | |
#------------------------------------------------------------------# | |
# RECORD THE FEATURES FOR NODE $node. IN THE PRE-ORDER TRAVERSAL, | |
# WE CHOOSE A CHARACTER FOR NODE $node. WE TRY TO PICK THE SAME RESIDUE | |
# FOR THE DAUGHTER NODES IF POSSIBLE. | |
sub assign_confirmed_feats_rd2 | |
{ | |
my $PROVISIONAL_FEAT = $_[0]; #> | |
my $node = $_[1]; #> | |
my $node_name = $_[2]; #> | |
my $CONFIRMED_FEAT = $_[3]; #> | |
my $VERBOSE_PREORDER = $_[4]; #> | |
my $FEATURES = $_[5]; #> | |
my $myfeats = $_[6]; #> | |
my $root_name = $_[7]; #> | |
my $parent; #> | |
my $parent_name; #> | |
my $i; #> | |
my $myfeat; #> | |
my $key; #> | |
my $provisional; #> | |
my $confirmed; #> | |
my $rand; #> | |
my $node_seqname; #> | |
my $parent_key; #> | |
my $parent_confirmed; #> | |
my $confirmed_feat = ""; #> | |
if ($VERBOSE_PREORDER == 1) { print "___ finding confirmed feats for node $node_name\n";} | |
# IF THIS IS NOT THE ROOT: | |
if ($node_name ne $root_name) | |
{ | |
# FIND THE PARENT OF NODE $node: | |
if (defined($node->parent)) { $parent = $node->parent; } | |
else { print STDERR "ERROR: do not know parent of node $node_name.\n"; exit;} | |
$parent_name = $parent->internalID; | |
# FIND THE PROVISIONAL FEATURES FOR THE NODE $node_name: | |
for ($i = 0; $i < @$myfeats; $i++) | |
{ | |
$myfeat = $myfeats->[$i]; | |
$key = $node_name."_".$myfeat; | |
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
if (!($PROVISIONAL_FEAT->{$key})) { print STDERR "ERROR: do not know provisional feat for $key.\n"; exit;} | |
$provisional = $PROVISIONAL_FEAT->{$key}; | |
} | |
else # THIS IS A SEQUENCE. | |
{ | |
$node_seqname = $node->get_tag_value('O'); | |
$key = $node_seqname."_".$myfeat; | |
if (!($FEATURES->{$key})) { print STDERR "ERROR: do not know feat for $key.\n"; exit;} | |
$provisional = $FEATURES->{$key}; | |
} | |
$parent_key = $parent_name."_".$myfeat; | |
if (!($CONFIRMED_FEAT->{$parent_key})) { print STDERR "ERROR: do not know confirmed feat for $parent_key.\n"; exit;} | |
$parent_confirmed = $CONFIRMED_FEAT->{$parent_key}; | |
if ($provisional eq 'yes') { $confirmed = 'yes';} | |
elsif ($provisional eq 'no') { $confirmed = 'no'; } | |
elsif ($provisional eq 'yes,no') | |
{ | |
$confirmed = $parent_confirmed; | |
} | |
else { print STDERR "ERROR: provisional $provisional\n"; exit;} | |
$CONFIRMED_FEAT->{$key} = $confirmed; | |
$confirmed_feat = $confirmed_feat."_".$confirmed; | |
} | |
} | |
else # THIS IS THE ROOT: | |
{ | |
# FIND THE PROVISIONAL FEATURES FOR THE ROOT NODE: | |
for ($i = 0; $i < @$myfeats; $i++) | |
{ | |
$myfeat = $myfeats->[$i]; | |
$key = $node_name."_".$myfeat; | |
if (!($PROVISIONAL_FEAT->{$key})) { print STDERR "ERROR: do not know provisional feat for $key.\n"; exit;} | |
$provisional = $PROVISIONAL_FEAT->{$key}; | |
if ($provisional eq 'yes') { $confirmed = 'yes';} | |
elsif ($provisional eq 'no') { $confirmed = 'no'; } | |
elsif ($provisional eq 'yes,no') | |
{ | |
$rand = int rand 2; # xxx we might want to print out all equally parsimonious trees | |
if ($rand == 0) | |
{ | |
$confirmed = 'yes'; | |
} | |
elsif ($rand == 1) | |
{ | |
$confirmed = 'no'; | |
} | |
else { print STDERR "ERROR: rand $rand.\n"; exit;} | |
print "Node $node_name : taking confirmed as 'no' for $myfeat, as it could be yes or no\n"; #xxx | |
$confirmed = 'no'; # xxx # FORCE IT TO CHOOSE GAINS OVER LOSSES. | |
} | |
else { print STDERR "ERROR: provisional $provisional\n"; exit;} | |
$CONFIRMED_FEAT->{$key} = $confirmed; | |
$confirmed_feat = $confirmed_feat."_".$confirmed; | |
} | |
} | |
if ($confirmed_feat eq '') { print STDERR "ERROR: confirmed_feat $confirmed_feat\n"; exit;} | |
$confirmed_feat = substr($confirmed_feat,1,length($confirmed_feat)-1); | |
if ($VERBOSE_PREORDER == 1) { print "___ node $node_name has confirmed feat $confirmed_feat\n";} | |
} | |
#------------------------------------------------------------------# | |
# RECORD THE FEATURES FOR NODE $node. IN THE PRE-ORDER TRAVERSAL, | |
# WE ADD TO THE CONFIRMED TERM LIST OF A NODE THOSE PROVISIONAL | |
# TERMS PRESENT IN BOTH CHILDREN OF THE CURRENT NODE: | |
sub assign_confirmed_feats_rd1 | |
{ | |
my $PROVISIONAL_FEAT = $_[0]; # | |
my $node = $_[1]; # | |
my $node_name = $_[2]; # | |
my $CONFIRMED_FEAT = $_[3]; # | |
my $VERBOSE_PREORDER = $_[4]; # | |
my $FEATURES = $_[5]; # | |
my $daughter; # | |
my $daughter_name; # | |
my $daughter_seqname; # | |
my $daughter_feats; # | |
my @daughter_feats; # | |
my $i; # | |
my $no_daughters = 0; # | |
my %DAUGHTER_FEAT = (); # | |
my %SEEN_FEAT = (); # | |
my @all_feats; # | |
my $feat; # | |
my $confirmed_feat = ""; # | |
my $shared_feat; # | |
my $j; # | |
if ($VERBOSE_PREORDER == 1) { print "___ finding confirmed feats for node $node_name\n";} | |
# FIND THE CHILDREN OF NODE $node: | |
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $node: | |
foreach $daughter ($node->children) | |
{ | |
$no_daughters++; | |
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); } | |
else { $daughter_name = $daughter->internalID;} | |
if ($VERBOSE_PREORDER == 1) { print "___ looking at daughter $daughter_name of parent $node_name\n";} | |
# FIND THE FEATURES OF NODE $daughter: | |
if ($daughter->is_leaf) | |
{ | |
$daughter_seqname = $daughter->get_tag_value('O'); | |
if ($FEATURES->{$daughter_seqname}) | |
{ | |
$daughter_feats = $FEATURES->{$daughter_seqname}; | |
if ($VERBOSE_PREORDER == 1) { print "___ daughter $daughter_name ($daughter_seqname) has feats $daughter_feats\n";} | |
@daughter_feats = split(/\,/,$daughter_feats); | |
for ($i = 0; $i <= $#daughter_feats; $i++) | |
{ | |
$DAUGHTER_FEAT{$no_daughters."_".$daughter_feats[$i]} = 1; | |
if (!($SEEN_FEAT{$daughter_feats[$i]})) | |
{ | |
$SEEN_FEAT{$daughter_feats[$i]} = 1; | |
@all_feats = (@all_feats,$daughter_feats[$i]); | |
} | |
} | |
} | |
} | |
else | |
{ | |
if ($PROVISIONAL_FEAT->{$daughter_name}) | |
{ | |
$daughter_feats = $PROVISIONAL_FEAT->{$daughter_name}; | |
if ($VERBOSE_PREORDER == 1) { print "___ daughter $daughter_name has feats $daughter_feats\n";} | |
@daughter_feats = split(/\,/,$daughter_feats); | |
for ($i = 0; $i <= $#daughter_feats; $i++) | |
{ | |
$DAUGHTER_FEAT{$no_daughters."_".$daughter_feats[$i]} = 1; | |
if (!($SEEN_FEAT{$daughter_feats[$i]})) | |
{ | |
$SEEN_FEAT{$daughter_feats[$i]} = 1; | |
@all_feats = (@all_feats,$daughter_feats[$i]); | |
} | |
} | |
} | |
} | |
} | |
} | |
# FIND THE FEATURES THAT ARE SHARED BY ALL THE DAUGHTERS OF NODE $node: | |
$confirmed_feat = ""; | |
for ($i = 0; $i <= $#all_feats; $i++) | |
{ | |
$feat = $all_feats[$i]; | |
$shared_feat = 1; | |
for ($j = 1; $j <= $no_daughters; $j++) | |
{ | |
if (!($DAUGHTER_FEAT{$j."_".$feat})) { $shared_feat = 0;} | |
} | |
if ($shared_feat == 1) | |
{ | |
$confirmed_feat = $confirmed_feat.",".$feat; | |
} | |
} | |
if ($confirmed_feat ne "") | |
{ | |
$confirmed_feat = substr($confirmed_feat,1,length($confirmed_feat)-1); | |
if ($VERBOSE_PREORDER == 1) { print "___ node $node_name has confirmed feat $confirmed_feat\n";} | |
$CONFIRMED_FEAT->{$node_name} = $confirmed_feat; | |
} | |
} | |
#------------------------------------------------------------------# | |
# GET THE PROVISIONAL FEATURES IN A NODE: | |
sub get_daughter_feats | |
{ | |
my $daughter_seqname = $_[0]; #> | |
my $PROVISIONAL_FEAT = $_[1]; #> | |
my $myfeats = $_[2]; #> | |
my $daughter_feats = ""; #> | |
my $i; #> | |
my $key; #> | |
my $feats; #> | |
my $success = 1; #> | |
for ($i = 0; $i < @$myfeats; $i++) | |
{ | |
$key = $daughter_seqname."_".$myfeats->[$i]; | |
if (!($PROVISIONAL_FEAT->{$key})) | |
{ | |
$success = 0; | |
# WE DO NOT HAVE THE PROVISIONAL FEATURES FOR THIS NODE YET: | |
return($success,$daughter_feats); | |
} | |
$feats = $PROVISIONAL_FEAT->{$key}; # yes OR no OR yes,no | |
if ($feats ne 'yes' && $feats ne 'no' && $feats ne 'yes,no') { print "ERROR: feats $feats.\n"; exit;} | |
$daughter_feats = $daughter_feats."_".$feats; | |
} | |
$daughter_feats = substr($daughter_feats,1,length($daughter_feats)-1); | |
return($success,$daughter_feats); | |
} | |
#------------------------------------------------------------------# | |
sub get_intersection | |
{ | |
my $myfeats = $_[0]; #> | |
my $i; #> | |
my $j; #> | |
my $myfeat; #> | |
my $intersection = ""; #> | |
my @temp; #> | |
my $no_feats = 0; #> | |
my %CNT = (); #> | |
for ($i = 0; $i < @$myfeats; $i++) | |
{ | |
$no_feats++; | |
$myfeat = $myfeats->[$i]; | |
@temp = split(/\,/,$myfeat); | |
for ($j = 0; $j <= $#temp; $j++) | |
{ | |
if (!($CNT{$temp[$j]})) { $CNT{$temp[$j]} = 1;} | |
else { $CNT{$temp[$j]}++; } | |
} | |
} | |
foreach $myfeat (keys %CNT) | |
{ | |
if ($CNT{$myfeat} == $no_feats) | |
{ | |
$intersection = $intersection.",".$myfeat; | |
} | |
} | |
if ($intersection ne "") | |
{ | |
$intersection = substr($intersection,1,length($intersection)-1); | |
if ($intersection eq 'no,yes') { $intersection = "yes,no";} | |
if ($intersection ne "no" && $intersection ne "yes" && $intersection ne "yes,no") | |
{ | |
print "ERROR: intersection $intersection\n"; | |
exit; | |
} | |
} | |
return $intersection; | |
} | |
#------------------------------------------------------------------# | |
sub get_union | |
{ | |
my $myfeats = $_[0]; | |
my $i; | |
my $j; | |
my $myfeat; | |
my $union = ""; | |
my @temp; | |
my %SEEN = (); | |
for ($i = 0; $i < @$myfeats; $i++) | |
{ | |
$myfeat = $myfeats->[$i]; | |
@temp = split(/\,/,$myfeat); | |
for ($j = 0; $j <= $#temp; $j++) | |
{ | |
if (!($SEEN{$temp[$j]})) | |
{ | |
$SEEN{$temp[$j]} = 1; | |
$union = $union.",".$temp[$j]; | |
} | |
} | |
} | |
if ($union eq "") { print "ERROR: union is $union !"; exit;} | |
$union = substr($union,1,length($union)-1); | |
if ($union eq 'no,yes') { $union = "yes,no";} | |
if ($union ne "no" && $union ne "yes" && $union ne "yes,no") | |
{ | |
print "ERROR: union $union\n"; | |
exit; | |
} | |
return $union; | |
} | |
#------------------------------------------------------------------# | |
# GET THE PROVISIONAL FEATURES FOR A PARENT NODE $parent_name: | |
sub get_provisional_feat | |
{ | |
my $provisional_feats = $_[0]; #> THE PROVISIONAL FEATURES FOR THE DAUGHTER NODES. | |
my $parent_name = $_[1]; #> | |
my $myfeats = $_[2]; #> | |
my $PROVISIONAL_FEAT = $_[3]; #> | |
my $VERBOSE_POSTORDER = $_[4]; #> | |
my $i; #> | |
my $provisional_feat; #> | |
my @provisional_feat; #> | |
my $j; #> | |
my @daughter_feats; #> | |
my $daughter_feat; #> | |
my $intersection = ""; #> | |
my $union = ""; #> | |
my $parent_provisional_feat = ""; #> | |
my $myfeat; #> | |
my $key; #> | |
my $cost = 0; #> | |
for ($j = 0; $j < @$myfeats; $j++) | |
{ | |
$myfeat = $myfeats->[$j]; | |
@daughter_feats = &Avril_modules::empty_array(@daughter_feats); | |
$intersection = ""; | |
$union = ""; | |
# GET THE PROVISIONAL FEATURES FOR $parent_name FOR $myfeats->[$j]: | |
for ($i = 0; $i < @$provisional_feats; $i++) | |
{ | |
# GET THE PROVISIONAL FEATURES FOR DAUGHTER $i FOR $j: | |
$provisional_feat = $provisional_feats->[$i]; # eg yes_yes_no_yes,no_no | |
@provisional_feat = split(/\_/,$provisional_feat); | |
$daughter_feat = $provisional_feat[$j]; | |
@daughter_feats = (@daughter_feats,$daughter_feat); | |
} | |
# SEE IF THE DAUGHTER NODES SHARE ANY FEATURES: | |
$union = &get_union(\@daughter_feats); | |
$intersection = &get_intersection(\@daughter_feats); | |
# IF THE INTESECTION {S} OF THE DAUGHTER NODES' FEATURES IS NON-EMPTY, | |
# THEN THE PARENT'S PROVISIONAL FEATURE SET IS SET EQUAL TO THAT INTERSECTION {S}. | |
if ($intersection ne "") | |
{ | |
$key = $parent_name."_".$myfeat; | |
$PROVISIONAL_FEAT->{$key} = $intersection; | |
$parent_provisional_feat = $parent_provisional_feat."_".$intersection; | |
} | |
# IF THE INTERSECTION S OF THE DAUGHTER NODES' FEATURES IS EMPTY, | |
# THEN SET THE PARENT'S PROVISIONAL FEATURE SET TO BE EQUAL TO THE UNION | |
# OF THE DAUGHTER NODES' FEATURES: | |
else | |
{ | |
$key = $parent_name."_".$myfeat; | |
$PROVISIONAL_FEAT->{$key} = $union; | |
$parent_provisional_feat = $parent_provisional_feat."_".$union; | |
# INCREMENT COST BY 1: | |
$cost++; | |
} | |
} | |
$parent_provisional_feat = substr($parent_provisional_feat,1,length($parent_provisional_feat)-1); | |
if ($VERBOSE_POSTORDER == 1) | |
{ | |
print "Parent $parent_name has provisional feat $parent_provisional_feat : cost $cost\n"; | |
} | |
return($cost); | |
} | |
#------------------------------------------------------------------# | |
# RECORD THE FEATURES FOR PARENT $parent. | |
# IN THE POST-ORDER TRAVERSAL, WE ASSIGN TO EACH INTERNAL NODE | |
# A SET OF PROVISIONAL TERMS, SET S: | |
# (i) IF THE INTERSECTION OF THE TERMS ASSIGNED TO ITS CHILDREN IS NOT EMPTY, | |
# THEN SET S IS THAT INTERSECTION. | |
# (ii) IF THE INTERSECTION OF THE TERMS ASSIGNED TO ITS CHILDREN IS EMPTY, | |
# THEN SET S IS THE UNION OF THE TERMS ASSIGNED TO ITS CHILDREN. | |
sub assign_provisional_feats_rd2 | |
{ | |
my $FEATURES = $_[0]; #> THE FEATURES. | |
my $parent = $_[1]; #> | |
my $parent_name = $_[2]; #> | |
my $PROVISIONAL_FEAT = $_[3]; #> | |
my $VERBOSE_POSTORDER = $_[4]; #> | |
my $myfeats = $_[5]; #> ARRAY OF ALL THE FEATURES IN THE TREE. | |
my $daughter; #> | |
my $daughter_name; #> | |
my $daughter_seqname; #> | |
my $daughter_feats; #> | |
my @daughter_feats; #> | |
my $cost = 0; #> | |
my $success = 1; #> | |
if ($VERBOSE_POSTORDER == 1) { print "___ finding provisional feats for parent $parent_name\n";} | |
# FIND THE CHILDREN OF NODE $parent: | |
if (!($parent->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $parent: | |
foreach $daughter ($parent->children) | |
{ | |
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); } | |
else { $daughter_name = $daughter->internalID;} | |
if ($VERBOSE_POSTORDER == 1) { print "___ looking at daughter $daughter_name of parent $parent_name\n";} | |
# FIND THE FEATURES OF NODE $daughter: | |
if ($daughter->is_leaf) | |
{ | |
$daughter_seqname = $daughter->get_tag_value('O'); | |
($success,$daughter_feats) = &get_daughter_feats($daughter_seqname,$FEATURES,$myfeats); | |
if ($success == 0) { print STDERR "ERROR: could not get feats for $daughter_seqname\n"; exit;} | |
@daughter_feats = (@daughter_feats,$daughter_feats); | |
if ($VERBOSE_POSTORDER == 1) { print "___ daughter $daughter_name ($daughter_seqname) has feats $daughter_feats\n";} | |
} | |
else # THE DAUGHTER $daughter IS AN INTERNAL NODE: | |
{ | |
($success,$daughter_feats) = &get_daughter_feats($daughter_name,$PROVISIONAL_FEAT,$myfeats); | |
if ($success == 0) | |
{ | |
if ($VERBOSE_POSTORDER == 1) { print "Do not know provisional feats for $daughter_name yet ...\n";} | |
return($cost,$success); | |
} | |
@daughter_feats = (@daughter_feats,$daughter_feats); | |
if ($VERBOSE_POSTORDER == 1) { print "___ daughter $daughter_name (internal) has feats $daughter_feats\n";} | |
} | |
} | |
} | |
else { print STDERR "ERROR: parent $parent_name is a sequence.\n"; exit;} | |
# GET THE PROVISIONAL FEATURES FOR $parent_name: | |
$cost = &get_provisional_feat(\@daughter_feats,$parent_name,$myfeats,$PROVISIONAL_FEAT, | |
$VERBOSE_POSTORDER); | |
return($cost,$success); | |
} | |
#------------------------------------------------------------------# | |
# RECORD THE FEATURES FOR PARENT $parent. IN THE POST-ORDER | |
# TRAVERSAL, WE ASSIGN TO EACH INTERNAL NODE A SET OF PROVISIONAL | |
# TERMS THAT IS THE UNION OF THE TERMS ASSIGNED TO ITS CHILDREN: | |
sub assign_provisional_feats_rd1 | |
{ | |
my $FEATURES = $_[0]; # THE FEATURES. | |
my $parent = $_[1]; # | |
my $parent_name = $_[2]; # | |
my $PROVISIONAL_FEAT = $_[3]; # | |
my $VERBOSE_POSTORDER = $_[4]; # | |
my $daughter; # | |
my $daughter_name; # | |
my $daughter_seqname; # | |
my $daughter_feats; # | |
my @daughter_feats; # | |
my $provisional_feat = ""; # | |
my $i; # | |
my %SEEN = (); # | |
if ($VERBOSE_POSTORDER == 1) { print "___ finding provisional feats for parent $parent_name\n";} | |
# FIND THE CHILDREN OF NODE $parent: | |
if (!($parent->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $parent: | |
foreach $daughter ($parent->children) | |
{ | |
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); } | |
else { $daughter_name = $daughter->internalID;} | |
if ($VERBOSE_POSTORDER == 1) { print "___ looking at daughter $daughter_name of parent $parent_name\n";} | |
# FIND THE FEATURES OF NODE $daughter: | |
if ($daughter->is_leaf) | |
{ | |
$daughter_seqname = $daughter->get_tag_value('O'); | |
if ($FEATURES->{$daughter_seqname}) | |
{ | |
$daughter_feats = $FEATURES->{$daughter_seqname}; | |
if ($VERBOSE_POSTORDER == 1) { print "___ daughter $daughter_name ($daughter_seqname) has feats $daughter_feats\n";} | |
@daughter_feats = split(/\,/,$daughter_feats); | |
for ($i = 0; $i <= $#daughter_feats; $i++) | |
{ | |
if (!($SEEN{$daughter_feats[$i]})) | |
{ | |
$SEEN{$daughter_feats[$i]} = 1; | |
$provisional_feat = $provisional_feat.",".$daughter_feats[$i]; | |
} | |
} | |
} | |
} | |
else | |
{ | |
if ($PROVISIONAL_FEAT->{$daughter_name}) | |
{ | |
$daughter_feats = $PROVISIONAL_FEAT->{$daughter_name}; | |
if ($VERBOSE_POSTORDER == 1) { print "___ daughter $daughter_name has feats $daughter_feats\n";} | |
@daughter_feats = split(/\,/,$daughter_feats); | |
for ($i = 0; $i <= $#daughter_feats; $i++) | |
{ | |
if (!($SEEN{$daughter_feats[$i]})) | |
{ | |
$SEEN{$daughter_feats[$i]} = 1; | |
$provisional_feat = $provisional_feat.",".$daughter_feats[$i]; | |
} | |
} | |
} | |
} | |
} | |
} | |
else { print STDERR "ERROR: parent $parent_name is a sequence.\n"; exit;} | |
if ($provisional_feat ne "") | |
{ | |
$provisional_feat = substr($provisional_feat,1,length($provisional_feat)-1); | |
if ($VERBOSE_POSTORDER == 1) { print "___ parent $parent_name has provisional feat $provisional_feat\n";} | |
$PROVISIONAL_FEAT->{$parent_name} = $provisional_feat; | |
} | |
} | |
#------------------------------------------------------------------# | |
# DO A POSTORDER TRAVERSAL OF THE TREE: | |
sub do_postorder_traversal_rd2 | |
{ | |
my $tree = $_[0]; #> THE TREE OBJECT. | |
my $FEATURES = $_[1]; #> THE FEATURES. | |
my $myfeats = $_[2]; #> ARRAY OF ALL THE FEATURES IN THE WHOLE FAMILY. | |
my $root; #> | |
my $root_name; #> | |
my @nodes; #> | |
my @parents; #> | |
my $no_rounds; #> | |
my $no_new_nodes; #> | |
my %SEEN_NODE = (); #> | |
my %SEEN_PARENT = (); #> | |
my $node; #> | |
my $node_name; #> | |
my $VERBOSE_POSTORDER = 0; #> | |
my $parent; #> | |
my $parent_name; #> | |
my $daughter; #> | |
my $daughter_name; #> | |
my $i; #> | |
my %PROVISIONAL_FEAT = (); #> | |
my $total_cost = 0; #> | |
my $cost = 0; #> | |
my $success = 0; #> | |
# FIND THE ROOT OF THE TREE: | |
$root = $tree->root; | |
$root_name = $root->internalID; | |
# LIST ALL THE LEAVES FROM THE TREE: | |
@nodes = &Avril_modules::empty_array(@nodes); | |
@parents = &Avril_modules::empty_array(@parents); | |
@nodes = $tree->get_leaves; | |
$no_rounds = 0; | |
NEXT_ROUND1_rd2: | |
$no_rounds++; | |
$no_new_nodes = 0; | |
%SEEN_NODE = (); | |
%SEEN_PARENT = (); | |
# LOOP THROUGH EACH NODE IN THE ARRAY @nodes. WHEN $no_rounds=1, | |
# THIS ARRAY CONTAINS THE LEAVES. WHEN $no_rounds=2, THIS ARRAY | |
# CONTAINS THEIR PARENTS. WHEN $no_rounds=3, THIS ARRAY CONTAINS | |
# THEIR GRANDPARENTS, AND SO ON. | |
foreach $node(@nodes) | |
{ | |
if ($node->is_leaf) { $node_name = $node->id(); } | |
else { $node_name = $node->internalID(); } | |
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited node $node_name\n";} | |
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;} | |
# FIND THE PARENT OF NODE $node: | |
if (defined($node->parent)) { $parent = $node->parent; } | |
else { print STDERR "ERROR: do not know parent of node $node_name.\n"; exit;} | |
$parent_name = $parent->internalID; | |
# FIND THE OTHER CHILD OF NODE $parent: | |
if (!($parent->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $parent: | |
foreach $daughter ($parent->children) | |
{ | |
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); } | |
else { $daughter_name = $daughter->internalID;} | |
if ($daughter_name ne $node_name) | |
{ | |
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited sibling $daughter_name\n";} | |
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;} | |
} | |
} | |
} | |
else { print STDERR "ERROR: parent $parent_name is a sequence.\n"; exit;} | |
# RECORD THAT WE VISITED THE PARENT $parent: | |
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: saw parent $parent_name\n";} | |
if (!($SEEN_PARENT{$parent_name}) && $parent_name ne $root_name) | |
{ | |
# RECORD THE FEATURES FOR PARENT $parent. | |
# IN THE POST-ORDER TRAVERSAL, WE ASSIGN TO EACH INTERNAL NODE | |
# A SET OF PROVISIONAL TERMS, SET S: | |
# (i) IF THE INTERSECTION OF THE TERMS ASSIGNED TO ITS CHILDREN IS NOT EMPTY, | |
# THEN SET S IS THAT INTERSECTION. | |
# (ii) IF THE INTERSECTION OF THE TERMS ASSIGNED TO ITS CHILDREN IS EMPTY, | |
# THEN SET S IS THE UNION OF THE TERMS ASSIGNED TO ITS CHILDREN. | |
($cost,$success) = &assign_provisional_feats_rd2($FEATURES,$parent,$parent_name,\%PROVISIONAL_FEAT, | |
$VERBOSE_POSTORDER,$myfeats); | |
if ($success == 1) | |
{ | |
@parents = (@parents,$parent); | |
$SEEN_PARENT{$parent_name} = 1; | |
$total_cost = $total_cost + $cost; | |
} | |
else # xxx need to fix for rd1 alg | |
{ | |
@parents = (@parents,$node); | |
$SEEN_PARENT{$node_name} = 1; | |
} | |
} | |
} | |
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds rounds: number new = $no_new_nodes\n";} | |
# IF WE DID SEE SOME NEW NODES ON THIS TRAVERSAL: | |
if ($no_new_nodes > 0) | |
{ | |
@nodes = &Avril_modules::empty_array(@nodes); | |
for ($i = 0; $i <= $#parents; $i++) { @nodes = (@nodes,$parents[$i]);} | |
@parents = &Avril_modules::empty_array(@parents); | |
goto NEXT_ROUND1_rd2; | |
} | |
# VISIT ROOT: | |
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited root $root_name\n";} | |
($cost,$success) = &assign_provisional_feats_rd2($FEATURES,$root,$root_name,\%PROVISIONAL_FEAT, | |
$VERBOSE_POSTORDER,$myfeats); | |
if ($success == 0) { print STDERR "ERROR: success $success at root node.\n"; exit;} | |
$total_cost = $total_cost + $cost; | |
if ($VERBOSE_POSTORDER == 1) { print "Finished post-order traversal, total cost $total_cost\n";} | |
return(\%PROVISIONAL_FEAT); | |
} | |
#------------------------------------------------------------------# | |
# DO A POSTORDER TRAVERSAL OF THE TREE: | |
sub do_postorder_traversal_rd1 | |
{ | |
my $tree = $_[0]; # THE TREE OBJECT. | |
my $FEATURES = $_[1]; # THE FEATURES. | |
my $root; # | |
my $root_name; # | |
my @nodes; # | |
my @parents; # | |
my $no_rounds; # | |
my $no_new_nodes; # | |
my %SEEN_NODE = (); # | |
my %SEEN_PARENT = (); # | |
my $node; # | |
my $node_name; # | |
my $VERBOSE_POSTORDER = 0; # | |
my $parent; # | |
my $parent_name; # | |
my $daughter; # | |
my $daughter_name; # | |
my $i; # | |
my %PROVISIONAL_FEAT = (); # | |
# FIND THE ROOT OF THE TREE: | |
$root = $tree->root; | |
$root_name = $root->internalID; | |
# LIST ALL THE LEAVES FROM THE TREE: | |
@nodes = &Avril_modules::empty_array(@nodes); | |
@parents = &Avril_modules::empty_array(@parents); | |
@nodes = $tree->get_leaves; | |
$no_rounds = 0; | |
NEXT_ROUND1_rd1: | |
$no_rounds++; | |
$no_new_nodes = 0; | |
%SEEN_NODE = (); | |
%SEEN_PARENT = (); | |
# LOOP THROUGH EACH NODE IN THE ARRAY @nodes. WHEN $no_rounds=1, | |
# THIS ARRAY CONTAINS THE LEAVES. WHEN $no_rounds=2, THIS ARRAY | |
# CONTAINS THEIR PARENTS. WHEN $no_rounds=3, THIS ARRAY CONTAINS | |
# THEIR GRANDPARENTS, AND SO ON. | |
foreach $node(@nodes) | |
{ | |
if ($node->is_leaf) { $node_name = $node->id(); } | |
else { $node_name = $node->internalID(); } | |
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited node $node_name\n";} | |
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;} | |
# FIND THE PARENT OF NODE $node: | |
if (defined($node->parent)) { $parent = $node->parent; } | |
else { print STDERR "ERROR: do not know parent of node $node_name.\n"; exit;} | |
$parent_name = $parent->internalID; | |
# FIND THE OTHER CHILD OF NODE $parent: | |
if (!($parent->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $parent: | |
foreach $daughter ($parent->children) | |
{ | |
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); } | |
else { $daughter_name = $daughter->internalID;} | |
if ($daughter_name ne $node_name) | |
{ | |
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited sibling $daughter_name\n";} | |
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;} | |
} | |
} | |
} | |
else { print STDERR "ERROR: parent $parent_name is a sequence.\n"; exit;} | |
# RECORD THAT WE VISITED THE PARENT $parent: | |
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: saw parent $parent_name\n";} | |
if (!($SEEN_PARENT{$parent_name}) && $parent_name ne $root_name) | |
{ | |
@parents = (@parents,$parent); | |
$SEEN_PARENT{$parent_name} = 1; | |
# RECORD THE FEATURES FOR PARENT $parent. IN THE POST-ORDER | |
# TRAVERSAL, WE ASSIGN TO EACH INTERNAL NODE A SET OF PROVISIONAL | |
# TERMS THAT IS THE UNION OF THE TERMS ASSIGNED TO ITS CHILDREN: | |
&assign_provisional_feats_rd1($FEATURES,$parent,$parent_name,\%PROVISIONAL_FEAT,$VERBOSE_POSTORDER); | |
} | |
} | |
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds rounds: number new = $no_new_nodes\n";} | |
# IF WE DID SEE SOME NEW NODES ON THIS TRAVERSAL: | |
if ($no_new_nodes > 0) | |
{ | |
@nodes = &Avril_modules::empty_array(@nodes); | |
for ($i = 0; $i <= $#parents; $i++) { @nodes = (@nodes,$parents[$i]);} | |
@parents = &Avril_modules::empty_array(@parents); | |
goto NEXT_ROUND1_rd1; | |
} | |
# VISIT ROOT: | |
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited root $root_name\n";} | |
&assign_provisional_feats_rd1($FEATURES,$root,$root_name,\%PROVISIONAL_FEAT,$VERBOSE_POSTORDER); | |
if ($VERBOSE_POSTORDER == 1) { print "Finished post-order traversal\n";} | |
return(\%PROVISIONAL_FEAT); | |
} | |
#------------------------------------------------------------------# | |
# GET THE DESCENDANTS OF A NODE: | |
sub get_descendants | |
{ | |
my $this_node = $_[0]; # | |
my $SPECIES_WITH_DATA = $_[1]; # | |
my @nodes; # | |
my @children; # | |
my $no_rounds; # | |
my $no_new_nodes; # | |
my %SEEN_NODE = (); # | |
my %SEEN_CHILD = (); # | |
my $node; # | |
my $node_name; # | |
my $child; # | |
my $child_name; # | |
my $i; # | |
my $descendants = ""; # | |
my $no_descendants = 0; # | |
# ARRAY @nodes STARTS BY CONTAINING JUST THE NODE $this_node: | |
@nodes = &Avril_modules::empty_array(@nodes); | |
@children = &Avril_modules::empty_array(@children); | |
@nodes = (@nodes,$this_node); | |
$no_rounds = 0; | |
GET_MORE_DESCENDANTS: | |
$no_rounds++; | |
$no_new_nodes = 0; | |
%SEEN_NODE = (); | |
%SEEN_CHILD = (); | |
# LOOP THROUGH EACH NODE IN THE ARRAY @nodes. WHEN $no_rounds=1, | |
# THIS ARRAY CONTAINS THE ROOT. WHEN $no_rounds=2, THIS ARRAY | |
# CONTAINS ITS CHILDREN. WHEN $no_rounds=3, THIS ARRAY CONTAINS | |
# ITS GRANDCHILDREN, AND SO ON. | |
foreach $node(@nodes) | |
{ | |
if ($node->is_leaf) { $node_name = $node->id(); } | |
else { $node_name = $node->internalID(); } | |
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;} | |
# FIND THE CHILDREN OF NODE $node: | |
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $node: | |
foreach $child ($node->children) | |
{ | |
if ($child->is_leaf) { $child_name = $child->id(); } | |
else { $child_name = $child->internalID;} | |
# RECORD THAT WE SAW THE CHILD $child: | |
if (!($SEEN_CHILD{$child_name})) | |
{ | |
@children = (@children,$child); | |
$SEEN_CHILD{$child_name} = 1; | |
} | |
} | |
} | |
else | |
{ | |
$node_seqname = $node->get_tag_value('O'); | |
# ONLY TAKE THIS SEQUENCE IF IT IS FROM A SPECIES WITH GO/PFAM/OTHER FEATURE ANNOTATION DATA: | |
if ($SPECIES_WITH_DATA->{$node_seqname}) | |
{ | |
$descendants = $descendants.",".$node_seqname; | |
$no_descendants++; | |
} | |
} | |
# ELSE, THIS NODE IS A SEQUENCE, SO DOES NOT HAVE ANY CHILDREN. | |
} | |
# IF WE DID SEE SOME NEW NODES ON THIS TRAVERSAL: | |
if ($no_new_nodes > 0) | |
{ | |
@nodes = &Avril_modules::empty_array(@nodes); | |
for ($i = 0; $i <= $#children; $i++) { @nodes = (@nodes,$children[$i]);} | |
@children = &Avril_modules::empty_array(@children); | |
goto GET_MORE_DESCENDANTS; | |
} | |
if ($descendants ne '') | |
{ | |
$descendants = substr($descendants,1,length($descendants)-1); | |
} | |
return($descendants,$no_descendants); | |
} | |
#------------------------------------------------------------------# | |
# CHECK WHETHER THE CONFIRMED FEATURES FOR NODE $node ARE | |
# DIFFERENT FROM THOSE OF ITS CHILDREN. | |
sub compare_parent_to_children_rd2 | |
{ | |
my $CONFIRMED_FEAT = $_[0]; #> | |
my $node = $_[1]; #> | |
my $node_name = $_[2]; #> | |
my $VERBOSE_CHANGES = $_[3]; #> | |
my $myfeats = $_[4]; #> | |
my $SPECIES_WITH_DATA = $_[5]; #> | |
my $daughter; #> | |
my $daughter_name; #> | |
my $daughter_seqname; #> | |
my $daughter_key; #> | |
my $j; #> | |
my $confirmed1; #> | |
my $confirmed2; #> | |
my $myfeat; #> | |
my $key; #> | |
my $leaf = 0; #> | |
my $descendants; #> | |
my $no_descendants; #> | |
my $node_type; #> | |
# COMPARE $node TO EACH OF ITS CHILDREN FOR EACH FEATURE: | |
if ($VERBOSE_CHANGES == 1) { print "___ comparing parent $node_name to its children\n";} | |
# CHECK IF $node IS A SPECIATION NODE OR A DUPLICATION NODE: | |
# $node_type HAS VALUE Y FOR DUPLICATION, N FOR SPECIATION. | |
$node_type = $node->get_tag_value('D'); | |
if ($node_type eq 'Y') { $node_type = "duplication"; } | |
elsif ($node_type eq 'N') { $node_type = "speciation"; } | |
else { print STDERR "ERROR: node_type $node_type.\n"; exit;} | |
for ($j = 0; $j < @$myfeats; $j++) | |
{ | |
$myfeat = $myfeats->[$j]; | |
$key = $node_name."_".$myfeat; | |
if (!($CONFIRMED_FEAT->{$key})) { print STDERR "ERROR: do not know confirmed feat for $key\n"; exit;} | |
$confirmed1 = $CONFIRMED_FEAT->{$key}; | |
# FIND THE TWO DAUGHTER NODES OF $node: | |
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $node: | |
foreach $daughter ($node->children) | |
{ | |
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); } | |
else { $daughter_name = $daughter->internalID;} | |
# FIND THE CONFIRMED FEATURES OF NODE $daughter: | |
if ($daughter->is_leaf) | |
{ | |
$leaf = 1; | |
$daughter_seqname = $daughter->get_tag_value('O'); | |
$daughter_key = $daughter_seqname."_".$myfeat; | |
} | |
else | |
{ | |
$leaf = 0; | |
$daughter_key = $daughter_name."_".$myfeat; | |
} | |
if (!($CONFIRMED_FEAT->{$daughter_key})) { print STDERR "ERROR: do not know confirmed feat for $daughter_key\n"; exit;} | |
$confirmed2 = $CONFIRMED_FEAT->{$daughter_key}; | |
if ($confirmed1 ne $confirmed2) | |
{ | |
if ($confirmed1 eq 'yes' && $confirmed2 eq 'no') # APPARENT FEATURE LOSS. | |
{ | |
($descendants,$no_descendants) = &get_descendants($daughter,$SPECIES_WITH_DATA); | |
if ($leaf == 1) | |
{ | |
print "*** FEATURE_LOSS_LEAF: $node_type node $node_name has feature $myfeat, that daughter $daughter_name does not have\n"; | |
print " FEATURE_LOSS_LEAF: $daughter_name has descendants $descendants\n\n"; | |
} | |
else | |
{ | |
if ($no_descendants > 1) | |
{ | |
print "*** FEATURE_LOSS_INTERNAL: $node_type node $node_name has feature $myfeat, that daughter $daughter_name does not have\n"; | |
print " FEATURE_LOSS_INTERNAL: $daughter_name has descendants $descendants\n\n"; | |
} | |
else | |
{ | |
print "*** FEATURE_LOSS_LEAF: $node_type node $node_name has feature $myfeat, that daughter $daughter_name does not have\n"; | |
print " FEATURE_LOSS_LEAF: $daughter_name has descendants $descendants\n\n"; | |
} | |
} | |
} | |
elsif ($confirmed1 eq 'no' && $confirmed2 eq 'yes') # APPARENT FEATURE GAIN. | |
{ | |
($descendants,$no_descendants) = &get_descendants($daughter,$SPECIES_WITH_DATA); | |
if ($leaf == 1) | |
{ | |
print "*** FEATURE_GAIN_LEAF: $node_type node $node_name doesn't have feature $myfeat, but daughter $daughter_name does have it\n"; | |
print "*** FEATURE_GAIN_LEAF: $daughter_name has descendants $descendants\n\n"; | |
} | |
else | |
{ | |
if ($no_descendants > 1) | |
{ | |
print "*** FEATURE_GAIN_INTERNAL: $node_type node $node_name doesn't have feature $myfeat, but daughter $daughter_name does have it\n"; | |
print " FEATURE_GAIN_INTERNAL: $daughter_name has descendants $descendants\n\n"; | |
} | |
else | |
{ | |
print "*** FEATURE_GAIN_LEAF: $node_type node $node_name doesn't have feature $myfeat, but daughter $daughter_name does have it\n"; | |
print " FEATURE_GAIN_LEAF: $daughter_name has descendants $descendants\n\n"; | |
} | |
} | |
} | |
} | |
} | |
} | |
else | |
{ | |
print STDERR "ERROR: parent $node_name is a sequence\n"; exit; | |
} | |
} | |
} | |
#------------------------------------------------------------------# | |
# CHECK WHETHER THE CONFIRMED FEATURES FOR NODE $node ARE | |
# DIFFERENT FROM THOSE OF ITS CHILDREN. | |
sub compare_parent_to_children_rd1 | |
{ | |
my $CONFIRMED_FEAT = $_[0]; # | |
my $node = $_[1]; # | |
my $node_name = $_[2]; # | |
my $VERBOSE_CHANGES = $_[3]; # | |
my $FEATURES = $_[4]; # | |
my $SPECIES_WITH_DATA = $_[5]; # | |
my $daughter; # | |
my $daughter_name; # | |
my $daughter_seqname; # | |
my $daughter_feats; # | |
my $node_feats; # | |
my @daughter_feats; # | |
my @node_feats; # | |
my %NODE_FEAT = (); # | |
my %DAUGHTER_FEAT = (); # | |
my $i; # | |
my $feat; # | |
my %SEEN = (); # | |
my $change1; # | |
my $change2; # | |
my $descendants; # | |
my $no_descendants; # | |
# FIND THE FEATURES FOR $node: | |
if (!($CONFIRMED_FEAT->{$node_name})) { $node_feats = "none"; } | |
else { $node_feats = $CONFIRMED_FEAT->{$node_name};} | |
if ($VERBOSE_CHANGES == 1) { print "___ comparing parent $node_name ($node_feats) to its children\n";} | |
# FIND THE CHILDREN OF NODE $node: | |
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $node: | |
foreach $daughter ($node->children) | |
{ | |
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); } | |
else { $daughter_name = $daughter->internalID;} | |
if ($VERBOSE_CHANGES == 1) { print "___ comparing parent $node_name to daughter $daughter_name\n";} | |
# FIND THE CONFIRMED FEATURES OF NODE $daughter: | |
$daughter_feats = ""; | |
if ($daughter->is_leaf) | |
{ | |
$daughter_seqname = $daughter->get_tag_value('O'); | |
if ($FEATURES->{$daughter_seqname}) | |
{ | |
$daughter_feats = $FEATURES->{$daughter_seqname}; | |
} | |
else { $daughter_feats = "none";} | |
if ($VERBOSE_CHANGES == 1) { print "___ daughter $daughter_name ($daughter_seqname) has feats $daughter_feats\n";} | |
} | |
else | |
{ | |
if ($CONFIRMED_FEAT->{$daughter_name}) { $daughter_feats = $CONFIRMED_FEAT->{$daughter_name};} | |
else { $daughter_feats = "none"; } | |
if ($VERBOSE_CHANGES == 1) { print "___ daughter $daughter_name has feats $daughter_feats\n";} | |
} | |
# CHECK IF $daughter_feats IS DIFFERENT FROM $node_feats: | |
@node_feats = split(/\,/,$node_feats); | |
@daughter_feats = split(/\,/,$daughter_feats); | |
%NODE_FEAT = (); | |
%DAUGHTER_FEAT = (); | |
# RECORD ALL THE FEATURES FOR $node: | |
for ($i = 0; $i <= $#node_feats; $i++) | |
{ | |
if ($node_feats[$i] ne 'none') | |
{ | |
$NODE_FEAT{$node_feats[$i]} = 1; | |
} | |
} | |
# RECORD ALL THE FEATURES FOR $daughter: | |
for ($i = 0; $i <= $#daughter_feats; $i++) | |
{ | |
if ($daughter_feats[$i] ne 'none') | |
{ | |
$DAUGHTER_FEAT{$daughter_feats[$i]} = 1; | |
} | |
} | |
# CHECK IF $node HAS ANY FEATURES THAT $daughter DOES NOT HAVE: | |
foreach $feat (keys %NODE_FEAT) | |
{ | |
if (!($DAUGHTER_FEAT{$feat})) | |
{ | |
$change1 = $node_name."_".$daughter_name."_".$feat; | |
$change2 = $daughter_name."_".$node_name."_".$feat; | |
if (!($SEEN{$change1}) && !($SEEN{$change2})) | |
{ | |
$SEEN{$change1} = 1; | |
$SEEN{$change2} = 1; | |
# xxx only print changes at internal nodes? | |
if (!($daughter->is_leaf())) # ONLY PRINT OUT APPARENT LOSSES IF $daughter IS AN INTERNAL NODE, | |
# SO THE LOSS HAS OCCURRED IN MORE THAN ONE SEQUENCE. | |
{ | |
($descendants,$no_descendants) = &get_descendants($daughter,$SPECIES_WITH_DATA); | |
if ($no_descendants > 1) | |
{ | |
print "*** INTERNAL FEATURE LOSS: node $node_name has feature $feat, that daughter $daughter_name does not have\n"; | |
print " INTERNAL FEATURE LOSS: $daughter_name has descendants $descendants\n\n"; | |
} | |
} | |
} | |
} | |
} | |
# CHECK IF $daughter HAS ANY FEATURES THAT $node DOES NOT HAVE: | |
foreach $feat (keys %DAUGHTER_FEAT) | |
{ | |
if (!($NODE_FEAT{$feat})) | |
{ | |
$change1 = $node_name."_".$daughter_name."_".$feat; | |
$change2 = $daughter_name."_".$node_name."_".$feat; | |
if (!($SEEN{$change1}) && !($SEEN{$change2})) | |
{ | |
$SEEN{$change1} = 1; | |
$SEEN{$change2} = 1; | |
# xxx only print changes at internal nodes? | |
if (!($daughter->is_leaf())) # ONLY PRINT OUT APPARENT GAINS IF THE $daughter NODE IS AN INTERNAL NODE, | |
# SO THE GAIN HAS OCCURRED IN MORE THAN ONE SEQUENCE. | |
{ | |
($descendants,$no_descendants) = &get_descendants($daughter,$SPECIES_WITH_DATA); | |
if ($no_descendants > 1) | |
{ | |
print "*** INTERNAL FEATURE GAIN: daughter $daughter_name has feature $feat, that parent $node_name does not have\n"; | |
print " INTERNAL FEATURE GAIN: $daughter_name has descendants $descendants\n"; | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
else { print STDERR "ERROR: parent $node_name is a sequence.\n"; exit;} | |
} | |
#------------------------------------------------------------------# | |
# FIND NODES IN THE TREE WHERE CONFIRMED FEATURES OF A PARENT NODE | |
# DIFFERS FROM THOSE OF ITS DAUGHTER NODES: | |
sub find_changes_in_feat_rd2 | |
{ | |
my $tree = $_[0]; #> | |
my $CONFIRMED_FEAT = $_[1]; #> | |
my $FEATURES = $_[2]; #> | |
my $myfeats = $_[3]; #> | |
my $SPECIES_WITH_DATA = $_[4]; #> | |
my $family_id = $_[5]; #> | |
my $root; #> | |
my $root_name; #> | |
my @nodes; #> | |
my @children; #> | |
my $no_rounds; #> | |
my $no_new_nodes; #> | |
my %SEEN_NODE = (); #> | |
my %SEEN_CHILD = (); #> | |
my $node; #> | |
my $node_name; #> | |
my $VERBOSE_CHANGES = 0; #> | |
my $child; #> | |
my $child_name; #> | |
my $i; #> | |
my $PRINT_ROOT = 1; #> | |
my $myfeat; #> | |
my $key; #> | |
my $confirmed; #> | |
my $root_confirmed = ""; #> | |
# FIND THE ROOT OF THE TREE: | |
$root = $tree->root; | |
$root_name = $root->internalID; | |
# PRINT THE CONFIRMED FEATURES IN THE ROOT: | |
print "\nRoot for $family_id: "; | |
for ($i = 0; $i < @$myfeats; $i++) | |
{ | |
$myfeat = $myfeats->[$i]; | |
$key = $root_name."_".$myfeat; | |
if (!($CONFIRMED_FEAT->{$key})) { print STDERR "ERROR: do not know confirmed feat for $key\n"; exit;} | |
$confirmed = $CONFIRMED_FEAT->{$key}; | |
if ($confirmed eq 'yes') { $root_confirmed = $root_confirmed." $myfeat=$confirmed";} | |
} | |
if ($root_confirmed eq '') { $root_confirmed = "none"; } | |
else { $root_confirmed = substr($root_confirmed,1,length($root_confirmed)-1);} | |
print "$root_confirmed\n\n"; | |
# ARRAY @nodes STARTS BY CONTAINING JUST THE ROOT NODE: | |
@nodes = &Avril_modules::empty_array(@nodes); | |
@children = &Avril_modules::empty_array(@children); | |
@nodes = (@nodes,$root); | |
$no_rounds = 0; | |
NEXT_ROUND3_rd2: | |
$no_rounds++; | |
$no_new_nodes = 0; | |
%SEEN_NODE = (); | |
%SEEN_CHILD = (); | |
# LOOP THROUGH EACH NODE IN THE ARRAY @nodes. WHEN $no_rounds=1, | |
# THIS ARRAY CONTAINS THE ROOT. WHEN $no_rounds=2, THIS ARRAY | |
# CONTAINS ITS CHILDREN. WHEN $no_rounds=3, THIS ARRAY CONTAINS | |
# ITS GRANDCHILDREN, AND SO ON. | |
foreach $node(@nodes) | |
{ | |
if ($node->is_leaf) { $node_name = $node->id(); } | |
else { $node_name = $node->internalID(); } | |
if ($VERBOSE_CHANGES == 1) { print "$no_rounds: visited node $node_name\n";} | |
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;} | |
# CHECK WHETHER THE CONFIRMED FEATURES FOR NODE $node ARE | |
# DIFFERENT FROM THOSE OF ITS CHILDREN. | |
if (!($node->is_leaf)) | |
{ | |
&compare_parent_to_children_rd2($CONFIRMED_FEAT,$node,$node_name,$VERBOSE_CHANGES,$myfeats,$SPECIES_WITH_DATA); | |
} | |
# FIND THE CHILDREN OF NODE $node: | |
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $node: | |
foreach $child ($node->children) | |
{ | |
if ($child->is_leaf) { $child_name = $child->id(); } | |
else { $child_name = $child->internalID;} | |
if ($VERBOSE_CHANGES == 1) { print "$no_rounds: saw child $child_name\n"; } | |
# RECORD THAT WE SAW THE CHILD $child: | |
if (!($SEEN_CHILD{$child_name})) | |
{ | |
@children = (@children,$child); | |
$SEEN_CHILD{$child_name} = 1; | |
} | |
} | |
} # ELSE, THIS NODE IS A SEQUENCE, SO DOES NOT HAVE ANY CHILDREN. | |
} | |
if ($VERBOSE_CHANGES == 1) { print "$no_rounds rounds: number new = $no_new_nodes\n";} | |
# IF WE DID SEE SOME NEW NODES ON THIS TRAVERSAL: | |
if ($no_new_nodes > 0) | |
{ | |
@nodes = &Avril_modules::empty_array(@nodes); | |
for ($i = 0; $i <= $#children; $i++) { @nodes = (@nodes,$children[$i]);} | |
@children = &Avril_modules::empty_array(@children); | |
goto NEXT_ROUND3_rd2; | |
} | |
if ($VERBOSE_CHANGES == 1) { print "Finished searching for changes in features\n";} | |
} | |
#------------------------------------------------------------------# | |
# FIND NODES IN THE TREE WHERE CONFIRMED FEATURES OF A PARENT NODE | |
# DIFFERS FROM THOSE OF ITS DAUGHTER NODES: | |
sub find_changes_in_feat_rd1 | |
{ | |
my $tree = $_[0]; # | |
my $CONFIRMED_FEAT = $_[1]; # | |
my $FEATURES = $_[2]; # | |
my $SPECIES_WITH_DATA = $_[3]; # | |
my $root; # | |
my $root_name; # | |
my @nodes; # | |
my @children; # | |
my $no_rounds; # | |
my $no_new_nodes; # | |
my %SEEN_NODE = (); # | |
my %SEEN_CHILD = (); # | |
my $node; # | |
my $node_name; # | |
my $VERBOSE_CHANGES = 0; # | |
my $child; # | |
my $child_name; # | |
my $i; # | |
# FIND THE ROOT OF THE TREE: | |
$root = $tree->root; | |
$root_name = $root->internalID; | |
# ARRAY @nodes STARTS BY CONTAINING JUST THE ROOT NODE: | |
@nodes = &Avril_modules::empty_array(@nodes); | |
@children = &Avril_modules::empty_array(@children); | |
@nodes = (@nodes,$root); | |
$no_rounds = 0; | |
NEXT_ROUND3_rd1: | |
$no_rounds++; | |
$no_new_nodes = 0; | |
%SEEN_NODE = (); | |
%SEEN_CHILD = (); | |
# LOOP THROUGH EACH NODE IN THE ARRAY @nodes. WHEN $no_rounds=1, | |
# THIS ARRAY CONTAINS THE ROOT. WHEN $no_rounds=2, THIS ARRAY | |
# CONTAINS ITS CHILDREN. WHEN $no_rounds=3, THIS ARRAY CONTAINS | |
# ITS GRANDCHILDREN, AND SO ON. | |
foreach $node(@nodes) | |
{ | |
if ($node->is_leaf) { $node_name = $node->id(); } | |
else { $node_name = $node->internalID(); } | |
if ($VERBOSE_CHANGES == 1) { print "$no_rounds: visited node $node_name\n";} | |
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;} | |
# CHECK WHETHER THE CONFIRMED FEATURES FOR NODE $node ARE | |
# DIFFERENT FROM THOSE OF ITS CHILDREN. | |
if (!($node->is_leaf)) | |
{ | |
&compare_parent_to_children_rd1($CONFIRMED_FEAT,$node,$node_name,$VERBOSE_CHANGES,$FEATURES,$SPECIES_WITH_DATA); | |
} | |
# FIND THE CHILDREN OF NODE $node: | |
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $node: | |
foreach $child ($node->children) | |
{ | |
if ($child->is_leaf) { $child_name = $child->id(); } | |
else { $child_name = $child->internalID;} | |
if ($VERBOSE_CHANGES == 1) { print "$no_rounds: saw child $child_name\n"; } | |
# RECORD THAT WE SAW THE CHILD $child: | |
if (!($SEEN_CHILD{$child_name})) | |
{ | |
@children = (@children,$child); | |
$SEEN_CHILD{$child_name} = 1; | |
} | |
} | |
} # ELSE, THIS NODE IS A SEQUENCE, SO DOES NOT HAVE ANY CHILDREN. | |
} | |
if ($VERBOSE_CHANGES == 1) { print "$no_rounds rounds: number new = $no_new_nodes\n";} | |
# IF WE DID SEE SOME NEW NODES ON THIS TRAVERSAL: | |
if ($no_new_nodes > 0) | |
{ | |
@nodes = &Avril_modules::empty_array(@nodes); | |
for ($i = 0; $i <= $#children; $i++) { @nodes = (@nodes,$children[$i]);} | |
@children = &Avril_modules::empty_array(@children); | |
goto NEXT_ROUND3_rd1; | |
} | |
if ($VERBOSE_CHANGES == 1) { print "Finished searching for changes in features\n";} | |
} | |
#------------------------------------------------------------------# | |
# DO A PREORDER TRAVERSAL OF THE TREE TO ASSIGN CONFIRMED FEATURES: | |
sub do_preorder_traversal_rd2 | |
{ | |
my $tree = $_[0]; #> | |
my $PROVISIONAL_FEAT = $_[1]; #> | |
my $FEATURES = $_[2]; #> | |
my $myfeats = $_[3]; #> | |
my %CONFIRMED_FEAT = (); #> | |
my $root; #> | |
my $root_name; #> | |
my @nodes; #> | |
my @children; #> | |
my $no_rounds = 0; #> | |
my $no_new_nodes; #> | |
my %SEEN_NODE = (); #> | |
my %SEEN_CHILD = (); #> | |
my $node; #> | |
my $node_name; #> | |
my $VERBOSE_PREORDER = 0; #> | |
my $child; #> | |
my $child_name; #> | |
my $i; #> | |
# FIND THE ROOT OF THE TREE: | |
$root = $tree->root; | |
$root_name = $root->internalID; | |
# ARRAY @nodes STARTS BY CONTAINING JUST THE ROOT NODE: | |
@nodes = &Avril_modules::empty_array(@nodes); | |
@children = &Avril_modules::empty_array(@children); | |
@nodes = (@nodes,$root); | |
$no_rounds = 0; | |
NEXT_ROUND4_rd2: | |
$no_rounds++; | |
$no_new_nodes = 0; | |
%SEEN_NODE = (); | |
%SEEN_CHILD = (); | |
# LOOP THROUGH EACH NODE IN THE ARRAY @nodes. WHEN $no_rounds=1, | |
# THIS ARRAY CONTAINS THE ROOT. WHEN $no_rounds=2, THIS ARRAY | |
# CONTAINS ITS CHILDREN. WHEN $no_rounds=3, THIS ARRAY CONTAINS | |
# ITS GRANDCHILDREN, AND SO ON. | |
foreach $node(@nodes) | |
{ | |
if ($node->is_leaf) { $node_name = $node->id(); } | |
else { $node_name = $node->internalID(); } | |
if ($VERBOSE_PREORDER == 1) { print "$no_rounds: visited node $node_name\n";} | |
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;} | |
# RECORD THE FEATURES FOR NODE $node. IN THE PRE-ORDER TRAVERSAL, | |
# WE CHOOSE A CHARACTER FOR NODE $node. WE TRY TO PICK THE SAME RESIDUE | |
# FOR THE DAUGHTER NODES IF POSSIBLE. | |
&assign_confirmed_feats_rd2($PROVISIONAL_FEAT,$node,$node_name,\%CONFIRMED_FEAT,$VERBOSE_PREORDER,$FEATURES, | |
$myfeats,$root_name); | |
# FIND THE CHILDREN OF NODE $node: | |
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $node: | |
foreach $child ($node->children) | |
{ | |
if ($child->is_leaf) { $child_name = $child->id(); } | |
else { $child_name = $child->internalID;} | |
if ($VERBOSE_PREORDER == 1) { print "$no_rounds: saw child $child_name\n"; } | |
# RECORD THAT WE SAW THE CHILD $child: | |
if (!($SEEN_CHILD{$child_name})) | |
{ | |
@children = (@children,$child); | |
$SEEN_CHILD{$child_name} = 1; | |
} | |
} | |
} # ELSE, THIS NODE IS A SEQUENCE, SO DOES NOT HAVE ANY CHILDREN. | |
} | |
if ($VERBOSE_PREORDER == 1) { print "$no_rounds rounds: number new = $no_new_nodes\n";} | |
# IF WE DID SEE SOME NEW NODES ON THIS TRAVERSAL: | |
if ($no_new_nodes > 0) | |
{ | |
@nodes = &Avril_modules::empty_array(@nodes); | |
for ($i = 0; $i <= $#children; $i++) { @nodes = (@nodes,$children[$i]);} | |
@children = &Avril_modules::empty_array(@children); | |
goto NEXT_ROUND4_rd2; | |
} | |
if ($VERBOSE_PREORDER == 1) { print "Finished pre-order traversal\n";} | |
return(\%CONFIRMED_FEAT); | |
} | |
#------------------------------------------------------------------# | |
# DO A PREORDER TRAVERSAL OF THE TREE TO ASSIGN CONFIRMED FEATURES: | |
sub do_preorder_traversal_rd1 | |
{ | |
my $tree = $_[0]; # | |
my $PROVISIONAL_FEAT = $_[1]; # | |
my $FEATURES = $_[2]; # | |
my %CONFIRMED_FEAT = (); # | |
my $root; # | |
my $root_name; # | |
my @nodes; # | |
my @children; # | |
my $no_rounds = 0; # | |
my $no_new_nodes; # | |
my %SEEN_NODE = (); # | |
my %SEEN_CHILD = (); # | |
my $node; # | |
my $node_name; # | |
my $VERBOSE_PREORDER = 0; # | |
my $child; # | |
my $child_name; # | |
my $i; # | |
my $node_type; # | |
# FIND THE ROOT OF THE TREE: | |
$root = $tree->root; | |
$root_name = $root->internalID; | |
# ARRAY @nodes STARTS BY CONTAINING JUST THE ROOT NODE: | |
@nodes = &Avril_modules::empty_array(@nodes); | |
@children = &Avril_modules::empty_array(@children); | |
@nodes = (@nodes,$root); | |
$no_rounds = 0; | |
NEXT_ROUND2: | |
$no_rounds++; | |
$no_new_nodes = 0; | |
%SEEN_NODE = (); | |
%SEEN_CHILD = (); | |
# LOOP THROUGH EACH NODE IN THE ARRAY @nodes. WHEN $no_rounds=1, | |
# THIS ARRAY CONTAINS THE ROOT. WHEN $no_rounds=2, THIS ARRAY | |
# CONTAINS ITS CHILDREN. WHEN $no_rounds=3, THIS ARRAY CONTAINS | |
# ITS GRANDCHILDREN, AND SO ON. | |
foreach $node(@nodes) | |
{ | |
if ($node->is_leaf) { $node_name = $node->id(); } | |
else { $node_name = $node->internalID(); } | |
if ($VERBOSE_PREORDER == 1) { print "$no_rounds: visited node $node_name\n";} | |
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;} | |
# RECORD THE FEATURES FOR NODE $node. IN THE PRE-ORDER TRAVERSAL, | |
# WE ADD TO THE CONFIRMED TERM LIST OF A NODE THOSE PROVISIONAL | |
# TERMS PRESENT IN BOTH CHILDREN OF THE CURRENT NODE: | |
&assign_confirmed_feats_rd1($PROVISIONAL_FEAT,$node,$node_name,\%CONFIRMED_FEAT,$VERBOSE_PREORDER,$FEATURES); | |
# IF THIS NODE IS AN INTERNAL NODE: | |
if (!($node->is_leaf)) | |
{ | |
# CHECK IF $node IS A SPECIATION NODE OR A DUPLICATION NODE: | |
# $node_type HAS VALUE Y FOR DUPLICATION, N FOR SPECIATION. | |
$node_type = $node->get_tag_value('D'); | |
# IF $node IS A SPECIATION NODE, ADD ALL ITS CONFIRMED TERMS TO THE | |
# CONFIRMED LISTS OF BOTH ITS CHILDREN: | |
if ($node_type eq 'N') { &add_confirmed_terms_to_children($node,$node_name,\%CONFIRMED_FEAT,$VERBOSE_PREORDER);} | |
# IF $node IS A DUPLICATION NODE, ONLY CONFIRM IN THE CHILDREN THOSE | |
# CONFIRMED TERMS IN THE CURRENT NODE THAT ARE PROVISIONAL TERMS IN | |
# THE CHILD NODE. | |
elsif ($node_type eq 'Y') { &add_confirmed_terms_to_children2($node,$node_name,\%CONFIRMED_FEAT,$VERBOSE_PREORDER, | |
$PROVISIONAL_FEAT,$FEATURES);} | |
else { print STDERR "ERROR: node_type $node_type.\n"; exit;} | |
} | |
# FIND THE CHILDREN OF NODE $node: | |
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $node: | |
foreach $child ($node->children) | |
{ | |
if ($child->is_leaf) { $child_name = $child->id(); } | |
else { $child_name = $child->internalID;} | |
if ($VERBOSE_PREORDER == 1) { print "$no_rounds: saw child $child_name\n"; } | |
# RECORD THAT WE SAW THE CHILD $child: | |
if (!($SEEN_CHILD{$child_name})) | |
{ | |
@children = (@children,$child); | |
$SEEN_CHILD{$child_name} = 1; | |
} | |
} | |
} # ELSE, THIS NODE IS A SEQUENCE, SO DOES NOT HAVE ANY CHILDREN. | |
} | |
if ($VERBOSE_PREORDER == 1) { print "$no_rounds rounds: number new = $no_new_nodes\n";} | |
# IF WE DID SEE SOME NEW NODES ON THIS TRAVERSAL: | |
if ($no_new_nodes > 0) | |
{ | |
@nodes = &Avril_modules::empty_array(@nodes); | |
for ($i = 0; $i <= $#children; $i++) { @nodes = (@nodes,$children[$i]);} | |
@children = &Avril_modules::empty_array(@children); | |
goto NEXT_ROUND2; | |
} | |
if ($VERBOSE_PREORDER == 1) { print "Finished pre-order traversal\n";} | |
return(\%CONFIRMED_FEAT); | |
} | |
#------------------------------------------------------------------# | |
# INFER THE FEATURES OF ANCESTRAL NODES IN THESE FAMILIES: | |
sub infer_ancestral_nodes | |
{ | |
my $FEATURES = $_[0]; #> | |
my $families = $_[1]; #> | |
my $myfeats = $_[2]; #> | |
my $SPECIES_WITH_DATA = $_[3]; #> | |
my $i; #> | |
my $family_id; #> | |
my $tree; #> | |
my $dbc; #> | |
my $family; #> | |
my $famh; #> | |
my $PROVISIONAL_FEAT; #> | |
my $CONFIRMED_FEAT; #> | |
# CONNECT TO THE TREEFAM DATABASE USING DEFAULT CONFIGURATION: | |
$dbc = Treefam::DBConnection->new(); | |
for ($i = 0; $i < @$families; $i++) | |
{ | |
$family_id = $families->[$i]; # eg. TF314159 | |
if ($VERBOSE3 == 1) { print "Looking at family $family_id\n\n";} | |
# GET A FAMILY: | |
$famh = $dbc->get_FamilyHandle(); | |
$family = $famh->get_by_id($family_id); | |
if (!(defined($family))) | |
{ | |
print "Do not have $family_id in the database...\n"; | |
goto NEXT_FAMILY1; | |
} | |
# GET THE CLEAN TREE FOR THIS FAMILY: | |
$tree = $family->get_tree('clean'); | |
if ($alg eq 'rd1') # RICHARD'S PARSIMONY ALGORITHM FOR GO ANNOTATIONS. | |
{ | |
# DO A POSTORDER TRAVERSAL OF THE TREE TO ASSIGN PROVISIONAL FEATURES: | |
$PROVISIONAL_FEAT = &do_postorder_traversal_rd1($tree,$FEATURES); | |
# DO A PREORDER TRAVERSAL OF THE TREE TO ASSIGN CONFIRMED FEATURES: | |
$CONFIRMED_FEAT = &do_preorder_traversal_rd1($tree,$PROVISIONAL_FEAT,$FEATURES); | |
} | |
elsif ($alg eq 'rd2') # THE TRADIATIONAL PARSIMONY ALGORITHM FROM RICHARD'S BOOK. | |
{ | |
# DO A POSTORDER TRAVERSAL OF THE TREE TO ASSIGN PROVISIONAL FEATURES: | |
$PROVISIONAL_FEAT = &do_postorder_traversal_rd2($tree,$FEATURES,$myfeats); | |
# DO A PREORDER TRAVERSAL OF THE TREE TO ASSIGN CONFIRMED FEATURES: | |
$CONFIRMED_FEAT = &do_preorder_traversal_rd2($tree,$PROVISIONAL_FEAT,$FEATURES,$myfeats); | |
} | |
elsif ($alg eq 'rd3') # THE WEIGHTED PARSIMONY ALGORITHM FROM RICHARD'S BOOK. | |
{ | |
# DO A POSTORDER TRAVERSAL OF THE TREE TO ASSIGN PROVISIONAL FEATURES: | |
$PROVISIONAL_FEAT = &do_postorder_traversal_rd3($tree,$FEATURES,$myfeats); | |
print "xxx stuck\n"; | |
exit; #xxx | |
} | |
else | |
{ | |
print STDERR "ERROR: alg $alg.\n"; | |
exit; | |
} | |
# FIND NODES IN THE TREE WHERE CONFIRMED FEATURES OF A PARENT NODE | |
# DIFFERS FROM THOSE OF ITS DAUGHTER NODES: | |
if ($alg eq 'rd1') | |
{ | |
&find_changes_in_feat_rd1($tree,$CONFIRMED_FEAT,$FEATURES,$SPECIES_WITH_DATA); | |
} | |
elsif ($alg eq 'rd2') | |
{ | |
&find_changes_in_feat_rd2($tree,$CONFIRMED_FEAT,$FEATURES,$myfeats,$SPECIES_WITH_DATA,$family_id); | |
} | |
elsif ($alg eq 'rd3') | |
{ | |
print "xxx stuck\n"; | |
exit; #xxx | |
} | |
else { print STDERR "ERROR: alg $alg.\n"; exit;} | |
NEXT_FAMILY1: | |
} | |
} | |
#------------------------------------------------------------------# | |
# GET THE PFAM DOMAIN ANNOTATIONS FOR GENES FROM THE TREEFAM MYSQL DATABASE: | |
sub get_pfam | |
{ | |
my $families = $_[0]; #> | |
my $dbc; #> | |
my $family_id; #> | |
my $i; #> | |
my $famh; #> | |
my $family; #> | |
my $tree; #> | |
my @nodes; #> | |
my $node; #> | |
my $transcript; #> | |
my %TRANSCRIPTS = (); #> | |
my $gh; #> | |
my $gene; #> | |
my %GENE = (); #> | |
my $dbh; #> | |
my $table_w; #> | |
my $st; #> | |
my $sth; #> | |
my $rv; #> | |
my @array; #> | |
my $transcript2; #> | |
my $transcripts_in_tree; #> | |
my @transcripts_in_tree; #> | |
my $transcript_in_tree; #> | |
my $j; #> | |
my $transcripts; #> | |
my @transcripts; #> | |
my $k; #> | |
my $domain; #> | |
my %PFAM = (); #> | |
my %SEEN = (); #> | |
my @domains; #> | |
my $the_domains; #> | |
my @the_domains; #> | |
my $the_domain; #> | |
my %SPECIES_WITH_DATA = (); #> | |
my $taxid; #> | |
# CONNECT TO THE TREEFAM DATABASE USING DEFAULT CONFIGURATION: | |
# GET A LIST OF ALL THE TRANSCRIPT IDS IN EACH FAMILY: | |
$dbc = Treefam::DBConnection->new(); | |
for ($i = 0; $i < @$families; $i++) | |
{ | |
$family_id = $families->[$i]; # eg. TF314159 | |
# GET A FAMILY: | |
$famh = $dbc->get_FamilyHandle(); | |
# GET THE OBJECT FOR THAT FAMILY: | |
$family = $famh->get_by_id($family_id); | |
if (!(defined($family))) | |
{ | |
print "Do not have $family_id in the database...\n"; | |
goto GET_PFAM_NEXT_FAMILY; | |
} | |
# GET THE CLEAN TREE FOR THIS FAMILY: | |
$tree = $family->get_tree('clean'); | |
# LIST ALL THE LEAVES FROM THE TREE: | |
@nodes = $tree->get_leaves; | |
# GET ALL THE TRANSCRIPTS FOR IN THE TREE: | |
foreach $node(@nodes) | |
{ | |
# GET THE TRANSCRIPT FOR THIS NODE $node: | |
if ($node->is_leaf) { $transcript = $node->sequence_id(); } # eg. ENSMUST00000023454.4 | |
else { print STDERR "ERROR: node is not a leaf.\n"; exit;} | |
# RECORD THAT TRANSCRIPT $transcript IS IN THIS FAMILY: | |
if (!($TRANSCRIPTS{$family_id})) { $TRANSCRIPTS{$family_id} = $transcript;} | |
else {$TRANSCRIPTS{$family_id} = $TRANSCRIPTS{$family_id}.",".$transcript;} | |
# FIND THE NAME OF THE GENE THAT TRANSCRIPT $transcript COMES FROM: | |
$gh = $node->gene; | |
$gene = $gh->ID(); # eg. ENSMUSG00000022770 | |
if ($GENE{$transcript}) { print STDERR "ERROR: already know gene for $transcript.\n"; exit;} | |
$GENE{$transcript} = $gene; | |
} | |
GET_PFAM_NEXT_FAMILY: | |
} | |
# GET THE TRANSCRIPTS FOR EACH GENE FROM THE TREEFAM MYSQL DATABASE, AND | |
# GET THE PFAM DOMAINS FOR ALL THE TRANSCRIPTS IN THE GENE. NOTE THAT WE | |
# DON'T USE THE TREEFAM PERL API FOR THIS, AS IT IS DIFFICULT TO DO WITH | |
# THE PERL API. | |
$dbh = DBI->connect("dbi:mysql:treefam_4:db.treefam.org:3308", 'anonymous', '') || return; | |
# GET ALL THE TRANSCRIPTS FOR EACH GENE: | |
foreach $transcript (keys %GENE) | |
{ | |
# GET THE NAME OF THE GENE CORRESPONDING TO THIS TRANSCRIPT: | |
$gene = $GENE{$transcript}; # eg. ENSMUSG00000022770 | |
# GET ALL THE TRANSCRIPT IDS. FOR THIS GENE FROM THE TREEFAM | |
# MYSQL DATABASE (IN ADDITION TO THE TRANSCRIPT $transcript IN THE TREE): | |
$table_w = "genes"; | |
$st = "SELECT ID, TAX_ID from $table_w WHERE GID=?"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute($gene) or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$transcript2 = $array[0]; # eg. ENSMUST00000023454.4 | |
$taxid = $array[1]; # eg. 7227 | |
if (!($TRANSCRIPTS{$gene})) { $TRANSCRIPTS{$gene} = $transcript2;} | |
else {$TRANSCRIPTS{$gene} = $TRANSCRIPTS{$gene}.",".$transcript2;} | |
# RECORD WHETHER THIS TRANSCRIPT IS FROM A SPECIES WITH PFAM DATA: | |
if ($taxid eq '7227' || # DROSOPHILA MELANOGASTER | |
$taxid eq '7165' || # ANOPHELES GAMBIAE | |
$taxid eq '7719' || # CIONA INTESTINALIS | |
$taxid eq '51511'|| # CIONA SAVIGNYI | |
$taxid eq '69293'|| # GASTEROSTEUS ACULEATUS | |
$taxid eq '31033'|| # FUGU RUBRIPES | |
$taxid eq '13616'|| # MONODELPHIS DOMESTICA | |
$taxid eq '9615' || # CANIS FAMILIARIS | |
$taxid eq '10116'|| # RATTUS NORVEGICUS | |
$taxid eq '10090'|| # MUS MUSCULUS | |
$taxid eq '9598' || # PAN TROGLODYTES | |
$taxid eq '9913' || # BOS TAURUS | |
$taxid eq '9606' || # HOMO SAPIENS | |
$taxid eq '9544' || # MACACA MULATTA | |
$taxid eq '7955' || # BRACHYDANIO RERIO | |
$taxid eq '8090' || # ORYZIAS LATIPES | |
$taxid eq '99883'|| # TETRAODON NIGROVIRIDIS | |
$taxid eq '9031' || # GALLUS GALLUS | |
$taxid eq '8364' || # XENOPUS TROPICALIS | |
$taxid eq '7159') # AEDES AEGYPTI | |
{ | |
$SPECIES_WITH_DATA{$transcript2} = 1; | |
} | |
} | |
} | |
} | |
# GET THE PFAM DOMAINS FOR THE GENES IN EACH FAMILY FROM THE TREEFAM MYSQL DATABASE: | |
for ($i = 0; $i < @$families; $i++) | |
{ | |
$family_id = $families->[$i]; # eg. TF314159 | |
# FIND THE TRANSCRIPTS FROM THIS FAMILY IN THE TREEFAM TREE: | |
if (!($TRANSCRIPTS{$family_id})) { print STDERR "ERROR: do not know genes in $family_id.\n"; exit;} | |
$transcripts_in_tree = $TRANSCRIPTS{$family_id}; | |
@transcripts_in_tree = split(/\,/,$transcripts_in_tree); | |
for ($j = 0; $j <= $#transcripts_in_tree; $j++) | |
{ | |
$transcript_in_tree = $transcripts_in_tree[$j]; # eg. ENSGACT00000026618.1 | |
# FIND THE GENE THAT THIS TRANSCRIPT COMES FROM: | |
if (!($GENE{$transcript_in_tree})) { print STDERR "ERROR: do not know gene for $transcript_in_tree.\n"; exit;} | |
$gene = $GENE{$transcript_in_tree}; | |
# FIND ALL THE TRANSCRIPTS FOR $gene: | |
if (!($TRANSCRIPTS{$gene})) { print STDERR "ERROR: do not know transcripts for $gene.\n"; exit;} | |
$transcripts = $TRANSCRIPTS{$gene}; | |
@transcripts = split(/\,/,$transcripts); | |
for ($k = 0; $k <= $#transcripts; $k++) | |
{ | |
$transcript = $transcripts[$k]; # eg. ENSMUST00000023454.4 | |
# GET THE PFAM DOMAINS FOR TRANSCRIPT $transcript: | |
$table_w = "pfam"; | |
$st = "SELECT PFAM_ID from $table_w WHERE ID=?"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute($transcript) or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$domain = $array[0]; | |
# WE NEED TO CONSIDER DOMAINS IN THE SAME CLAN AS EQUIVALENT: | |
if ($domain eq 'PF07653') { $domain = 'PF00018';} #xxx | |
if (!($SEEN{$transcript_in_tree."=".$domain})) | |
{ | |
if (!($PFAM{$transcript_in_tree})) { $PFAM{$transcript_in_tree} = $domain;} | |
else {$PFAM{$transcript_in_tree} = $PFAM{$transcript_in_tree}.",".$domain;} | |
$SEEN{$transcript_in_tree."=".$domain} = 1; | |
} | |
# RECORD THIS DOMAIN IN ARRAY DOMAINS, IF WE HAVEN'T SEEN IT BEFORE: | |
if (!($SEEN{$domain})) | |
{ | |
@domains = (@domains,$domain); | |
$SEEN{$domain} = 1; | |
} | |
} | |
} | |
} | |
if ($VERBOSE_PFAM == 1) | |
{ | |
if ($PFAM{$transcript_in_tree}) | |
{ | |
print "Family $family_id has gene $transcript_in_tree Pfam $PFAM{$transcript_in_tree}\n"; | |
} | |
else | |
{ | |
print "Family $family_id has gene $transcript_in_tree no Pfam\n"; | |
} | |
} | |
} | |
} | |
# RECORD WHETHER EACH SEQUENCE HAS EACH DOMAIN IN THE ARRAY @domains. THIS | |
# IS NEEDED FOR THE rd2 ALGORITHM: | |
for ($j = 0; $j <= $#transcripts_in_tree; $j++) | |
{ | |
$transcript_in_tree = $transcripts_in_tree[$j]; | |
%SEEN = (); | |
if ($PFAM{$transcript_in_tree}) | |
{ | |
# FIND THE DOMAINS IN THIS TRANSCRIPT: | |
$the_domains = $PFAM{$transcript_in_tree}; | |
@the_domains = split(/\,/,$the_domains); | |
for ($i = 0; $i <= $#the_domains; $i++) | |
{ | |
$the_domain = $the_domains[$i]; | |
$SEEN{$the_domain}= 1; | |
} | |
} | |
for ($i = 0; $i <= $#domains; $i++) | |
{ | |
$domain = $domains[$i]; | |
if (!($SPECIES_WITH_DATA{$transcript_in_tree})) | |
{ | |
$PFAM{$transcript_in_tree."_".$domain} = "yes,no"; | |
} | |
else | |
{ | |
if ($SEEN{$domain}) | |
{ | |
$PFAM{$transcript_in_tree."_".$domain} = "yes"; | |
} | |
else | |
{ | |
$PFAM{$transcript_in_tree."_".$domain} = "no"; | |
} | |
} | |
} | |
} | |
return(\%PFAM,\@domains,\%SPECIES_WITH_DATA); | |
} | |
#------------------------------------------------------------------# | |
# GET THE GO ANNOTATIONS FOR GENES: | |
sub get_GO | |
{ | |
my $file = $_[0]; #> THE FILE CONTAINING THE GO IDs. | |
my $families = $_[1]; #> THE LIST OF FAMILIES UNDER STUDY. | |
my %TAKE = (); #> | |
my $i; #> | |
my $family; #> | |
my $line; #> | |
my @temp; #> | |
my $goid; #> | |
my %GOID = (); #> | |
my @goids; #> | |
my %SEEN = (); #> | |
my $j; #> | |
my $the_goids; #> | |
my @the_goids; #> | |
my $the_goid; #> | |
my $dbc; #> | |
my $family_id; #> | |
my $famh; #> | |
my $tree; #> | |
my @nodes; #> | |
my $node; #> | |
my $transcript; #> | |
my @transcripts; #> | |
my %SPECIES_WITH_DATA = (); #> | |
my $dbh; #> | |
my $table_w; #> | |
my $st; #> | |
my $sth; #> | |
my $rv; #> | |
my @array; #> | |
my $taxid; #> | |
# RECORD WHICH FAMILIES TO TAKE: | |
for ($i = 0; $i < @$families; $i++) | |
{ | |
$family = $families->[$i]; | |
$TAKE{$family} = 1; | |
} | |
print STDERR "Recorded which families to take...\n"; | |
# CONNECT TO THE TREEFAM DATABASE USING DEFAULT CONFIGURATION: | |
# GET A LIST OF ALL THE TRANSCRIPT IDS IN EACH FAMILY: | |
$dbc = Treefam::DBConnection->new(); | |
for ($i = 0; $i < @$families; $i++) | |
{ | |
$family_id = $families->[$i]; # eg. TF314159 | |
# GET A FAMILY: | |
$famh = $dbc->get_FamilyHandle(); | |
# GET THE OBJECT FOR THAT FAMILY: | |
$family = $famh->get_by_id($family_id); | |
if (!(defined($family))) | |
{ | |
print "Do not have $family_id in the database...\n"; | |
goto GET_GO_NEXT_FAMILY; | |
} | |
# GET THE CLEAN TREE FOR THIS FAMILY: | |
$tree = $family->get_tree('clean'); | |
# LIST ALL THE LEAVES FROM THE TREE: | |
@nodes = $tree->get_leaves; | |
# GET ALL THE TRANSCRIPTS FOR IN THE TREE: | |
foreach $node(@nodes) | |
{ | |
# GET THE TRANSCRIPT FOR THIS NODE $node: | |
if ($node->is_leaf) { $transcript = $node->sequence_id(); } # eg. ENSMUST00000023454.4 | |
else { print STDERR "ERROR: node is not a leaf.\n"; exit;} | |
if (!($SEEN{$transcript})) | |
{ | |
$SEEN{$transcript} = 1; | |
@transcripts = (@transcripts,$transcript); | |
} | |
} | |
GET_GO_NEXT_FAMILY: | |
} | |
print STDERR "Got a list of transcript ids in each family...\n"; | |
# READ IN THE GO ANNOTATIONS FOR THE FAMILIES UNDER STUDY: | |
open(FILE,"$file") || die "ERROR: cannot open $file.\n"; | |
while(<FILE>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\s+/,$line); | |
if (substr($line,0,6) eq 'Family') # Family TF101001 : DM CYCB has GO id GO:0000910 | |
{ | |
$family = $temp[1]; # eg. TF101001 | |
$goid = $temp[8]; # eg. GO:0000910 | |
$transcript = $temp[9]; # eg. CG3510-RA.3 | |
if ($TAKE{$family}) | |
{ | |
if (!($SEEN{$transcript})) | |
{ | |
print "WARNING: have GO annotation for $transcript but haven't seen it in the tree (family $family)!\n"; | |
print STDERR "WARNING: have GO annotation for $transcript but haven't seen it in the tree (family $family)!\n"; | |
#xxx can happen if in seed tree but not clean tree -- exit; | |
} | |
# RECORD THE GO ANNOTATIONS FOR TRANSCRIPT $transcript: | |
if (!($GOID{$transcript})) { $GOID{$transcript} = $goid;} | |
else {$GOID{$transcript} = $GOID{$transcript}.",".$goid;} | |
if ($VERBOSE_GOID == 1) | |
{ | |
print "Family $family gene $transcript GO $GOID{$transcript}\n"; | |
} | |
if (!($SEEN{$goid})) | |
{ | |
@goids = (@goids,$goid); | |
$SEEN{$goid}= 1; | |
} | |
} | |
} | |
} | |
close(FILE); | |
print STDERR "Read in the GO annotations for the families under study...\n"; | |
# RECORD WHICH TRANSCRIPTS COME FROM SPECIES WITH GO DATA: | |
$dbh = DBI->connect("dbi:mysql:treefam_4:db.treefam.org:3308", 'anonymous', '') || return; | |
# GET ALL THE TRANSCRIPTS FOR EACH GENE: | |
for ($j = 0; $j <= $#transcripts; $j++) | |
{ | |
$transcript = $transcripts[$j]; | |
# GET THE SPECIES FOR THIS TRANSCRIPT FROM THE TREEFAM | |
# MYSQL DATABASE: | |
$table_w = "genes"; | |
$st = "SELECT TAX_ID from $table_w WHERE ID=?"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute($transcript) or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$taxid = $array[0]; # eg. 7227 | |
# RECORD WHETHER THIS TRANSCRIPT IS FROM A SPECIES WITH GO DATA: | |
if ($taxid eq '7227' || # DROSOPHILA MELANOGASTER | |
$taxid eq '9031' || # GALLUS GALLUS | |
$taxid eq '9606' || # HOMO SAPIENS | |
$taxid eq '10090'|| # MUS MUSCULUS | |
$taxid eq '10116'|| # RATTUS NORVEGICUS | |
$taxid eq '6239' || # CAENORHABDITIS ELEGANS | |
$taxid eq '7955') # BRACHYDANIO RERIO | |
{ | |
$SPECIES_WITH_DATA{$transcript} = 1; | |
} | |
} | |
} | |
} | |
print STDERR "Read in which transcripts come from families with GO data...\n"; | |
# RECORD WHETHER EACH SEQUENCE HAS EACH GO ID. IN THE ARRAY @goids. THIS | |
# IS NEEDED FOR THE rd2 ALGORITHM: | |
for ($j = 0; $j <= $#transcripts; $j++) | |
{ | |
$transcript = $transcripts[$j]; | |
%SEEN = (); | |
if ($GOID{$transcript}) | |
{ | |
# FIND THE GO IDS. FOR THIS TRANSCRIPT: | |
$the_goids = $GOID{$transcript}; | |
@the_goids = split(/\,/,$the_goids); | |
for ($i = 0; $i <= $#the_goids; $i++) | |
{ | |
$the_goid = $the_goids[$i]; | |
$SEEN{$the_goid} = 1; | |
} | |
} | |
else # THIS TRANSCRIPT HAS NO GO TERMS, SO WE TREAT IT AS MISSING DATA. xxx | |
{ | |
$SPECIES_WITH_DATA{$transcript} = ""; | |
} | |
for ($i = 0; $i <= $#goids; $i++) | |
{ | |
$goid = $goids[$i]; | |
if (!($SPECIES_WITH_DATA{$transcript})) | |
{ | |
$GOID{$transcript."_".$goid} = "yes,no"; | |
} | |
else | |
{ | |
if ($SEEN{$goid}) | |
{ | |
$GOID{$transcript."_".$goid} = "yes"; | |
} | |
else | |
{ | |
$GOID{$transcript."_".$goid} = "no"; | |
} | |
} | |
} | |
} | |
print STDERR "Recorded whether each transcript has each of the GO terms in that family...\n"; | |
return(\%GOID,\@goids,\%SPECIES_WITH_DATA); | |
} | |
#------------------------------------------------------------------# | |
# DO A POSTORDER TRAVERSAL OF THE TREE: | |
sub do_postorder_traversal_rd3 | |
{ | |
my $tree = $_[0]; #xxx THE TREE OBJECT. | |
my $FEATURES = $_[1]; # THE FEATURES. | |
my $myfeats = $_[2]; # ARRAY OF ALL THE FEATURES IN THE WHOLE FAMILY. | |
my $root; # | |
my $root_name; # | |
my @nodes; # | |
my @parents; # | |
my $no_rounds; # | |
my $no_new_nodes; # | |
my %SEEN_NODE = (); # | |
my %SEEN_PARENT = (); # | |
my $node; # | |
my $node_name; # | |
my $VERBOSE_POSTORDER = 1; # xxx | |
my $parent; # | |
my $parent_name; # | |
my $daughter; # | |
my $daughter_name; # | |
my $i; # | |
my %PROVISIONAL_FEAT = (); # | |
my $total_cost = 0; # | |
my $cost = 0; # | |
my $success = 0; # | |
my %PROVISIONAL_SCORE = (); # | |
# FIND THE ROOT OF THE TREE: | |
$root = $tree->root; | |
$root_name = $root->internalID; | |
# LIST ALL THE LEAVES FROM THE TREE: | |
@nodes = &Avril_modules::empty_array(@nodes); | |
@parents = &Avril_modules::empty_array(@parents); | |
@nodes = $tree->get_leaves; | |
$no_rounds = 0; | |
NEXT_ROUND1_rd3: | |
$no_rounds++; | |
$no_new_nodes = 0; | |
%SEEN_NODE = (); | |
%SEEN_PARENT = (); | |
# LOOP THROUGH EACH NODE IN THE ARRAY @nodes. WHEN $no_rounds=1, | |
# THIS ARRAY CONTAINS THE LEAVES. WHEN $no_rounds=2, THIS ARRAY | |
# CONTAINS THEIR PARENTS. WHEN $no_rounds=3, THIS ARRAY CONTAINS | |
# THEIR GRANDPARENTS, AND SO ON. | |
foreach $node(@nodes) | |
{ | |
if ($node->is_leaf) { $node_name = $node->id(); } | |
else { $node_name = $node->internalID(); } | |
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited node $node_name\n";} | |
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;} | |
# FIND THE PARENT OF NODE $node: | |
if (defined($node->parent)) { $parent = $node->parent; } | |
else { print STDERR "ERROR: do not know parent of node $node_name.\n"; exit;} | |
$parent_name = $parent->internalID; | |
# FIND THE OTHER CHILD OF NODE $parent: | |
if (!($parent->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $parent: | |
foreach $daughter ($parent->children) | |
{ | |
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); } | |
else { $daughter_name = $daughter->internalID;} | |
if ($daughter_name ne $node_name) | |
{ | |
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited sibling $daughter_name\n";} | |
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;} | |
} | |
} | |
} | |
else { print STDERR "ERROR: parent $parent_name is a sequence.\n"; exit;} | |
# RECORD THAT WE VISITED THE PARENT $parent: | |
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: saw parent $parent_name\n";} | |
if (!($SEEN_PARENT{$parent_name}) && $parent_name ne $root_name) | |
{ | |
# IN THE POST-ORDER TRAVERSAL, FOR EACH FEATURE {A} WE RECORD THE COST OF HAVING OR | |
# LACKING FEATURE {A} FOR PARENT $parent. FEATURE {A} COULD BE A DOMAIN OR A GO TERM. | |
($success) = &assign_provisional_feats_rd3($FEATURES,$parent,$parent_name,\%PROVISIONAL_FEAT, | |
$VERBOSE_POSTORDER,$myfeats,\%PROVISIONAL_SCORE); | |
$cost = 0; #xxx | |
if ($success == 1) | |
{ | |
@parents = (@parents,$parent); | |
$SEEN_PARENT{$parent_name} = 1; | |
$total_cost = $total_cost + $cost; | |
} | |
else | |
{ | |
@parents = (@parents,$node); | |
$SEEN_PARENT{$node_name} = 1; | |
} | |
} | |
} | |
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds rounds: number new = $no_new_nodes\n";} | |
# IF WE DID SEE SOME NEW NODES ON THIS TRAVERSAL: | |
if ($no_new_nodes > 0) | |
{ | |
@nodes = &Avril_modules::empty_array(@nodes); | |
for ($i = 0; $i <= $#parents; $i++) { @nodes = (@nodes,$parents[$i]);} | |
@parents = &Avril_modules::empty_array(@parents); | |
goto NEXT_ROUND1_rd3; | |
} | |
# VISIT ROOT: | |
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited root $root_name\n";} | |
($success) = &assign_provisional_feats_rd3($FEATURES,$root,$root_name,\%PROVISIONAL_FEAT, | |
$VERBOSE_POSTORDER,$myfeats,\%PROVISIONAL_SCORE); | |
$cost = 0; | |
if ($success == 0) { print STDERR "ERROR: success $success at root node.\n"; exit;} | |
$total_cost = $total_cost + $cost; | |
if ($VERBOSE_POSTORDER == 1) { print "Finished post-order traversal, total cost $total_cost\n";} | |
return(\%PROVISIONAL_FEAT); | |
} | |
#------------------------------------------------------------------# | |
# IN THE POST-ORDER TRAVERSAL, FOR EACH FEATURE {A} WE RECORD THE COST OF HAVING OR | |
# LACKING FEATURE {A} FOR PARENT $parent. FEATURE {A} COULD BE A DOMAIN OR A GO TERM. | |
# xxx | |
sub assign_provisional_feats_rd3 | |
{ | |
my $FEATURES = $_[0]; #xxx THE FEATURES. | |
my $parent = $_[1]; # | |
my $parent_name = $_[2]; # | |
my $PROVISIONAL_FEAT = $_[3]; # | |
my $VERBOSE_POSTORDER = $_[4]; # | |
my $myfeats = $_[5]; # ARRAY OF ALL THE FEATURES IN THE TREE. | |
my $PROVISIONAL_SCORE = $_[6]; # | |
my $daughter; # | |
my $daughter_name; # | |
my $daughter_seqname; # | |
my $daughter_feats; # | |
my @daughter_feats; # | |
my $cost = 0; # | |
my $success = 1; # | |
my @daughter_nodes; # | |
if ($VERBOSE_POSTORDER == 1) { print "___ finding provisional feats for parent $parent_name\n";} | |
# FIND THE CHILDREN OF NODE $parent: | |
if (!($parent->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $parent: | |
foreach $daughter ($parent->children) | |
{ | |
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); } | |
else { $daughter_name = $daughter->internalID;} | |
if ($VERBOSE_POSTORDER == 1) { print "___ looking at daughter $daughter_name of parent $parent_name\n";} | |
# FIND THE SCORES OF NODE $daughter: | |
if ($daughter->is_leaf) | |
{ | |
$daughter_seqname = $daughter->get_tag_value('O'); | |
($success) = &get_daughter_score1($daughter_seqname,$FEATURES,$myfeats,$PROVISIONAL_SCORE); | |
if ($success == 0) { print STDERR "ERROR: could not get feats for $daughter_seqname\n"; exit;} | |
# xxx @daughter_feats = (@daughter_feats,$daughter_feats); | |
@daughter_nodes = (@daughter_nodes,$daughter_seqname); | |
if ($VERBOSE_POSTORDER == 1) { print "___ daughter $daughter_name ($daughter_seqname) has feats xxx\n";} | |
} | |
else # THE DAUGHTER $daughter IS AN INTERNAL NODE: | |
{ | |
print "xxx aah\n"; | |
exit; #xxx | |
($success,$daughter_feats) = &get_daughter_feats($daughter_name,$PROVISIONAL_FEAT,$myfeats); | |
if ($success == 0) | |
{ | |
if ($VERBOSE_POSTORDER == 1) { print "Do not know provisional feats for $daughter_name yet ...\n";} | |
return($success); | |
} | |
@daughter_feats = (@daughter_feats,$daughter_feats); | |
@daughter_nodes = (@daughter_nodes,$daughter_name); | |
if ($VERBOSE_POSTORDER == 1) { print "___ daughter $daughter_name (internal) has feats $daughter_feats\n";} | |
} | |
} | |
} | |
else { print STDERR "ERROR: parent $parent_name is a sequence.\n"; exit;} | |
# GET THE PROVISIONAL SCORES FOR $parent_name: | |
&get_parent_score(\@daughter_nodes,$parent_name,$myfeats,$PROVISIONAL_FEAT,$VERBOSE_POSTORDER,$PROVISIONAL_SCORE); | |
return($success); | |
} | |
#------------------------------------------------------------------# | |
# GET THE PROVISIONAL SCORES FOR A NODE: | |
# xxx | |
sub get_daughter_score1 | |
{ | |
my $daughter_seqname = $_[0]; # | |
my $PROVISIONAL_FEAT = $_[1]; # | |
my $myfeats = $_[2]; # | |
my $PROVISIONAL_SCORE = $_[3]; # | |
my $daughter_feats = ""; # | |
my $i; # | |
my $key; # | |
my $feats; # | |
my $success = 1; # | |
my $key2; # | |
for ($i = 0; $i < @$myfeats; $i++) | |
{ | |
$key = $daughter_seqname."_".$myfeats->[$i]; | |
print "xxx looking at $daughter_seqname key $key\n"; | |
if (!($PROVISIONAL_FEAT->{$key})) | |
{ | |
print "xxx returning \n"; | |
$success = 0; | |
# WE DO NOT HAVE THE PROVISIONAL FEATURES FOR THIS NODE YET: | |
return($success); | |
} | |
$feats = $PROVISIONAL_FEAT->{$key}; # yes OR no OR yes,no | |
print "xxx key $key feats $feats\n"; | |
if ($feats ne 'yes' && $feats ne 'no' && $feats ne 'yes,no') { print "ERROR: feats $feats.\n"; exit;} | |
if ($feats eq 'yes') | |
{ | |
$key2 = $key."_yes"; | |
$PROVISIONAL_SCORE->{$key2} = 0; | |
print "xxx key2 $key2 feats $feats $PROVISIONAL_SCORE->{$key2}\n"; | |
$key2 = $key."_no"; | |
$PROVISIONAL_SCORE->{$key2} = 1000000; | |
print "xxx key2 $key2 feats $feats $PROVISIONAL_SCORE->{$key2}\n"; | |
} | |
elsif ($feats eq 'no') | |
{ | |
$key2 = $key."_yes"; | |
$PROVISIONAL_SCORE->{$key2} = 1000000; | |
print "xxx key2 $key2 feats $feats $PROVISIONAL_SCORE->{$key2}\n"; | |
$key2 = $key."_no"; | |
$PROVISIONAL_SCORE->{$key2} = 0; | |
print "xxx key2 $key2 feats $feats $PROVISIONAL_SCORE->{$key2}\n"; | |
} | |
elsif ($feats eq 'yes,no') | |
{ | |
$key2 = $key."_yes"; | |
$PROVISIONAL_SCORE->{$key2} = 0; | |
$key2 = $key."_no"; | |
$PROVISIONAL_SCORE->{$key2} = 1000000; | |
print "xxx key2 $key2\n"; | |
} | |
#xxx $daughter_feats = $daughter_feats."_".$feats; | |
} | |
#xxx $daughter_feats = substr($daughter_feats,1,length($daughter_feats)-1); | |
return($success); | |
} | |
#------------------------------------------------------------------# | |
# GET THE PROVISIONAL SCORES FOR $parent_name: | |
# xxx | |
sub get_parent_score | |
{ | |
my $daughter_nodes = $_[0]; #xxx | |
my $parent_name = $_[1]; | |
my $myfeats = $_[2]; | |
my $PROVISIONAL_FEAT = $_[3]; | |
my $VERBOSE_POSTORDER = $_[4]; | |
my $PROVISIONAL_SCORE = $_[5]; | |
my $i; | |
my $feat; | |
my $j; | |
my $state; | |
my $k; | |
my $daughter_name; | |
my $daughter_score_yes; | |
my $daughter_score_no; | |
my $score; | |
my $l; | |
my $parent_state; | |
my $daughter_state; | |
my $daughter_score; | |
# RECORD THE COST OF NODE $parent_name HAVING EACH STATE (A1,A2,etc.) FOR | |
# EACH FEATURE A: | |
print "xxx parent_name $parent_name\n"; | |
for ($i = 0; $i < @$myfeats; $i++) | |
{ | |
$feat = $myfeats->[$i]; | |
print "xxx1 feat $feat i $i\n"; | |
# FIND WHAT IS THE SCORE FOR FEATURE $feat HAVING EACH STATE: | |
for ($j = 1; $j <= 2; $j++) | |
{ | |
if ($j == 1) { $parent_state = "yes";} | |
elsif ($j == 2) { $parent_state = "no"; } | |
print "xxx2 parent_state $parent_state\n"; | |
#xxx $parent_score = 0; #xxx1 | |
#xxx $score = 0; | |
$score = 0; | |
for ($k = 0; $k < @$daughter_nodes; $k++) | |
{ | |
$daughter_name = $daughter_nodes->[$k]; | |
# FIND WHAT IS THE MINIMUM SCORE FOR DAUGHTER $daughter: | |
$min_daughter_score = 1000000; | |
for ($l = 1; $l <= 2; $l++) | |
{ | |
if ($l == 1) { $daughter_state = "yes";} | |
elsif ($l == 2) { $daughter_state = "no"; } | |
$key = $daughter_name."_".$feat."_".$daughter_state; | |
if (!($PROVISIONAL_SCORE->{$key})) { $daughter_score = 0;} | |
else { $daughter_score = $PROVISIONAL_SCORE->{$key}; } | |
print "xxx3 $daughter_name daughter_score $daughter_score daughter_state $daughter_state key $key\n"; | |
# $score = $score + $daughter_score; | |
# CHECK IF THERE IS A SCORE FOR ANY CHANGE FROM $parent_state TO $daughter_state: | |
if ($parent_state ne $daughter_state) | |
{ | |
if ($parent_state eq 'yes' && $daughter_state eq 'no') # LOSS. | |
{ | |
$daughter_score = $daughter_score + 1; # xxx | |
} | |
elsif ($parent_state eq 'no' && $daughter_state eq 'yes') # GAIN. | |
{ | |
$daughter_score = $daughter_score + 2; # xxx | |
} | |
} | |
if ($daughter_score < $min_daughter_score) { $min_daughter_score = $daughter_score; print "xxx3 min_daughter_score now $min_daughter_score\n";} | |
#xxx if ($score < $min_score) { $min_score = $score; print "xxx3 min_score now $min_score\n";} | |
#xxx print "xxx3 score $score min_score $min_score\n"; | |
print "xxx3 daughter_score $daughter_score min_daughter_score $min_daughter_score\n"; | |
} | |
$score = $score + $min_daughter_score; | |
print "xxx2 score $score\n"; | |
} | |
$key = $parent_name."_".$feat."_".$parent_state; | |
$PROVISIONAL_SCORE->{$key} = $score; | |
print "xxx2 setting score for $key as $score\n"; | |
#xxx print "xxx2 min_score $min_score\n"; | |
} | |
} | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment