Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 16:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5066023 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5066023 to your computer and use it in GitHub Desktop.
Perl script that, given a file with GO annotations for sequences in TreeFam families, and a list of families, infers the GO annotations for ancestral nodes in the trees for those families.
#!/usr/local/bin/perl
#
# Perl script treefam_infer_ancestral_GOids3.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 3-May-07.
#
# For the TreeFam project.
#
# This perl script infers the GO ids of ancestral nodes
# in a list of TreeFam trees, given the GO ids of the
# leaves. This is different from treefam_infer_ancestral_GOids.pl,
# because it uses the algorithm Richard designed in China.
# Note that this reads in the GO-Slim terms for the leaves of
# the tree.
#
# The command-line format is:
# % perl <treefam_infer_ancestral_GOids3.pl> GO family
# where GO has the GO ids of the leaves,
# families is the list of TreeFam families.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 2)
{
print "Usage of treefam_infer_ancestral_GOids3.pl\n\n";
print "perl treefam_infer_ancestral_GOids3.pl <GO> <families>\n";
print "where <GO> contains the GO ids of the leaves,\n";
print " <families> is the list of TreeFam families.\n";
print "For example, >perl treefam_infer_ancestral_GOids3.pl GO_slim_terms11_2may07\n";
print "neural_genes_2may07\n";
exit;
}
# READ IN MY PERL MODULES:
BEGIN {
unshift (@INC, '/nfs/team71/phd/alc/perl/modules');
}
use lib "/nfs/team71/phd/alc/perl/treefam/Treefam_api_3may07/"; #xxx note that this is old, I've hacked it
#xxx to make it use TreeFam-4.
use Avril_modules;
use Treefam::DBConnection;
# FIND THE NAME OF THE INPUT FILE OF GO TERMS ASSIGNED TO LEAVES:
$GO = $ARGV[0];
# FIND THE NAME OF THE LIST OF TREEFAM FAMILIES:
$fams = $ARGV[1];
$VERBOSE = 1;
$VERBOSE2 = 1;
$VERBOSE3 = 1;
#------------------------------------------------------------------#
# READ IN THE LIST OF FAMILIES:
$TAKE = &read_family_list($fams);
# READ IN THE INPUT FILE OF GO TERMS:
($GOID,$families) = &read_go($GO,$TAKE);
print STDERR "Read in file $GO...\n";
# NOW INFER THE GO IDS OF ANCESTRAL NODES IN THESE FAMILIES:
&infer_ancestral_nodes($GOID,$families);
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# READ IN THE LIST OF FAMILIES:
sub read_family_list
{
my $fams = $_[0];
my %TAKE = ();
my $line;
my @temp;
my $family;
open(FAMS,"$fams") || die "ERROR: cannot open $fams.\n";
while(<FAMS>)
{
$line = $_;
chomp $line;
@temp = split(/\s+/,$line);
if (substr($line,0,2) eq 'TF')
{
$family = $temp[0];
if ($family eq 'TF314159') { $TAKE{$family} = 1;} #xxx an interesting case.
#xxx $TAKE{$family} = 1;
}
}
close(FAMS);
return(\%TAKE);
}
#------------------------------------------------------------------#
# IF $node IS A DUPLICATION NODE, ONLY CONFIRM IN THE CHILDREN THOSE
# CONFIRMED TERMS IN THE CURRENT NODE THAT ARE PROVISIONAL TERMS IN
# THE CHILD NODE.
sub add_confirmed_terms_to_children2
{
my $node = $_[0]; #
my $node_name = $_[1]; #
my $CONFIRMED_GOID = $_[2]; #
my $VERBOSE_PREORDER = $_[3]; #
my $PROVISIONAL_GOID = $_[4]; #
my $GOID = $_[5]; #
my $confirmed_goid; #
my @confirmed_goid; #
my $daughter; #
my $daughter_name; #
my $provisional_goid; #
my @provisional_goid; #
my %CONFIRMED_IN_NODE = (); #
my $i; #
my $j; #
my $daughter_confirmed_goid; #
my %SEEN = (); #
my $daughter_seqname; #
if ($VERBOSE_PREORDER == 1) { print "___ adding confirmed goids for duplication node $node_name to its chidren's confirmed lists\n";}
# FIND THE CONFIRMED GOIDs OF $node:
if ($CONFIRMED_GOID->{$node_name})
{
$confirmed_goid = $CONFIRMED_GOID->{$node_name};
@confirmed_goid = split(/\,/,$confirmed_goid);
for ($i = 0; $i <= $#confirmed_goid; $i++) { $CONFIRMED_IN_NODE{$confirmed_goid[$i]} = 1;}
}
else # $node DOES NOT HAVE ANY CONFIRMED GOIDs.
{
return;
}
# FIND THE CHILDREN OF NODE $node:
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $node:
foreach $daughter ($node->children)
{
$daughter_confirmed_goid = "";
%SEEN = ();
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); }
else { $daughter_name = $daughter->internalID;}
# FIND THE PROVISIONAL GOIDs FOR $daughter_name:
if ($PROVISIONAL_GOID->{$daughter_name})
{
$provisional_goid = $PROVISIONAL_GOID->{$daughter_name};
@provisional_goid = split(/\,/,$provisional_goid);
for ($j = 0; $j <= $#provisional_goid; $j++)
{
if ($CONFIRMED_IN_NODE{$provisional_goid[$j]}) # IF THIS GOID IS CONFIRMED IN $node.
{
if (!($SEEN{$provisional_goid[$j]}))
{
$SEEN{$provisional_goid[$j]} = 1;
$daughter_confirmed_goid = $daughter_confirmed_goid.",".$provisional_goid[$j];
}
}
}
}
else # THERE IS NO PROVISIONAL GO ID $daughter_name. $daughter_name MAY BE A LEAF.
{
if ($daughter->is_leaf)
{
$daughter_seqname = $daughter->get_tag_value('O');
if ($GOID->{$daughter_seqname})
{
$provisional_goid = $PROVISIONAL_GOID->{$daughter_name};
@provisional_goid = split(/\,/,$provisional_goid);
for ($j = 0; $j <= $#provisional_goid; $j++)
{
if ($CONFIRMED_IN_NODE{$provisional_goid[$j]}) # IF THIS GOID IS CONFIRMED IN $node.
{
if (!($SEEN{$provisional_goid[$j]}))
{
$SEEN{$provisional_goid[$j]} = 1;
$daughter_confirmed_goid = $daughter_confirmed_goid.",".$provisional_goid[$j];
}
}
}
}
}
}
if ($daughter_confirmed_goid ne "")
{
$daughter_confirmed_goid = substr($daughter_confirmed_goid,1,length($daughter_confirmed_goid)-1);
if ($VERBOSE_PREORDER == 1) { print "___ adding $daughter_confirmed_goid to daughter $daughter_name of $node_name\n";}
if (!($CONFIRMED_GOID->{$daughter_name})) { $CONFIRMED_GOID->{$daughter_name} = $daughter_confirmed_goid;}
else { print STDERR "ERROR: daughter_name $daughter_name should not have a confirmed goid already.\n"; exit;}
}
}
}
}
#------------------------------------------------------------------#
# IF $node IS A SPECIATION NODE, ADD ALL ITS CONFIRMED TERMS TO THE
# CONFIRMED LISTS OF BOTH ITS CHILDREN:
sub add_confirmed_terms_to_children
{
my $node = $_[0];
my $node_name = $_[1];
my $CONFIRMED_GOID = $_[2];
my $VERBOSE_PREORDER = $_[3];
my $daughter;
my $daughter_name;
my $confirmed_goid;
if ($VERBOSE_PREORDER == 1) { print "___ adding confirmed goids for speciation node $node_name to its chidren's confirmed lists\n";}
# FIND THE CONFIRMED GOIDs OF $node:
if ($CONFIRMED_GOID->{$node_name})
{
$confirmed_goid = $CONFIRMED_GOID->{$node_name};
}
else # $node DOES NOT HAVE ANY CONFIRMED GOIDs.
{
return;
}
# FIND THE CHILDREN OF NODE $node:
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $node:
foreach $daughter ($node->children)
{
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); }
else { $daughter_name = $daughter->internalID;}
if ($VERBOSE_PREORDER == 1) { print "___ adding $confirmed_goid to daughter $daughter_name of $node_name\n";}
if (!($CONFIRMED_GOID->{$daughter_name})) { $CONFIRMED_GOID->{$daughter_name} = $confirmed_goid;}
else { print STDERR "ERROR: daughter_name $daughter_name should not have a confirmed goid already.\n"; exit;}
}
}
}
#------------------------------------------------------------------#
# RECORD THE GO TERMS FOR NODE $node. IN THE PRE-ORDER TRAVERSAL,
# WE ADD TO THE CONFIRMED TERM LIST OF A NODE THOSE PROVISIONAL
# TERMS PRESENT IN BOTH CHILDREN OF THE CURRENT NODE:
sub assign_confirmed_go_terms
{
my $PROVISIONAL_GOID = $_[0]; #
my $node = $_[1]; #
my $node_name = $_[2]; #
my $CONFIRMED_GOID = $_[3]; #
my $VERBOSE_PREORDER = $_[4]; #
my $GOID = $_[5]; #
my $daughter; #
my $daughter_name; #
my $daughter_seqname; #
my $daughter_goids; #
my @daughter_goids; #
my $i; #
my $no_daughters = 0; #
my %DAUGHTER_GOID = (); #
my %SEEN_GOID = (); #
my @all_goids; #
my $goid; #
my $confirmed_goid = ""; #
my $shared_goid; #
my $j; #
if ($VERBOSE_PREORDER == 1) { print "___ finding confirmed goids for node $node_name\n";}
# FIND THE CHILDREN OF NODE $node:
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $node:
foreach $daughter ($node->children)
{
$no_daughters++;
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); }
else { $daughter_name = $daughter->internalID;}
if ($VERBOSE_PREORDER == 1) { print "___ looking at daughter $daughter_name of parent $node_name\n";}
# FIND THE GO TERMS OF NODE $daughter:
if ($daughter->is_leaf)
{
$daughter_seqname = $daughter->get_tag_value('O');
if ($GOID->{$daughter_seqname})
{
$daughter_goids = $GOID->{$daughter_seqname};
if ($VERBOSE_PREORDER == 1) { print "___ daughter $daughter_name ($daughter_seqname) has goids $daughter_goids\n";}
@daughter_goids = split(/\,/,$daughter_goids);
for ($i = 0; $i <= $#daughter_goids; $i++)
{
$DAUGHTER_GOID{$no_daughters."_".$daughter_goids[$i]} = 1;
if (!($SEEN_GOID{$daughter_goids[$i]}))
{
$SEEN_GOID{$daughter_goids[$i]} = 1;
@all_goids = (@all_goids,$daughter_goids[$i]);
}
}
}
}
else
{
if ($PROVISIONAL_GOID->{$daughter_name})
{
$daughter_goids = $PROVISIONAL_GOID->{$daughter_name};
if ($VERBOSE_PREORDER == 1) { print "___ daughter $daughter_name has goids $daughter_goids\n";}
@daughter_goids = split(/\,/,$daughter_goids);
for ($i = 0; $i <= $#daughter_goids; $i++)
{
$DAUGHTER_GOID{$no_daughters."_".$daughter_goids[$i]} = 1;
if (!($SEEN_GOID{$daughter_goids[$i]}))
{
$SEEN_GOID{$daughter_goids[$i]} = 1;
@all_goids = (@all_goids,$daughter_goids[$i]);
}
}
}
}
}
}
# FIND THE GO IDS THAT ARE SHARED BY ALL THE DAUGHTERS OF NODE $node:
$confirmed_goid = "";
for ($i = 0; $i <= $#all_goids; $i++)
{
$goid = $all_goids[$i];
$shared_goid = 1;
for ($j = 1; $j <= $no_daughters; $j++)
{
if (!($DAUGHTER_GOID{$j."_".$goid})) { $shared_goid = 0;}
}
if ($shared_goid == 1)
{
$confirmed_goid = $confirmed_goid.",".$goid;
}
}
if ($confirmed_goid ne "")
{
$confirmed_goid = substr($confirmed_goid,1,length($confirmed_goid)-1);
if ($VERBOSE_PREORDER == 1) { print "___ node $node_name has confirmed goid $confirmed_goid\n";}
$CONFIRMED_GOID->{$node_name} = $confirmed_goid;
}
}
#------------------------------------------------------------------#
# RECORD THE GO TERMS FOR PARENT $parent. IN THE POST-ORDER
# TRAVERSAL, WE ASSIGN TO EACH INTERNAL NODE A SET OF PROVISIONAL
# TERMS THAT IS THE UNION OF THE TERMS ASSIGNED TO ITS CHILDREN:
sub assign_provisional_go_terms
{
my $GOID = $_[0]; # THE GO TERMS.
my $parent = $_[1]; #
my $parent_name = $_[2]; #
my $PROVISIONAL_GOID = $_[3]; #
my $VERBOSE_POSTORDER = $_[4]; #
my $daughter; #
my $daughter_name; #
my $daughter_seqname; #
my $daughter_goids; #
my @daughter_goids; #
my $provisional_goid = ""; #
my $i; #
my %SEEN = (); #
if ($VERBOSE_POSTORDER == 1) { print "___ finding provisional goids for parent $parent_name\n";}
# FIND THE CHILDREN OF NODE $parent:
if (!($parent->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $parent:
foreach $daughter ($parent->children)
{
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); }
else { $daughter_name = $daughter->internalID;}
if ($VERBOSE_POSTORDER == 1) { print "___ looking at daughter $daughter_name of parent $parent_name\n";}
# FIND THE GO TERMS OF NODE $daughter:
if ($daughter->is_leaf)
{
$daughter_seqname = $daughter->get_tag_value('O');
if ($GOID->{$daughter_seqname})
{
$daughter_goids = $GOID->{$daughter_seqname};
if ($VERBOSE_POSTORDER == 1) { print "___ daughter $daughter_name ($daughter_seqname) has goids $daughter_goids\n";}
@daughter_goids = split(/\,/,$daughter_goids);
for ($i = 0; $i <= $#daughter_goids; $i++)
{
if (!($SEEN{$daughter_goids[$i]}))
{
$SEEN{$daughter_goids[$i]} = 1;
$provisional_goid = $provisional_goid.",".$daughter_goids[$i];
}
}
}
}
else
{
if ($PROVISIONAL_GOID->{$daughter_name})
{
$daughter_goids = $PROVISIONAL_GOID->{$daughter_name};
if ($VERBOSE_POSTORDER == 1) { print "___ daughter $daughter_name has goids $daughter_goids\n";}
@daughter_goids = split(/\,/,$daughter_goids);
for ($i = 0; $i <= $#daughter_goids; $i++)
{
if (!($SEEN{$daughter_goids[$i]}))
{
$SEEN{$daughter_goids[$i]} = 1;
$provisional_goid = $provisional_goid.",".$daughter_goids[$i];
}
}
}
}
}
}
else { print STDERR "ERROR: parent $parent_name is a sequence.\n"; exit;}
if ($provisional_goid ne "")
{
$provisional_goid = substr($provisional_goid,1,length($provisional_goid)-1);
if ($VERBOSE_POSTORDER == 1) { print "___ parent $parent_name has provisional goid $provisional_goid\n";}
$PROVISIONAL_GOID->{$parent_name} = $provisional_goid;
}
}
#------------------------------------------------------------------#
# DO A POSTORDER TRAVERSAL OF THE TREE:
sub do_postorder_traversal
{
my $tree = $_[0]; # THE TREE OBJECT.
my $GOID = $_[1]; # THE GO IDS.
my $root; #
my $root_name; #
my @nodes; #
my @parents; #
my $no_rounds; #
my $no_new_nodes; #
my %SEEN_NODE = (); #
my %SEEN_PARENT = (); #
my $node; #
my $node_name; #
my $VERBOSE_POSTORDER = 1; #
my $parent; #
my $parent_name; #
my $daughter; #
my $daughter_name; #
my $i; #
my %PROVISIONAL_GOID = (); #
# FIND THE ROOT OF THE TREE:
$root = $tree->root;
$root_name = $root->internalID;
# LIST ALL THE LEAVES FROM THE TREE:
@nodes = &Avril_modules::empty_array(@nodes);
@parents = &Avril_modules::empty_array(@parents);
@nodes = $tree->get_leaves;
$no_rounds = 0;
NEXT_ROUND1:
$no_rounds++;
$no_new_nodes = 0;
%SEEN_NODE = ();
%SEEN_PARENT = ();
# LOOP THROUGH EACH NODE IN THE ARRAY @nodes. WHEN $no_rounds=1,
# THIS ARRAY CONTAINS THE LEAVES. WHEN $no_rounds=2, THIS ARRAY
# CONTAINS THEIR PARENTS. WHEN $no_rounds=3, THIS ARRAY CONTAINS
# THEIR GRANDPARENTS, AND SO ON.
foreach $node(@nodes)
{
if ($node->is_leaf) { $node_name = $node->id(); }
else { $node_name = $node->internalID(); }
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited node $node_name\n";}
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;}
# FIND THE PARENT OF NODE $node:
if (defined($node->parent)) { $parent = $node->parent; }
else { print STDERR "ERROR: do not know parent of node $node_name.\n"; exit;}
$parent_name = $parent->internalID;
# FIND THE OTHER CHILD OF NODE $parent:
if (!($parent->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $parent:
foreach $daughter ($parent->children)
{
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); }
else { $daughter_name = $daughter->internalID;}
if ($daughter_name ne $node_name)
{
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited sibling $daughter_name\n";}
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;}
}
}
}
else { print STDERR "ERROR: parent $parent_name is a sequence.\n"; exit;}
# RECORD THAT WE VISITED THE PARENT $parent:
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: saw parent $parent_name\n";}
if (!($SEEN_PARENT{$parent_name}) && $parent_name ne $root_name)
{
@parents = (@parents,$parent);
$SEEN_PARENT{$parent_name} = 1;
# RECORD THE GO TERMS FOR PARENT $parent. IN THE POST-ORDER
# TRAVERSAL, WE ASSIGN TO EACH INTERNAL NODE A SET OF PROVISIONAL
# TERMS THAT IS THE UNION OF THE TERMS ASSIGNED TO ITS CHILDREN:
&assign_provisional_go_terms($GOID,$parent,$parent_name,\%PROVISIONAL_GOID,$VERBOSE_POSTORDER);
}
}
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds rounds: number new = $no_new_nodes\n";}
# IF WE DID SEE SOME NEW NODES ON THIS TRAVERSAL:
if ($no_new_nodes > 0)
{
@nodes = &Avril_modules::empty_array(@nodes);
for ($i = 0; $i <= $#parents; $i++) { @nodes = (@nodes,$parents[$i]);}
@parents = &Avril_modules::empty_array(@parents);
goto NEXT_ROUND1;
}
# VISIT ROOT:
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited root $root_name\n";}
&assign_provisional_go_terms($GOID,$root,$root_name,\%PROVISIONAL_GOID,$VERBOSE_POSTORDER);
if ($VERBOSE_POSTORDER == 1) { print "Finished post-order traversal\n";}
return(\%PROVISIONAL_GOID);
}
#------------------------------------------------------------------#
# CHECK WHETHER THE CONFIRMED GO TERMS FOR NODE $node ARE
# DIFFERENT FROM THOSE OF ITS CHILDREN.
sub compare_parent_to_children
{
my $CONFIRMED_GOID = $_[0]; #
my $node = $_[1]; #
my $node_name = $_[2]; #
my $VERBOSE_CHANGES = $_[3]; #
my $GOID = $_[4]; #
my $daughter; #
my $daughter_name; #
my $daughter_seqname; #
my $daughter_goids; #
my $node_goids; #
my @daughter_goids; #
my @node_goids; #
my %NODE_GOID = (); #
my %DAUGHTER_GOID = (); #
my $i; #
my $goid; #
# FIND THE GOIDS FOR $node:
if (!($CONFIRMED_GOID->{$node_name})) { $node_goids = "none"; }
else { $node_goids = $CONFIRMED_GOID->{$node_name};}
if ($VERBOSE_CHANGES == 1) { print "___ comparing parent $node_name ($node_goids) to its children\n";}
# FIND THE CHILDREN OF NODE $node:
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $node:
foreach $daughter ($node->children)
{
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); }
else { $daughter_name = $daughter->internalID;}
if ($VERBOSE_CHANGES == 1) { print "___ comparing parent $node_name to daughter $daughter_name\n";}
# FIND THE CONFIRMED GO TERMS OF NODE $daughter:
$daughter_goids = "";
if ($daughter->is_leaf)
{
$daughter_seqname = $daughter->get_tag_value('O');
if ($GOID->{$daughter_seqname})
{
$daughter_goids = $GOID->{$daughter_seqname};
}
else { $daughter_goids = "none";}
if ($VERBOSE_CHANGES == 1) { print "___ daughter $daughter_name ($daughter_seqname) has goids $daughter_goids\n";}
}
else
{
if ($CONFIRMED_GOID->{$daughter_name}) { $daughter_goids = $CONFIRMED_GOID->{$daughter_name};}
else { $daughter_goids = "none"; }
if ($VERBOSE_CHANGES == 1) { print "___ daughter $daughter_name has goids $daughter_goids\n";}
}
# CHECK IF $daughter_goids IS DIFFERENT FROM $node_goids:
@node_goids = split(/\,/,$node_goids);
@daughter_goids = split(/\,/,$daughter_goids);
%NODE_GOID = ();
%DAUGHTER_GOID = ();
# RECORD ALL THE GOIDS FOR $node:
for ($i = 0; $i <= $#node_goids; $i++)
{
if ($node_goids[$i] ne 'none')
{
$NODE_GOID{$node_goids[$i]} = 1;
}
}
# RECORD ALL THE GOIDS FOR $daughter:
for ($i = 0; $i <= $#daughter_goids; $i++)
{
if ($daughter_goids[$i] ne 'none')
{
$DAUGHTER_GOID{$daughter_goids[$i]} = 1;
}
}
# CHECK IF $node HAS ANY GOIDS THAT $daughter DOES NOT HAVE:
foreach $goid (keys %NODE_GOID)
{
if (!($DAUGHTER_GOID{$goid}))
{
print "*** node $node_name has GOID $goid, that daughter $daughter_name does not have\n";
}
}
# CHECK IF $daughter HAS ANY GOIDS THAT $node DOES NOT HAVE:
foreach $goid (keys %DAUGHTER_GOID)
{
if (!($NODE_GOID{$goid}))
{
print "*** daughter $daughter_name has GOID $goid, that parent $node_name does not have\n";
}
}
}
}
else { print STDERR "ERROR: parent $node_name is a sequence.\n"; exit;}
}
#------------------------------------------------------------------#
# FIND NODES IN THE TREE WHERE CONFIRMED GO TERMS OF A PARENT NODE
# DIFFERS FROM THOSE OF ITS DAUGHTER NODES:
sub find_changes_in_goid
{
my $tree = $_[0]; #
my $CONFIRMED_GOID = $_[1]; #
my $GOID = $_[2]; #
my $root; #
my $root_name; #
my @nodes; #
my @children; #
my $no_rounds; #
my $no_new_nodes; #
my %SEEN_NODE = (); #
my %SEEN_CHILD = (); #
my $node; #
my $node_name; #
my $VERBOSE_CHANGES = 1; #
my $child; #
my $child_name; #
my $i; #
# FIND THE ROOT OF THE TREE:
$root = $tree->root;
$root_name = $root->internalID;
# ARRAY @nodes STARTS BY CONTAINING JUST THE ROOT NODE:
@nodes = &Avril_modules::empty_array(@nodes);
@children = &Avril_modules::empty_array(@children);
@nodes = (@nodes,$root);
$no_rounds = 0;
NEXT_ROUND3:
$no_rounds++;
$no_new_nodes = 0;
%SEEN_NODE = ();
%SEEN_CHILD = ();
# LOOP THROUGH EACH NODE IN THE ARRAY @nodes. WHEN $no_rounds=1,
# THIS ARRAY CONTAINS THE ROOT. WHEN $no_rounds=2, THIS ARRAY
# CONTAINS ITS CHILDREN. WHEN $no_rounds=3, THIS ARRAY CONTAINS
# ITS GRANDCHILDREN, AND SO ON.
foreach $node(@nodes)
{
if ($node->is_leaf) { $node_name = $node->id(); }
else { $node_name = $node->internalID(); }
if ($VERBOSE_CHANGES == 1) { print "$no_rounds: visited node $node_name\n";}
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;}
# CHECK WHETHER THE CONFIRMED GO TERMS FOR NODE $node ARE
# DIFFERENT FROM THOSE OF ITS CHILDREN.
if (!($node->is_leaf))
{
&compare_parent_to_children($CONFIRMED_GOID,$node,$node_name,$VERBOSE_CHANGES,$GOID);
}
# FIND THE CHILDREN OF NODE $node:
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $node:
foreach $child ($node->children)
{
if ($child->is_leaf) { $child_name = $child->id(); }
else { $child_name = $child->internalID;}
if ($VERBOSE_CHANGES == 1) { print "$no_rounds: saw child $child_name\n"; }
# RECORD THAT WE SAW THE CHILD $child:
if (!($SEEN_CHILD{$child_name}))
{
@children = (@children,$child);
$SEEN_CHILD{$child_name} = 1;
}
}
} # ELSE, THIS NODE IS A SEQUENCE, SO DOES NOT HAVE ANY CHILDREN.
}
if ($VERBOSE_CHANGES == 1) { print "$no_rounds rounds: number new = $no_new_nodes\n";}
# IF WE DID SEE SOME NEW NODES ON THIS TRAVERSAL:
if ($no_new_nodes > 0)
{
@nodes = &Avril_modules::empty_array(@nodes);
for ($i = 0; $i <= $#children; $i++) { @nodes = (@nodes,$children[$i]);}
@children = &Avril_modules::empty_array(@children);
goto NEXT_ROUND3;
}
if ($VERBOSE_CHANGES == 1) { print "Finished searching for changes in GO id\n";}
}
#------------------------------------------------------------------#
# DO A PREORDER TRAVERSAL OF THE TREE TO ASSIGN CONFIRMED GO TERMS:
sub do_preorder_traversal
{
my $tree = $_[0]; #
my $PROVISIONAL_GOID = $_[1]; #
my $GOID = $_[2]; #
my %CONFIRMED_GOID = (); #
my $root; #
my $root_name; #
my @nodes; #
my @children; #
my $no_rounds = 0; #
my $no_new_nodes; #
my %SEEN_NODE = (); #
my %SEEN_CHILD = (); #
my $node; #
my $node_name; #
my $VERBOSE_PREORDER = 1; #
my $child; #
my $child_name; #
my $i; #
my $node_type; #
# FIND THE ROOT OF THE TREE:
$root = $tree->root;
$root_name = $root->internalID;
# ARRAY @nodes STARTS BY CONTAINING JUST THE ROOT NODE:
@nodes = &Avril_modules::empty_array(@nodes);
@children = &Avril_modules::empty_array(@children);
@nodes = (@nodes,$root);
$no_rounds = 0;
NEXT_ROUND2:
$no_rounds++;
$no_new_nodes = 0;
%SEEN_NODE = ();
%SEEN_CHILD = ();
# LOOP THROUGH EACH NODE IN THE ARRAY @nodes. WHEN $no_rounds=1,
# THIS ARRAY CONTAINS THE ROOT. WHEN $no_rounds=2, THIS ARRAY
# CONTAINS ITS CHILDREN. WHEN $no_rounds=3, THIS ARRAY CONTAINS
# ITS GRANDCHILDREN, AND SO ON.
foreach $node(@nodes)
{
if ($node->is_leaf) { $node_name = $node->id(); }
else { $node_name = $node->internalID(); }
if ($VERBOSE_PREORDER == 1) { print "$no_rounds: visited node $node_name\n";}
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;}
# RECORD THE GO TERMS FOR NODE $node. IN THE PRE-ORDER TRAVERSAL,
# WE ADD TO THE CONFIRMED TERM LIST OF A NODE THOSE PROVISIONAL
# TERMS PRESENT IN BOTH CHILDREN OF THE CURRENT NODE:
&assign_confirmed_go_terms($PROVISIONAL_GOID,$node,$node_name,\%CONFIRMED_GOID,$VERBOSE_PREORDER,$GOID);
# IF THIS NODE IS AN INTERNAL NODE:
if (!($node->is_leaf))
{
# CHECK IF $node IS A SPECIATION NODE OR A DUPLICATION NODE:
# $node_type HAS VALUE Y FOR DUPLICATION, N FOR SPECIATION.
$node_type = $node->get_tag_value('D');
# IF $node IS A SPECIATION NODE, ADD ALL ITS CONFIRMED TERMS TO THE
# CONFIRMED LISTS OF BOTH ITS CHILDREN:
if ($node_type eq 'N') { &add_confirmed_terms_to_children($node,$node_name,\%CONFIRMED_GOID,$VERBOSE_PREORDER);}
# IF $node IS A DUPLICATION NODE, ONLY CONFIRM IN THE CHILDREN THOSE
# CONFIRMED TERMS IN THE CURRENT NODE THAT ARE PROVISIONAL TERMS IN
# THE CHILD NODE.
elsif ($node_type eq 'Y') { &add_confirmed_terms_to_children2($node,$node_name,\%CONFIRMED_GOID,$VERBOSE_PREORDER,
$PROVISIONAL_GOID,$GOID);}
else { print STDERR "ERROR: node_type $node_type.\n"; exit;}
}
# FIND THE CHILDREN OF NODE $node:
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
# GET THE TWO DAUGHTER NODES OF $node:
foreach $child ($node->children)
{
if ($child->is_leaf) { $child_name = $child->id(); }
else { $child_name = $child->internalID;}
if ($VERBOSE_PREORDER == 1) { print "$no_rounds: saw child $child_name\n"; }
# RECORD THAT WE SAW THE CHILD $child:
if (!($SEEN_CHILD{$child_name}))
{
@children = (@children,$child);
$SEEN_CHILD{$child_name} = 1;
}
}
} # ELSE, THIS NODE IS A SEQUENCE, SO DOES NOT HAVE ANY CHILDREN.
}
if ($VERBOSE_PREORDER == 1) { print "$no_rounds rounds: number new = $no_new_nodes\n";}
# IF WE DID SEE SOME NEW NODES ON THIS TRAVERSAL:
if ($no_new_nodes > 0)
{
@nodes = &Avril_modules::empty_array(@nodes);
for ($i = 0; $i <= $#children; $i++) { @nodes = (@nodes,$children[$i]);}
@children = &Avril_modules::empty_array(@children);
goto NEXT_ROUND2;
}
if ($VERBOSE_PREORDER == 1) { print "Finished pre-order traversal\n";}
return(\%CONFIRMED_GOID);
}
#------------------------------------------------------------------#
# INFER THE GO IDS OF ANCESTRAL NODES IN THESE FAMILIES:
sub infer_ancestral_nodes
{
my $GOID = $_[0]; #
my $families = $_[1]; #>
my $i; #>
my $family_id; #>
my $tree; #>
my $dbc; #>
my $family; #>
my $famh; #>
my $leaf; #>
my $leaf2; #
my $leaf_name; #>
my $leaf2_name; #
my $leaf2_species; #
my $leaf_species; #
my $leaf2_id; #
my $leaf_id; #
my @temp; #
my $lca; #
my $lca_name; #
my $lca_type; #
my $parent; #>
my $parent_name; #>
my $parent_type; #
my $duplications1; #
my $duplications2; #
my $leaf_goids; #
my $leaf2_goids; #
my @leaf_goids; #
my @leaf2_goids; #
my $root; #
my $root_name; #
my $leaf_steps_to_root; #
my $leaf2_steps_to_root; #
my $k; #
my $m; #
my %SEEN_GOID = (); #
my @nodes; #
my $node; #
my $node_name; #
my $node_type; #
my $no_daughters; #
my $daughter1; #
my $daughter2; #
my $daughter1_name; #
my $daughter2_name; #
my $daughter; #>
my $daughter_name; #>
my @subnodes; #
my %SEEN_SUBNODE = (); #
my $subnode; #
my $subnode_name; #
my $new; #
my $p; #
my $subsubnode; #
my @parents;
my $no_rounds;
my $no_new_nodes;
my %SEEN_NODE = ();
my %SEEN_PARENT = ();
my $PROVISIONAL_GOID;
my $CONFIRMED_GOID;
# CONNECT TO THE TREEFAM DATABASE USING DEFAULT CONFIGURATION:
$dbc = Treefam::DBConnection->new();
for ($i = 0; $i < @$families; $i++)
{
$family_id = $families->[$i]; # eg. TF314159
if ($VERBOSE3 == 1) { print "Looking at family $family_id\n";}
# GET A FAMILY:
$famh = $dbc->get_FamilyHandle();
$family = $famh->get_by_id($family_id);
if (!(defined($family)))
{
print "Do not have $family_id in the database...\n";
goto NEXT_FAMILY;
}
# GET THE CLEAN TREE FOR THIS FAMILY:
$tree = $family->get_tree('clean');
# DO A POSTORDER TRAVERSAL OF THE TREE TO ASSIGN PROVISIONAL GO TERMS:
$PROVISIONAL_GOID = &do_postorder_traversal($tree,$GOID);
# DO A PREORDER TRAVERSAL OF THE TREE TO ASSIGN CONFIRMED GO TERMS:
$CONFIRMED_GOID = &do_preorder_traversal($tree,$PROVISIONAL_GOID,$GOID);
# FIND NODES IN THE TREE WHERE CONFIRMED GO TERMS OF A PARENT NODE
# DIFFERS FROM THOSE OF ITS DAUGHTER NODES:
&find_changes_in_goid($tree,$CONFIRMED_GOID,$GOID);
exit; #xxx
# LIST ALL THE LEAVES FROM THE TREE:
foreach $leaf($tree->get_leaves)
{
# COMPARE TO ALL OTHER LEAVES IN THE TREE:
foreach $leaf2($tree->get_leaves)
{
# SEE IF $leaf AND $leaf2 HAVE GO IDS ASSIGNED:
$leaf_name = $leaf->id();
$leaf2_name = $leaf2->id();
$leaf_id = $leaf->get_tag_value('O');
$leaf2_id = $leaf2->get_tag_value('O');
@temp = split(/_/,$leaf_name);
$leaf_species = $temp[1];
@temp = split(/_/,$leaf2_name);
$leaf2_species = $temp[1];
# IF THESE LEAVES BELONG TO SPECIES THAT WE ARE INTERESTED IN:
if (($leaf_species eq 'DROME' || $leaf_species eq 'MOUSE' || $leaf_species eq 'RAT' ||
$leaf_species eq 'CAEEL' || $leaf_species eq 'HUMAN' || $leaf_species eq 'BRARE' ||
$leaf_species eq 'CHICK') &&
($leaf2_species eq 'DROME' || $leaf2_species eq 'MOUSE' || $leaf2_species eq 'RAT' ||
$leaf2_species eq 'CAEEL' || $leaf2_species eq 'HUMAN' || $leaf2_species eq 'BRARE' ||
$leaf2_species eq 'CHICK'))
{
# IF BOTH LEAVES HAVE GO IDS ANNOTATED:
if ($GOID->{$leaf_id} && $GOID->{$leaf2_id} && $leaf_id ne $leaf2_id)
{
# FIND THE LAST COMMON ANCESTOR NODE OF $leaf AND $leaf2:
$lca = $tree->get_last_common_ancestor($leaf,$leaf2);
$lca_name= $lca->internalID;
$lca_type = $lca->get_tag_value('D');
if ($lca_type eq 'Y') # THE ANCESTRAL NODE IS A DUPLICATION:
{
$duplications1 = 1; $duplications2 = 1;
$leaf_steps_to_root = "-1"; $leaf2_steps_to_root = -1;
goto FOUND_ROOT2;
}
# SEE IF THERE ARE ANY DUPLICATION NODES BETWEEN $leaf AND $lca:
$duplications1 = 0;
if (defined($leaf->parent)) { $parent = $leaf->parent; }
else { print STDERR "ERROR: do not know parent of leaf $leaf_id.\n"; exit;}
# $parent_type HAS VALUE Y FOR DUPLICATION, N FOR SPECIATION.
$parent_type = $parent->get_tag_value('D');
if ($parent_type eq 'Y') { $duplications1 = 1;}
$parent_name = $parent->internalID;
if ($parent_name eq $lca_name) { goto FOUND_LCA1;}
FIND_PARENT1:
if (defined($parent->parent))
{
$parent = $parent->parent;
$parent_type = $parent->get_tag_value('D');
if ($parent_type eq 'Y') { $duplications1 = 1;}
$parent_name = $parent->internalID;
if ($parent_name eq $lca_name) { goto FOUND_LCA1;}
goto FIND_PARENT1;
}
FOUND_LCA1:
# FIND HOW MANY STEPS THERE ARE BETWEEN $leaf AND THE ROOT $root:
$leaf_steps_to_root = 0;
if (defined($leaf->parent)) { $parent = $leaf->parent; }
else { print STDERR "ERROR: do not know parent of leaf $leaf_id.\n"; exit;}
$parent_type = $parent->get_tag_value('D');
if ($parent_type eq 'N') { $leaf_steps_to_root++;}
$parent_name = $parent->internalID;
if ($parent_name eq $root_name) { goto FOUND_ROOT1;}
FIND_ROOT1:
if (defined($parent->parent))
{
$parent = $parent->parent;
$parent_type = $parent->get_tag_value('D');
if ($parent_type eq 'N') { $leaf_steps_to_root++;}
$parent_name = $parent->internalID;
if ($parent_name eq $root_name) { goto FOUND_ROOT1;}
goto FIND_ROOT1;
}
FOUND_ROOT1:
# SEE IF THERE ARE ANY DUPLICATION NODES BETWEEN $leaf2 AND $lca:
$duplications2 = 0;
if (defined($leaf2->parent)) { $parent = $leaf2->parent; }
else { print STDERR "ERROR: do not know parent of leaf2 $leaf2_id.\n"; exit;}
# $parent_type HAS VALUE Y FOR DUPLICATION, N FOR SPECIATION.
$parent_type = $parent->get_tag_value('D');
if ($parent_type eq 'Y') { $duplications2 = 1;}
$parent_name = $parent->internalID;
if ($parent_name eq $lca_name) { goto FOUND_LCA2;}
FIND_PARENT2:
if (defined($parent->parent))
{
$parent = $parent->parent;
$parent_type = $parent->get_tag_value('D');
if ($parent_type eq 'Y') { $duplications2 = 1;}
$parent_name = $parent->internalID;
if ($parent_name eq $lca_name) { goto FOUND_LCA2;}
goto FIND_PARENT2;
}
FOUND_LCA2:
# FIND HOW MANY STEPS THERE ARE BETWEEN $leaf2 AND THE ROOT $root:
$leaf2_steps_to_root = 0;
if (defined($leaf2->parent)) { $parent = $leaf2->parent; }
else { print STDERR "ERROR: do not know parent of leaf2 $leaf2_id.\n"; exit;}
$parent_type = $parent->get_tag_value('D');
if ($parent_type eq 'N') { $leaf2_steps_to_root++;}
$parent_name = $parent->internalID;
if ($parent_name eq $root_name) { goto FOUND_ROOT2;}
FIND_ROOT2:
if (defined($parent->parent))
{
$parent = $parent->parent;
$parent_type = $parent->get_tag_value('D');
if ($parent_type eq 'N') { $leaf2_steps_to_root++;}
$parent_name = $parent->internalID;
if ($parent_name eq $root_name) { goto FOUND_ROOT2;}
goto FIND_ROOT2;
}
FOUND_ROOT2:
# IF THERE ARE NO DUPLICATIONS, WE CAN INFER THAT THE NODE $lca
# SHARES THE GO IDS THAT $leaf1 AND $leaf2 HAVE IN COMMON:
if ($VERBOSE == 1)
{
print "\n$family_id $leaf_id $GOID->{$leaf_id} $leaf2_id $GOID->{$leaf2_id}\n";
print "$family_id duplications1 $duplications1 duplications2 $duplications2 lca_type $lca_type lca_name $lca_name\n";
print "$family_id leaf_steps_to_root $leaf_steps_to_root leaf2_steps_to_root $leaf2_steps_to_root\n";
}
if (($duplications1 == 0 && $duplications2 == 0) || # THERE ARE NO DUPLICATIONS BETWEEN THE GENES
($duplications1 == 1 && $duplications2 == 0 &&
$leaf_steps_to_root > $leaf2_steps_to_root) || # THERE ARE DUPLICATIONS BETWEEN
# LCA AND $leaf, BUT $leaf2 IS AN OUTGROUP
($duplications1 == 0 && $duplications2 == 1 &&
$leaf_steps_to_root < $leaf2_steps_to_root)) # THERE ARE DUPLICATIONS BETWEEN
# LCA AND $leaf2, BUT $leaf IS AN OUTGROUP
{
$leaf_goids = $GOID->{$leaf_id};
$leaf2_goids= $GOID->{$leaf2_id};
@leaf_goids = split(/\,/,$leaf_goids);
@leaf2_goids= split(/\,/,$leaf2_goids);
# CHECK WHETHER THE TWO SEQUENCES HAVE ANY GO IDS IN COMMON:
for ($k = 0; $k <= $#leaf_goids; $k++)
{
for ($m = 0; $m <= $#leaf2_goids; $m++)
{
if ($leaf_goids[$k] eq $leaf2_goids[$m])
{
if (!($SEEN_GOID{$family_id."=".$lca_name."=".$leaf_goids[$k]}))
{
$SEEN_GOID{$family_id."=".$lca_name."=".$leaf_goids[$k]} = 1;
print "$family_id: assigning $leaf_goids[$k] to node $lca_name ($leaf_id $leaf2_id)\n";
if (!($GOID->{$family_id."=".$lca_name})) { $GOID->{$family_id."=".$lca_name} = $leaf_goids[$k];}
else { $GOID->{$family_id."=".$lca_name} = $GOID->{$family_id."=".$lca_name}.",".$leaf_goids[$k];}
}
}
}
}
}
}
}
}
}
# GET ALL THE NODES IN THE TREE:
print "Getting all nodes in tree for $family_id...\n";
@nodes = $tree->get_all_nodes;
foreach $node(@nodes)
{
$node_name = $node->internalID;
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
$node_type = $node->get_tag_value('D');
if ($node_type eq 'Y') # THE NODE IS A DUPLICATION:
{
# GET THE TWO DAUGHTER NODES OF $node:
$no_daughters = 0;
$daughter1 = "";
$daughter2 = "";
foreach $daughter ($node->children)
{
$no_daughters++;
$daughter_name = $daughter->internalID;
if ($no_daughters == 1) { $daughter1_name = $daughter_name;}
elsif ($no_daughters == 2) { $daughter2_name = $daughter_name;}
@subnodes = &Avril_modules::empty_array(@subnodes);
@subnodes = (@subnodes,$daughter);
%SEEN_SUBNODE = ();
$SEEN_SUBNODE{$daughter} = 1;
# GET ALL THE DESCENDANTS OF $daughter:
LOOP_AGAIN1:
$new = 0;
for ($p = 0; $p <= $#subnodes; $p++)
{
$subnode = $subnodes[$p];
if (!($subnode->is_leaf))
{
foreach $subsubnode ($subnode->children)
{
if (!($SEEN_SUBNODE{$subsubnode}))
{
$SEEN_SUBNODE{$subsubnode} = 1;
$new++;
@subnodes = (@subnodes,$subsubnode);
}
}
}
}
if ($new > 0) { goto LOOP_AGAIN1;}
# FIND ALL THE GO IDS OF THE DESCENDANTS OF $daughter:
for ($p = 0; $p <= $#subnodes; $p++)
{
$subnode = $subnodes[$p];
if (!($subnode->is_leaf)) # IF IT IS NOT A LEAF NODE.
{
$subnode_name = $subnode->internalID;
$subnode_name = $family_id."=".$subnode_name;
# CHECK IF THERE ARE GO SLIM IDS FOR THIS NODE:
if ($GOID->{$subnode_name})
{
print "$family_id = node $node_name => $daughter_name => $subnode_name: go $GOID->{$subnode_name}\n";
if ($no_daughters == 1) { $daughter1 = $daughter1.",".$GOID->{$subnode_name};}
elsif ($no_daughters == 2) { $daughter2 = $daughter2.",".$GOID->{$subnode_name};}
}
}
}
}
# COMPARE THE GO IDS FOR $daughter1 AND $daughter2:
if ($daughter1 ne '' && $daughter2 ne '')
{
&compare_goids($daughter1,$daughter2,$node_name,$family_id,$daughter1_name,$daughter2_name);
}
}
}
}
NEXT_FAMILY:
}
}
#------------------------------------------------------------------#
# COMPARE THE GO IDS FOR $daughter1 AND $daughter2:
sub compare_goids
{
my $daughter1 = $_[0];
my $daughter2 = $_[1];
my $node_name = $_[2];
my $family_id = $_[3];
my $daughter1_name = $_[4];
my $daughter2_name = $_[5];
my @ids1;
my @ids2;
my %ID1 = ();
my $i;
my $same = 1;
my %SEEN = ();
$daughter1 = substr($daughter1,1,length($daughter1)-1);
$daughter2 = substr($daughter2,1,length($daughter2)-1);
@ids1 = split(/\,/,$daughter1);
@ids2 = split(/\,/,$daughter2);
%SEEN = ();
$daughter1 = "";
for ($i = 0; $i <= $#ids1; $i++)
{
if (!($SEEN{$ids1[$i]})) { $daughter1 = $daughter1.",".$ids1[$i]; $SEEN{$ids1[$i]} = 1;}
}
$daughter1 = substr($daughter1,1,length($daughter1)-1);
%SEEN = ();
$daughter2 = "";
for ($i = 0; $i <= $#ids2; $i++)
{
if (!($SEEN{$ids2[$i]})) { $daughter2 = $daughter2.",".$ids2[$i]; $SEEN{$ids2[$i]} = 1;}
}
$daughter2 = substr($daughter2,1,length($daughter2)-1);
@ids1 = &Avril_modules::empty_array(@ids1);
@ids2 = &Avril_modules::empty_array(@ids2);
@ids1 = split(/\,/,$daughter1);
@ids2 = split(/\,/,$daughter2);
for ($i = 0; $i <= $#ids1; $i++) { $ID1{$ids1[$i]} = 1;}
for ($i = 0; $i <= $#ids2; $i++) { if (!($ID1{$ids2[$i]})) { $same = 0;}}
if ($same == 0)
{
print "$family_id node $node_name : daughter1 $daughter1 ($daughter1_name) and daughter2 $daughter2 ($daughter2_name) have different GO ids! ***\n";
}
else
{
print "$family_id node $node_name : daughter1 $daughter1 and daughter2 $daughter2 have same GO ids.\n";
}
}
#------------------------------------------------------------------#
# READ IN THE INPUT FILE OF GO TERMS:
sub read_go
{
my $file = $_[0];
my $TAKE = $_[1];
my $line;
my @temp;
my $family;
my $species;
my $goid;
my %GOID = ();
my %SEEN = ();
my @families;
my $name;
open(FILE,"$file") || die "ERROR: cannot open $file.\n";
while(<FILE>)
{
$line = $_;
chomp $line;
@temp = split(/\s+/,$line);
if (substr($line,0,6) eq 'Family') # Family TF101001 : DM CYCB has GO id GO:0000910
{
$family = $temp[1]; # eg. TF101001
$species = $temp[3]; # eg. DM
$goid = $temp[8]; # eg. GO:0000910
$name = $temp[9]; # eg. CG3510-RA.3
if ($TAKE->{$family})
{
if (!($GOID{$name})) { $GOID{$name} = $goid;}
else {$GOID{$name} = $GOID{$name}.",".$goid;}
if ($VERBOSE2 == 1)
{
print "Family $family gene $name GO $GOID{$name}\n";
}
if (!($SEEN{$family}))
{
$SEEN{$family} = 1;
@families = (@families,$family);
}
}
}
}
close(FILE);
return(\%GOID,\@families);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment