Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 16:52
Show Gist options
  • Save avrilcoghlan/5066009 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5066009 to your computer and use it in GitHub Desktop.
Perl script that, given a file with GO annotations for sequences in TreeFam families, and a list of families, infers the GO annotations for ancestral nodes in the trees for those families.
#!/usr/local/bin/perl
#
# Perl script treefam_infer_ancestral_GOids.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 2-Mar-07.
#
# For the TreeFam project.
#
# This perl script infers the GO ids of ancestral nodes
# in a list of TreeFam trees, given the GO ids of the
# leaves.
#
# The command-line format is:
# % perl <treefam_infer_ancestral_GOids.pl> GO families
# where GO has the GO ids of the leaves,
# families is the list of TreeFam families.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 2)
{
print "Usage of treefam_infer_ancestral_GOids.pl\n\n";
print "perl treefam_infer_ancestral_GOids.pl <GO> <families>\n";
print "where <GO> contains the GO ids of the leaves,\n";
print " <families> is the list of TreeFam families.\n";
print "For example, >perl treefam_infer_ancestral_GOids.pl GO_slim_terms11 neural_genes\n";
exit;
}
# READ IN MY PERL MODULES:
BEGIN {
unshift (@INC, '/nfs/team71/phd/alc/perl/modules');
}
use lib "/nfs/team71/phd/alc/perl/treefam/Treefam_api_2mar07/";
use Avril_modules;
use Treefam::DBConnection;
# FIND THE NAME OF THE INPUT FILE OF GO TERMS ASSIGNED TO LEAVES:
$GO = $ARGV[0];
# FIND THE NAME OF THE LIST OF TREEFAM FAMILIES:
$fams = $ARGV[1];
$VERBOSE = 0;
#------------------------------------------------------------------#
# READ IN THE LIST OF FAMILIES:
%TAKE = ();
open(FAMS,"$fams") || die "ERROR: cannot open $fams.\n";
while(<FAMS>)
{
$line = $_;
chomp $line;
@temp = split(/\s+/,$line);
if (substr($line,0,2) eq 'TF')
{
$family = $temp[0];
$TAKE{$family} = 1;
}
}
close(FAMS);
# READ IN THE INPUT FILE OF GO TERMS:
($GOID,$families) = &read_go($GO,\%TAKE);
print STDERR "Read in file $GO...\n";
# NOW INFER THE GO IDS OF ANCESTRAL NODES IN THESE FAMILIES:
&infer_ancestral_nodes($GOID,$families);
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# INFER THE GO IDS OF ANCESTRAL NODES IN THESE FAMILIES:
sub infer_ancestral_nodes
{
my $GOID = $_[0]; #
my $families = $_[1]; #
my $i; #
my $family_id; #
my $tree; #
my $dbc; #
my $family; #
my $famh; #
my $leaf; #
my $leaf2; #
my $leaf_name; #
my $leaf2_name; #
my $leaf2_species; #
my $leaf_species; #
my $leaf2_id; #
my $leaf_id; #
my @temp; #
my $lca; #
my $lca_name; #
my $lca_type; #
my $parent; #
my $parent_name; #
my $parent_type; #
my $duplications1; #
my $duplications2; #
my $leaf_goids; #
my $leaf2_goids; #
my @leaf_goids; #
my @leaf2_goids; #
my $root; #
my $root_name; #
my $leaf_steps_to_root; #
my $leaf2_steps_to_root; #
my $k; #
my $m; #
my %SEEN_GOID = (); #
my @nodes; #
my $node; #
my $node_name; #
my $node_type; #
my $no_daughters; #
my $daughter1; #
my $daughter2; #
my $daughter1_name; #
my $daughter2_name; #
my $daughter; #
my $daughter_name; #
my @subnodes; #
my %SEEN_SUBNODE = (); #
my $subnode; #
my $subnode_name; #
my $new; #
my $p; #
my $subsubnode; #
# CONNECT TO THE TREEFAM DATABASE USING DEFAULT CONFIGURATION:
$dbc = Treefam::DBConnection->new();
for ($i = 0; $i < @$families; $i++)
{
$family_id = $families->[$i];
# GET A FAMILY:
$famh = $dbc->get_FamilyHandle();
$family = $famh->get_by_id($family_id);
if (!(defined($family)))
{
print "Do not have $family_id in the database...\n";
goto NEXT_FAMILY;
}
# GET THE CLEAN TREE FOR THIS FAMILY:
$tree = $family->get_tree('clean');
# FIND THE ROOT OF THE TREE:
$root = $tree->root;
$root_name = $root->internalID;
# LIST ALL THE LEAVES FROM THE TREE:
foreach $leaf($tree->get_leaves)
{
# COMPARE TO ALL OTHER LEAVES IN THE TREE:
foreach $leaf2($tree->get_leaves)
{
# SEE IF $leaf AND $leaf2 HAVE GO IDS ASSIGNED:
$leaf_name = $leaf->id();
$leaf2_name = $leaf2->id();
$leaf_id = $leaf->get_tag_value('O');
$leaf2_id = $leaf2->get_tag_value('O');
@temp = split(/_/,$leaf_name);
$leaf_species = $temp[1];
@temp = split(/_/,$leaf2_name);
$leaf2_species = $temp[1];
# IF THESE LEAVES BELONG TO SPECIES THAT WE ARE INTERESTED IN:
if (($leaf_species eq 'DROME' || $leaf_species eq 'MOUSE' || $leaf_species eq 'RAT' ||
$leaf_species eq 'CAEEL' || $leaf_species eq 'HUMAN' || $leaf_species eq 'BRARE' ||
$leaf_species eq 'CHICK') &&
($leaf2_species eq 'DROME' || $leaf2_species eq 'MOUSE' || $leaf2_species eq 'RAT' ||
$leaf2_species eq 'CAEEL' || $leaf2_species eq 'HUMAN' || $leaf2_species eq 'BRARE' ||
$leaf2_species eq 'CHICK'))
{
# IF BOTH LEAVES HAVE GO IDS ANNOTATED:
if ($GOID->{$leaf_id} && $GOID->{$leaf2_id} && $leaf_id ne $leaf2_id)
{
# FIND THE LAST COMMON ANCESTOR NODE OF $leaf AND $leaf2:
$lca = $tree->get_last_common_ancestor($leaf,$leaf2);
$lca_name= $lca->internalID;
$lca_type = $lca->get_tag_value('D');
if ($lca_type eq 'Y') # THE ANCESTRAL NODE IS A DUPLICATION:
{
$duplications1 = 1; $duplications2 = 1;
$leaf_steps_to_root = "-1"; $leaf2_steps_to_root = -1;
goto FOUND_ROOT2;
}
# SEE IF THERE ARE ANY DUPLICATION NODES BETWEEN $leaf AND $lca:
$duplications1 = 0;
if (defined($leaf->parent)) { $parent = $leaf->parent; }
else { print STDERR "ERROR: do not know parent of leaf $leaf_id.\n"; exit;}
# $parent_type HAS VALUE Y FOR DUPLICATION, N FOR SPECIATION.
$parent_type = $parent->get_tag_value('D');
if ($parent_type eq 'Y') { $duplications1 = 1;}
$parent_name = $parent->internalID;
if ($parent_name eq $lca_name) { goto FOUND_LCA1;}
FIND_PARENT1:
if (defined($parent->parent))
{
$parent = $parent->parent;
$parent_type = $parent->get_tag_value('D');
if ($parent_type eq 'Y') { $duplications1 = 1;}
$parent_name = $parent->internalID;
if ($parent_name eq $lca_name) { goto FOUND_LCA1;}
goto FIND_PARENT1;
}
FOUND_LCA1:
# FIND HOW MANY STEPS THERE ARE BETWEEN $leaf AND THE ROOT $root:
$leaf_steps_to_root = 0;
if (defined($leaf->parent)) { $parent = $leaf->parent; }
else { print STDERR "ERROR: do not know parent of leaf $leaf_id.\n"; exit;}
$parent_type = $parent->get_tag_value('D');
if ($parent_type eq 'N') { $leaf_steps_to_root++;}
$parent_name = $parent->internalID;
if ($parent_name eq $root_name) { goto FOUND_ROOT1;}
FIND_ROOT1:
if (defined($parent->parent))
{
$parent = $parent->parent;
$parent_type = $parent->get_tag_value('D');
if ($parent_type eq 'N') { $leaf_steps_to_root++;}
$parent_name = $parent->internalID;
if ($parent_name eq $root_name) { goto FOUND_ROOT1;}
goto FIND_ROOT1;
}
FOUND_ROOT1:
# SEE IF THERE ARE ANY DUPLICATION NODES BETWEEN $leaf2 AND $lca:
$duplications2 = 0;
if (defined($leaf2->parent)) { $parent = $leaf2->parent; }
else { print STDERR "ERROR: do not know parent of leaf2 $leaf2_id.\n"; exit;}
# $parent_type HAS VALUE Y FOR DUPLICATION, N FOR SPECIATION.
$parent_type = $parent->get_tag_value('D');
if ($parent_type eq 'Y') { $duplications2 = 1;}
$parent_name = $parent->internalID;
if ($parent_name eq $lca_name) { goto FOUND_LCA2;}
FIND_PARENT2:
if (defined($parent->parent))
{
$parent = $parent->parent;
$parent_type = $parent->get_tag_value('D');
if ($parent_type eq 'Y') { $duplications2 = 1;}
$parent_name = $parent->internalID;
if ($parent_name eq $lca_name) { goto FOUND_LCA2;}
goto FIND_PARENT2;
}
FOUND_LCA2:
# FIND HOW MANY STEPS THERE ARE BETWEEN $leaf2 AND THE ROOT $root:
$leaf2_steps_to_root = 0;
if (defined($leaf2->parent)) { $parent = $leaf2->parent; }
else { print STDERR "ERROR: do not know parent of leaf2 $leaf2_id.\n"; exit;}
$parent_type = $parent->get_tag_value('D');
if ($parent_type eq 'N') { $leaf2_steps_to_root++;}
$parent_name = $parent->internalID;
if ($parent_name eq $root_name) { goto FOUND_ROOT2;}
FIND_ROOT2:
if (defined($parent->parent))
{
$parent = $parent->parent;
$parent_type = $parent->get_tag_value('D');
if ($parent_type eq 'N') { $leaf2_steps_to_root++;}
$parent_name = $parent->internalID;
if ($parent_name eq $root_name) { goto FOUND_ROOT2;}
goto FIND_ROOT2;
}
FOUND_ROOT2:
# IF THERE ARE NO DUPLICATIONS, WE CAN INFER THAT THE NODE $lca
# SHARES THE GO IDS THAT $leaf1 AND $leaf2 HAVE IN COMMON:
if ($VERBOSE == 1)
{
print "\n$family_id $leaf_id $GOID->{$leaf_id} $leaf2_id $GOID->{$leaf2_id}\n";
print "$family_id duplications1 $duplications1 duplications2 $duplications2 lca_type $lca_type lca_name $lca_name\n";
print "$family_id leaf_steps_to_root $leaf_steps_to_root leaf2_steps_to_root $leaf2_steps_to_root\n";
}
if (($duplications1 == 0 && $duplications2 == 0) || # THERE ARE NO DUPLICATIONS BETWEEN THE GENES
($duplications1 == 1 && $duplications2 == 0 &&
$leaf_steps_to_root > $leaf2_steps_to_root) || # THERE ARE DUPLICATIONS BETWEEN
# LCA AND $leaf, BUT $leaf2 IS AN OUTGROUP
($duplications1 == 0 && $duplications2 == 1 &&
$leaf_steps_to_root < $leaf2_steps_to_root)) # THERE ARE DUPLICATIONS BETWEEN
# LCA AND $leaf2, BUT $leaf IS AN OUTGROUP
{
$leaf_goids = $GOID->{$leaf_id};
$leaf2_goids= $GOID->{$leaf2_id};
@leaf_goids = split(/\,/,$leaf_goids);
@leaf2_goids= split(/\,/,$leaf2_goids);
# CHECK WHETHER THE TWO SEQUENCES HAVE ANY GO IDS IN COMMON:
for ($k = 0; $k <= $#leaf_goids; $k++)
{
for ($m = 0; $m <= $#leaf2_goids; $m++)
{
if ($leaf_goids[$k] eq $leaf2_goids[$m])
{
if (!($SEEN_GOID{$family_id."=".$lca_name."=".$leaf_goids[$k]}))
{
$SEEN_GOID{$family_id."=".$lca_name."=".$leaf_goids[$k]} = 1;
print "$family_id: assigning $leaf_goids[$k] to node $lca_name ($leaf_id $leaf2_id)\n";
if (!($GOID->{$family_id."=".$lca_name})) { $GOID->{$family_id."=".$lca_name} = $leaf_goids[$k];}
else { $GOID->{$family_id."=".$lca_name} = $GOID->{$family_id."=".$lca_name}.",".$leaf_goids[$k];}
}
}
}
}
}
}
}
}
}
# GET ALL THE NODES IN THE TREE:
print "Getting all nodes in tree for $family_id...\n";
@nodes = $tree->get_all_nodes;
foreach $node(@nodes)
{
$node_name = $node->internalID;
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE.
{
$node_type = $node->get_tag_value('D');
if ($node_type eq 'Y') # THE NODE IS A DUPLICATION:
{
# GET THE TWO DAUGHTER NODES OF $node:
$no_daughters = 0;
$daughter1 = "";
$daughter2 = "";
foreach $daughter ($node->children)
{
$no_daughters++;
$daughter_name = $daughter->internalID;
if ($no_daughters == 1) { $daughter1_name = $daughter_name;}
elsif ($no_daughters == 2) { $daughter2_name = $daughter_name;}
@subnodes = &Avril_modules::empty_array(@subnodes);
@subnodes = (@subnodes,$daughter);
%SEEN_SUBNODE = ();
$SEEN_SUBNODE{$daughter} = 1;
# GET ALL THE DESCENDANTS OF $daughter:
LOOP_AGAIN1:
$new = 0;
for ($p = 0; $p <= $#subnodes; $p++)
{
$subnode = $subnodes[$p];
if (!($subnode->is_leaf))
{
foreach $subsubnode ($subnode->children)
{
if (!($SEEN_SUBNODE{$subsubnode}))
{
$SEEN_SUBNODE{$subsubnode} = 1;
$new++;
@subnodes = (@subnodes,$subsubnode);
}
}
}
}
if ($new > 0) { goto LOOP_AGAIN1;}
# FIND ALL THE GO IDS OF THE DESCENDANTS OF $daughter:
for ($p = 0; $p <= $#subnodes; $p++)
{
$subnode = $subnodes[$p];
if (!($subnode->is_leaf)) # IF IT IS NOT A LEAF NODE.
{
$subnode_name = $subnode->internalID;
$subnode_name = $family_id."=".$subnode_name;
# CHECK IF THERE ARE GO SLIM IDS FOR THIS NODE:
if ($GOID->{$subnode_name})
{
print "$family_id = node $node_name => $daughter_name => $subnode_name: go $GOID->{$subnode_name}\n";
if ($no_daughters == 1) { $daughter1 = $daughter1.",".$GOID->{$subnode_name};}
elsif ($no_daughters == 2) { $daughter2 = $daughter2.",".$GOID->{$subnode_name};}
}
}
}
}
# COMPARE THE GO IDS FOR $daughter1 AND $daughter2:
if ($daughter1 ne '' && $daughter2 ne '')
{
&compare_goids($daughter1,$daughter2,$node_name,$family_id,$daughter1_name,$daughter2_name);
}
}
}
}
NEXT_FAMILY:
}
}
#------------------------------------------------------------------#
# COMPARE THE GO IDS FOR $daughter1 AND $daughter2:
sub compare_goids
{
my $daughter1 = $_[0];
my $daughter2 = $_[1];
my $node_name = $_[2];
my $family_id = $_[3];
my $daughter1_name = $_[4];
my $daughter2_name = $_[5];
my @ids1;
my @ids2;
my %ID1 = ();
my $i;
my $same = 1;
my %SEEN = ();
$daughter1 = substr($daughter1,1,length($daughter1)-1);
$daughter2 = substr($daughter2,1,length($daughter2)-1);
@ids1 = split(/\,/,$daughter1);
@ids2 = split(/\,/,$daughter2);
%SEEN = ();
$daughter1 = "";
for ($i = 0; $i <= $#ids1; $i++)
{
if (!($SEEN{$ids1[$i]})) { $daughter1 = $daughter1.",".$ids1[$i]; $SEEN{$ids1[$i]} = 1;}
}
$daughter1 = substr($daughter1,1,length($daughter1)-1);
%SEEN = ();
$daughter2 = "";
for ($i = 0; $i <= $#ids2; $i++)
{
if (!($SEEN{$ids2[$i]})) { $daughter2 = $daughter2.",".$ids2[$i]; $SEEN{$ids2[$i]} = 1;}
}
$daughter2 = substr($daughter2,1,length($daughter2)-1);
@ids1 = &Avril_modules::empty_array(@ids1);
@ids2 = &Avril_modules::empty_array(@ids2);
@ids1 = split(/\,/,$daughter1);
@ids2 = split(/\,/,$daughter2);
for ($i = 0; $i <= $#ids1; $i++) { $ID1{$ids1[$i]} = 1;}
for ($i = 0; $i <= $#ids2; $i++) { if (!($ID1{$ids2[$i]})) { $same = 0;}}
if ($same == 0)
{
print "$family_id node $node_name : daughter1 $daughter1 ($daughter1_name) and daughter2 $daughter2 ($daughter2_name) have different GO ids! ***\n";
}
else
{
print "$family_id node $node_name : daughter1 $daughter1 and daughter2 $daughter2 have same GO ids.\n";
}
}
#------------------------------------------------------------------#
# READ IN THE INPUT FILE OF GO TERMS:
sub read_go
{
my $file = $_[0];
my $TAKE = $_[1];
my $line;
my @temp;
my $family;
my $species;
my $goid;
my %GOID = ();
my %SEEN = ();
my @families;
my $name;
open(FILE,"$file") || die "ERROR: cannot open $file.\n";
while(<FILE>)
{
$line = $_;
chomp $line;
@temp = split(/\s+/,$line);
if (substr($line,0,6) eq 'Family') # Family TF101001 : DM CYCB has GO id GO:0000910
{
$family = $temp[1];
$species = $temp[3];
$goid = $temp[8];
$name = $temp[9];
#if ($species eq 'DM') { $name = $name."_DROME";}
#elsif ($species eq 'HS') { $name = $name."_HUMAN";}
#elsif ($species eq 'MM') { $name = $name."_MOUSE";}
#elsif ($species eq 'RN') { $name = $name."_RAT"; }
#elsif ($species eq 'DR') { $name = $name."_BRARE";}
#elsif ($species eq 'CE') { $name = $name."_CAEEL";}
#elsif ($species eq 'GG') { $name = $name."_CHICK";}
if (!($GOID{$name})) { $GOID{$name} = $goid;}
else {$GOID{$name} = $GOID{$name}.",".$goid;}
if (!($SEEN{$family}) && $TAKE->{$family})
{
$SEEN{$family} = 1;
@families = (@families,$family);
}
}
}
close(FILE);
return(\%GOID,\@families);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment