Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 15:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5065269 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5065269 to your computer and use it in GitHub Desktop.
Perl script that, given a list of Caenorhabditis elegans paralog pairs, uses the TreeFam tree that they are in to calculate information about the paralogs and the tree
#! /usr/bin/perl
#
# Perl script count_worm_paralogs2b.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 1-Oct-08.
#
# This perl script reads in the trees in TreeFam-6 that a pair of
# worm paralogs are in, and calculates information about the paralogs
# and the tree.
#
# The command-line format is:
# % perl <count_worm_paralogs2.pl> <list> <taxa>
# where <list> is the list of paralog pairs,
# <taxa> is the list of taxa in which to count duplications/speciations.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 2)
{
print "Usage of count_worm_paralogs2b.pl\n\n";
print "perl count_worm_paralogs2b.pl <list> <taxa>\n";
print "where <list> is the list of paralog pairs,\n";
print " <taxa> is the list of taxa in which to count duplications/speciations\n";
print "For example, >perl count_worm_paralogs2b.pl ancient_paralogs treefam56_species\n";
exit;
}
# READ IN MY PERL MODULES:
BEGIN {
unshift (@INC, '/nfs/team71/phd/alc/perl/modules');
}
use lib "/nfs/team71/phd/alc/perl/treefam/Treefam_api_3may07/"; # xxx make sure it points to treefam-6
use Avril_modules;
use Treefam::DBConnection;
use DBI;
# FIND THE LIST OF PARALOG PAIRS:
$list = $ARGV[0];
# FIND THE LIST OF TAXA IN WHICH TO COUNT SPECIATIONS/DUPLICATIONS:
$taxa = $ARGV[1];
#------------------------------------------------------------------#
# IF $VERBOSE == 1, PRINT OUT EXTRA INFORMATION:
$VERBOSE = 0;
#------------------------------------------------------------------#
$version = "6"; # xxx
$db = "dbi:mysql:treefam_".$version.":db.treefam.org:3308";
$dbh = DBI->connect("$db", 'anonymous', '') || return;
# READ IN THE LIST OF TAXA IN WHICH TO COUNT SPECIATIONS/DUPLICATIONS:
%TAKE = ();
open(TAXA,"$taxa") || die "ERROR: cannot open $taxa\n";
while(<TAXA>)
{
$line = $_;
chomp $line;
@temp = split(/_/,$line);
$taxid = $temp[0]; # eg. 10090
$taxname = $temp[1]; # eg. Mus musculus
# SEE IF THERE IS A SWCODE FOR THIS $taxid:
$table_w = 'species';
$st = "SELECT SWCODE from $table_w WHERE TAX_ID=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($taxid) or die "Cannot execute the query: $sth->errstr";
$SWCODE = "none";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$SWCODE = $array[0];
}
}
$taxname =~ tr/[a-z]/[A-Z]/;
$SWCODE =~ tr/[a-z]/[A-Z]/;
# WE WANT TO IGNORE SPECIATIONS GIVING RISE TO THE DROSOPHILA SPECIES OR PLATYPUS, SINCE THESE
# SPECIATIONS ARE NOT SHARED BETWEEN TREEFAM5/6.
# KEEP THERIA, DISCARD MAMMALIA. THIS IS BECAUSE THE BRANCH OFF TO PLATYPUS IS USUALLY MAMMALIA.
# SOPHOPHORA AND DROSOPHILA SPECIATIONS OCCUR IN EXTRA SPECIATIONS ONLY FOUND IN TREEFAM-6.
if ($SWCODE eq 'NONE' && $taxname ne 'SOPHOPHORA' && $taxname ne 'MAMMALIA' && $taxname ne 'DROSOPHILA')
{
$TAKE{$taxname} = 1; # eg. HOMO/PAN/GORILLA GROUP
@temp = split(/\s+/,$taxname);
$taxname = $temp[0]; # eg. HOMO/PAN/GORILLA
$TAKE{$taxname} = 1;
}
else
{
$TAKE{$SWCODE} = 1;
}
}
close(TAXA);
#------------------------------------------------------------------#
# READ IN THE GENES IN THE LIST OF PARALOG PAIRS:
%SEEN_GENE = ();
open(LIST,"$list") || die "ERROR: cannot open $list\n";
while(<LIST>)
{
$line = $_;
chomp $line;
@temp = split(/\s+/,$line);
$gene1 = $temp[0];
$gene2 = $temp[1];
if (!($SEEN_GENE{$gene1})) { @genes = (@genes,$gene1); $SEEN_GENE{$gene1} = 1;}
if (!($SEEN_GENE{$gene2})) { @genes = (@genes,$gene2); $SEEN_GENE{$gene2} = 1;}
}
close(LIST);
$dbh = DBI->connect("dbi:mysql:treefam_6:db.treefam.org:3308", 'anonymous', '') || return;
#------------------------------------------------------------------#
%GENES = (); # HASH TABLE TO RECORD THE PARALOGS THAT FAMILIES CONTAIN.
%FAMILY = ();
%GID = ();
# FOR EACH GENE IN THE LIST OF GENES, FIND OUT WHAT FAMILY IT IS IN:
for ($i = 0; $i <= $#genes; $i++)
{
$gene = $genes[$i];
# FIND OUT WHAT IS THE TREEFAM ID FOR THIS GENE:
$table_w = "genes";
$st = "SELECT ID,GID from $table_w WHERE TID=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($gene) or die "Cannot execute the query: $sth->errstr";
$ID = "none";
$GID = "none";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$ID = $array[0];
$GID = $array[1];
}
}
# SOME CASES ARE STORED IN TREEFAM-6 UNDER ISOFORM NAMES:
if ($gene eq 'F22D6.3') { $ID = 'F22D6.3a'; $GID = 'WBGene00003815';}
elsif ($gene eq 'C24B5.2') { $ID = 'C24B5.2a'; $GID = 'WBGene00016045';}
elsif ($gene eq 'Y47D9A.1'){ $ID = 'Y47D9A.1a'; $GID = 'WBGene00021628';}
elsif ($gene eq 'F25B5.4') { $ID = 'F25B5.4a'; $GID = 'WBGene00006727';}
elsif ($gene eq 'F54H12.1'){ $ID = 'F54H12.1a'; $GID = 'WBGene00000041';}
elsif ($gene eq 'Y77E11A.13'){$ID= 'Y77E11A.13a'; $GID = 'WBGene00003806';}
elsif ($gene eq 'Y17G7B.5'){ $ID = 'Y17G7B.5a'; $GID = 'WBGene00003154';}
elsif ($gene eq 'T10B5.5') { $ID = 'T10B5.5a'; $GID = 'WBGene00020391';}
elsif ($gene eq 'C07G2.3') { $ID = 'C07G2.3a'; $GID = 'WBGene00000380';}
elsif ($gene eq 'T19A6.2') { $ID = 'T19A6.2a'; $GID = 'WBGene00003596';}
if ($ID eq 'none' || $GID eq 'none') { print STDERR "ERROR: did not find ID for gene $gene\n"; exit;}
if ($GID{$gene}) { print STDERR "ERROR: already have GID for $gene\n"; exit;}
$GID{$gene} = $GID;
# FIND OUT WHAT FAMILY THIS GENE BELONGS TO:
$table_w = "fam_genes";
$st = "SELECT AC from $table_w WHERE ID=?";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute($ID) or die "Cannot execute the query: $sth->errstr";
$ID = "none";
$the_AC = "none";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$AC = $array[0];
if ($the_AC eq 'none') { $the_AC = $AC;}
elsif ($the_AC ne 'none')
{
if ($AC =~ /TF1/ && ($the_AC =~ /TF3/ || $the_AC =~ /TF5/)) { $the_AC = $AC;}
elsif ($AC =~ /TF3/ && $the_AC =~ /TF5/) { $the_AC = $AC;}
}
}
}
if ($the_AC eq 'none') { print STDERR "ERROR: did not find family for gene $gene ID $ID\n"; exit;}
# RECORD THE PARALOGS IN THIS FAMILY:
if ($GID eq 'none') { print STDERR "ERROR: gene $gene GID $GID\n"; exit;}
if (!($GENES{$the_AC})) { $GENES{$the_AC} = $GID; }
else { $GENES{$the_AC} = $GENES{$the_AC}.",".$GID;}
# RECORD THE FAMILY FOR THIS GENE:
if (!($FAMILY{$gene})) { $FAMILY{$gene} = $the_AC;}
else { print STDERR "ERROR: already have family for $gene\n"; exit;}
}
print "FAMILY SPECIES GENEI GENEJ GENES SPECIESGENES DUPNODES SPECNNODES GENESCLADE SPECIESGENESCLADE DUPNODESCLADE SPECNODESCLADE DUPNODESI SPECNODESI DUPNODESJ SPECNODESJ DUPNODESIROOT SPECNODESIROOT DUPNODESJROOT SPECNODESJROOT SPECIATIONS SPECIATIONSI SPECIATIONSJ SPECIATIONSIROOT SPECIATIONSJROOT\n";
$dbc = Treefam::DBConnection->new();
%NUM_GENES_IN_TREE = ();
%NUM_SPECIES_GENES_IN_TREE = ();
%NUM_DUP_NODES_IN_TREE = ();
%NUM_SPEC_NODES_IN_TREE = ();
%SPECIATIONS_IN_TREE = ();
%NUM_DUP_NODES_I = ();
%NUM_SPEC_NODES_I = ();
%SPECIATIONS_I = ();
# LOOK AT PARALOGS WITHIN EACH FAMILY:
foreach $treefam_family (keys %GENES)
{
print STDERR "Looking at family $treefam_family... \n";
$famh = $dbc->get_FamilyHandle();
if (!(defined($famh->get_by_id($treefam_family))))
{
print "WARNING: there is no family $treefam_family...\n";
goto NEXT_FAMILY;
}
$family = $famh->get_by_id($treefam_family);
if (!(defined($family->ID())))
{
print "WARNING: do not have ID stored for family $treefam_family...\n";
goto NEXT_FAMILY;
}
$AC = $family->ID(); # GET THE FAMILY ID.
if ($AC ne $treefam_family) { print STDERR "ERROR: AC $AC treefam_family $treefam_family\n"; exit;}
$no_families++;
print "\n$no_families: reading family $AC...\n";
# GET THE TREE FOR THE FAMILY:
if (!(defined($family->get_tree('clean'))))
{
print "WARNING: AC $AC: there is no tree!\n";
goto NEXT_FAMILY;
}
$tree = $family->get_tree('clean'); # GET THE TREEFAM CLEAN TREE.
@nodes = $tree->get_all_nodes();
#---------------------------------------------------------------------------------#
# COUNT HOW MANY GENES, AND HOW MANY GENES FROM C. ELEGANS, ARE IN THE WHOLE TREE:
$num_genes_in_tree = 0;
$num_species_genes_in_tree = 0;
%SEEN_GENE = ();
foreach $node(@nodes)
{
if (defined($node->sequence_id())) # IF IT IS A LEAF.
{
# FIND THE SPECIES FOR THIS TRANSCRIPT:
if (defined($node->get_tag_value('S')))
{
$species = $node->get_tag_value('S');
}
else
{
print "WARNING: AC $AC: do not know species of node\n";
$species = "none";
}
$species =~ tr/[a-z]/[A-Z]/;
# SEE IF THIS IS FROM A SPECIES THAT WE ARE INCLUDING:
if ($TAKE{$species})
{
# FIND THE GENE FOR THIS TRANSCRIPT:
$gh = $node->gene;
$gene = $gh->ID(); # eg. ENSMUSG00000022770
if (!($SEEN_GENE{$gene}))
{
$num_genes_in_tree++;
if ($species eq 'CAEEL') { $num_species_genes_in_tree++;}
$SEEN_GENE{$gene} = 1;
}
}
else
{
if ($VERBOSE == 1) { print "treefam_family $treefam_family excluding $species gene\n";}
}
}
}
if ($NUM_GENES_IN_TREE{$treefam_family}) { print STDERR "ERROR: already have #genes in $treefam_family\n"; exit;}
if ($NUM_SPECIES_GENES_IN_TREE{$treefam_family}) { print STDERR "ERROR: already have #CAEEL genes in $treefam_family\n"; exit;}
$NUM_GENES_IN_TREE{$treefam_family} = $num_genes_in_tree;
$NUM_SPECIES_GENES_IN_TREE{$treefam_family} = $num_species_genes_in_tree;
#---------------------------------------------------------------------------------#
# COUNT HOW MANY DUPLICATION/SPECIATION NODES ARE IN THE WHOLE TREE:
$num_dup_nodes_in_tree = 0;
$num_spec_nodes_in_tree = 0;
foreach $node(@nodes)
{
if (!(defined($node->sequence_id()))) # IF IT IS NOT A LEAF
{
# FIND THE SPECIES FOR THIS NODE:
if (defined($node->get_tag_value('S')))
{
$species = $node->get_tag_value('S');
}
else
{
print "WARNING: treefam_family $treefam_family: do not know species of node\n";
$species = "none";
}
$species =~ tr/[a-z]/[A-Z]/;
# IF THIS IS A SPECIES WE WANT TO TAKE:
if ($TAKE{$species})
{
if (defined($node->get_tag_value('D')))
{
$node_type = $node->get_tag_value('D'); # Y FOR DUPLICATION, N FOR SPECIATION
# IF IT IS A DUPLICATION NODE (Y)
if ($node_type eq 'Y') { $num_dup_nodes_in_tree++; }
elsif ($node_type eq 'N') # IT IS A SPECIATION EVENT:
{
$num_spec_nodes_in_tree++;
if (!($SPECIATIONS_IN_TREE{$treefam_family}))
{
$SPECIATIONS_IN_TREE{$treefam_family} = $species;
}
else
{
$SPECIATIONS_IN_TREE{$treefam_family} = $SPECIATIONS_IN_TREE{$treefam_family}.",".$species;
}
}
else { print STDERR "ERROR: node_type $node_type treefam_family $treefam_family\n"; exit;}
}
else # THIS MUST BE A LEAF NODE OR HAS NOT BEEN MARKED AS A SPECIATION BY MISTAKE.
{
$num_spec_nodes_in_tree++;
if (!($SPECIATIONS_IN_TREE{$treefam_family}))
{
$SPECIATIONS_IN_TREE{$treefam_family} = $species;
}
else
{
$SPECIATIONS_IN_TREE{$treefam_family} = $SPECIATIONS_IN_TREE{$treefam_family}.",".$species;
}
}
}
else
{
if ($VERBOSE == 1) { print "treefam_family $treefam_family excluding node $species\n";}
}
}
}
if ($NUM_DUP_NODES_IN_TREE{$treefam_family}) { print STDERR "ERROR: already have #dup nodes in $treefam_family\n"; exit;}
if ($NUM_SPEC_NODES_IN_TREE{$treefam_family}) { print STDERR "ERROR: already have #spec nodes in $treefam_family\n"; exit;}
$NUM_DUP_NODES_IN_TREE{$treefam_family} = $num_dup_nodes_in_tree;
$NUM_SPEC_NODES_IN_TREE{$treefam_family} = $num_spec_nodes_in_tree;
#---------------------------------------------------------------------------------#
# GET ALL THE TRANSCRIPTS IN THIS FAMILY CORRESPONDING TO THE PARALOGS THAT WE
# ARE INTERESTED IN:
$species_genes = $GENES{$treefam_family};
@species_genes = split(/\,/,$species_genes);
@species_nodes = &Avril_modules::empty_array(@species_nodes);
@leaves = $tree->get_leaves;
foreach $species_gene(@species_genes)
{
$species_node = "none";
foreach $leaf(@leaves)
{
# FIND THE GENE FOR THIS NODE:
$gh = $leaf->gene;
$gene = $gh->ID(); # eg. ENSMUSG00000022770
if ($gene eq $species_gene)
{
$species_node = $leaf;
goto FOUND_NODE;
}
}
FOUND_NODE:
if ($species_node eq "none") { print STDERR "ERROR: did not find node for $species_gene\n"; exit;}
@species_nodes = (@species_nodes,$species_node);
}
# GET THE ROOT NODE OF THE TREE:
$root_node = $tree->root();
# FOR EACH OF THE PARALOGS OF INTEREST IN THE TREE, GATHER SOME INFORMATION:
for ($i = 0; $i <= $#species_nodes; $i++)
{
$species_node_i = $species_nodes[$i];
$species_gene_i = $species_genes[$i];
#---------------------------------------------------------------------------#
# FIND THE NUMBER OF DUPLICATIONS/SPECIATIONS BETWEEN $species_node_i AND $root_node:
$num_dup_nodes_i_root = 0;
$num_spec_nodes_i_root = 0;
$node_i = $species_node_i;
FIND_NUM_DUP_NODES_I_ROOT:
# NOTE THAT WE WANT TO INCLUDE THE ROOT NODE OF THESE TREES IN THE COUNT.
if (defined($node_i->parent))
{
$parent = $node_i->parent;
# FIND THE SPECIES FOR THIS NODE:
if (defined($parent->get_tag_value('S')))
{
$species = $parent->get_tag_value('S');
}
else
{
print "WARNING: treefam_family $treefam_family: do not know species of node\n";
$species = "none";
}
$species =~ tr/[a-z]/[A-Z]/;
# IF THIS IS A SPECIES WE WANT TO TAKE:
if ($TAKE{$species})
{
if (defined($parent->get_tag_value('D')))
{
$type_i = $parent->get_tag_value('D'); # Y FOR DUPLICATION, N FOR SPECIATION
# IF IT IS A DUPLICATION NODE (Y)
if ($type_i eq 'Y') { $num_dup_nodes_i_root++; }
elsif ($type_i eq 'N')
{
$num_spec_nodes_i_root++;
if (!($SPECIATIONS_I{$species_gene_i}))
{
$SPECIATIONS_I{$species_gene_i} = $species;
}
else
{
$SPECIATIONS_I{$species_gene_i} = $SPECIATIONS_I{$species_gene_i}.",".$species;
}
}
else { print STDERR "ERROR: type_i $type_i treefam_family $treefam_family\n"; exit;}
}
else # IF THE NODE TYPE IS NOT DEFINED, ASSUME IT IS A SPECIATION NODE.
{
$num_spec_nodes_i_root++;
if (!($SPECIATIONS_I{$species_gene_i}))
{
$SPECIATIONS_I{$species_gene_i} = $species;
}
else
{
$SPECIATIONS_I{$species_gene_i} = $SPECIATIONS_I{$species_gene_i}.",".$species;
}
}
}
else
{
if ($VERBOSE == 1) { print "treefam_family $treefam_family excluding parent node $species...\n";}
}
$node_i = $parent;
if ($parent eq $root_node)
{
goto FOUND_ROOT1;
}
goto FIND_NUM_DUP_NODES_I_ROOT;
}
FOUND_ROOT1:
# RECORD $num_dup_nodes_i:
if ($NUM_DUP_NODES_I{$species_gene_i}) { print STDERR "ERROR: already have num_dup_nodes_i for $species_gene_i\n"; exit;}
$NUM_DUP_NODES_I{$species_gene_i} = $num_dup_nodes_i_root;
if ($NUM_SPEC_NODES_I{$species_gene_i}) { print STDERR "ERROR: already have num_spec_nodes_i for $species_gene_i\n"; exit;}
$NUM_SPEC_NODES_I{$species_gene_i} = $num_spec_nodes_i_root;
#---------------------------------------------------------------------------#
}
NEXT_FAMILY:
}
#------------------------------------------------------------------#
# PRINT OUT THE INFORMATION FOR EACH PAIR OF GENES:
open(LIST,"$list") || die "ERROR: cannot open $list\n";
while(<LIST>)
{
$line = $_;
chomp $line;
@temp = split(/\s+/,$line);
$gene1 = $temp[0];
$gene2 = $temp[1];
# GET THE FAMILY FOR EACH GENE:
if (!($FAMILY{$gene1})) { print STDERR "ERROR: do not know family for $gene1\n"; exit;}
if (!($FAMILY{$gene2})) { print STDERR "ERROR: do not know family for $gene2\n"; exit;}
$AC1 = $FAMILY{$gene1};
$AC2 = $FAMILY{$gene2};
$AC = $AC1."/".$AC2;
# GET THE NUMBER OF GENES IN THE TREE FOR EACH FAMILY:
if (!($NUM_GENES_IN_TREE{$AC1})) { print STDERR "ERROR: do not know #genes in $AC1\n"; exit;}
if (!($NUM_GENES_IN_TREE{$AC2})) { print STDERR "ERROR: do not know #genes in $AC2\n"; exit;}
$num_genes_in_tree1 = $NUM_GENES_IN_TREE{$AC1};
$num_genes_in_tree2 = $NUM_GENES_IN_TREE{$AC2};
$num_genes_in_tree = $num_genes_in_tree1 + $num_genes_in_tree2;
# GET THE NUMBER OF C. ELEGANS GENES IN THE TREE FOR EACH FAMILY:
if (!($NUM_SPECIES_GENES_IN_TREE{$AC1})) { print STDERR "ERROR: do not know #CAEEL genes in $AC1\n"; exit;}
if (!($NUM_SPECIES_GENES_IN_TREE{$AC2})) { print STDERR "ERROR: do not know #CAEEL genes in $AC2\n"; exit;}
$num_species_genes_in_tree1 = $NUM_SPECIES_GENES_IN_TREE{$AC1};
$num_species_genes_in_tree2 = $NUM_SPECIES_GENES_IN_TREE{$AC2};
$num_species_genes_in_tree = $num_species_genes_in_tree1 + $num_species_genes_in_tree2;
# GET THE NUMBER OF DUPLICATION AND SPECIATION NODES IN EACH FAMILY:
if (!($NUM_DUP_NODES_IN_TREE{$AC1})) { $num_dup_nodes_in_tree1 = 0; }
else { $num_dup_nodes_in_tree1 = $NUM_DUP_NODES_IN_TREE{$AC1};}
if (!($NUM_DUP_NODES_IN_TREE{$AC2})) { $num_dup_nodes_in_tree2 = 0; }
else { $num_dup_nodes_in_tree2 = $NUM_DUP_NODES_IN_TREE{$AC2};}
# WE ADD ONE EXTRA DUPLICATION NODE FOR THE INFERRED DUPLICATION NODE THAT GAVE RISE TO THE TWO PARALOGS:
$num_dup_nodes_in_tree = $num_dup_nodes_in_tree1 + $num_dup_nodes_in_tree2 + 1;
if (!($NUM_SPEC_NODES_IN_TREE{$AC1})) { $num_spec_nodes_in_tree1 = 0; }
else { $num_spec_nodes_in_tree1 = $NUM_SPEC_NODES_IN_TREE{$AC1};}
if (!($NUM_SPEC_NODES_IN_TREE{$AC2})) { $num_spec_nodes_in_tree2 = 0; }
else { $num_spec_nodes_in_tree2 = $NUM_SPEC_NODES_IN_TREE{$AC2};}
$num_spec_nodes_in_tree = $num_spec_nodes_in_tree1 + $num_spec_nodes_in_tree2;
# GET THE NUMBER OF DUPLICATION AND SPECIATION NODES FOR THE CLADE DEFINED BY THE DUPLICATION
# NODE THAT GAVE RISE TO THE TWO PARALOGS (TWO FAMILIES):
# $num_genes_in_clade IS THE NUMBER OF GENES IN BOTH TREES TOGETHER:
$num_genes_in_clade = $num_genes_in_tree;
# $num_species_genes_in_clade IS THE NUMBER OF C. ELEGANS GENES IN BOTH TREES TOGETHER:
$num_species_genes_in_clade = $num_species_genes_in_tree;
# $num_dup_nodes_in_clade IS THE NUMBER OF DUPLICATION NODES IN BOTH TREES TOGETHER:
$num_dup_nodes_in_clade = $num_dup_nodes_in_tree;
# $num_spec_nodes_in_clade IS THE NUMBER OF SPECIATION NODES IN BOTH TREES TOGETHER:
$num_spec_nodes_in_clade = $num_spec_nodes_in_tree;
# GET THE NUMBER OF DUPLICATION NODES FROM $gene1 TO THE ROOT OF TREE 1:
if (!($GID{$gene1})) { print STDERR "ERROR: do not know GID for $gene1\n"; exit;}
$GID1 = $GID{$gene1};
if (!($NUM_DUP_NODES_I{$GID1})) { $num_dup_nodes1 = 0; }
else { $num_dup_nodes1 = $NUM_DUP_NODES_I{$GID1}; }
if (!($GID{$gene2})) { print STDERR "ERROR: do not know GID for $gene2\n"; exit;}
$GID2 = $GID{$gene2};
if (!($NUM_DUP_NODES_I{$GID2})) { $num_dup_nodes2 = 0; }
else { $num_dup_nodes2 = $NUM_DUP_NODES_I{$GID2}; }
# GET THE NUMBER OF SPECIATION NODES FROM $gene1 TO THE ROOT OF THE TREE 1:
if (!($NUM_SPEC_NODES_I{$GID1})) { $num_spec_nodes1 = 0; }
else { $num_spec_nodes1 = $NUM_SPEC_NODES_I{$GID1};}
if (!($NUM_SPEC_NODES_I{$GID2})) { $num_spec_nodes2 = 0; }
else { $num_spec_nodes2 = $NUM_SPEC_NODES_I{$GID2};}
# GET THE NUMBER OF SPECIATION EVENTS FROM $gene1 TO THE ROOT OF THE TREE 1:
if (!($SPECIATIONS_I{$GID1})) { $num_spec_events1 = 0; }
else
{
$num_spec_events1 = $SPECIATIONS_I{$GID1};
$num_spec_events1 = &count_speciations($num_spec_events1);
}
if (!($SPECIATIONS_I{$GID2})) { $num_spec_events2 = 0; }
else
{
$num_spec_events2 = $SPECIATIONS_I{$GID2};
$num_spec_events2 = &count_speciations($num_spec_events2);
}
# FIND THE NUMBER OF SPECIATIONS IN THE WHOLE TREE:
if (!($SPECIATIONS_IN_TREE{$AC1})) { print STDERR "ERROR: do not know speciations in $AC1 tree\n"; exit;}
if (!($SPECIATIONS_IN_TREE{$AC2})) { print STDERR "ERROR: do not know speciations in $AC2 tree\n"; exit;}
$speciations1 = $SPECIATIONS_IN_TREE{$AC1};
$speciations2 = $SPECIATIONS_IN_TREE{$AC2};
$speciations = $speciations1.",".$speciations2;
$num_speciations = &count_speciations($speciations);
$num_speciations1 = &count_speciations($speciations1);
$num_speciations2 = &count_speciations($speciations2);
# PRINT OUT THE INFORMATION FOR THIS PAIR OF PARALOGS:
print "$AC CAEEL $gene1 $gene2 $num_genes_in_tree $num_species_genes_in_tree $num_dup_nodes_in_tree $num_spec_nodes_in_tree $num_genes_in_clade $num_species_genes_in_clade $num_dup_nodes_in_clade $num_spec_nodes_in_clade $num_dup_nodes1 $num_spec_nodes1 $num_dup_nodes2 $num_spec_nodes2 $num_dup_nodes1 $num_spec_nodes1 $num_dup_nodes2 $num_spec_nodes2 $num_speciations $num_speciations1 $num_speciations2 $num_spec_events1 $num_spec_events2\n";
}
close(LIST);
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# COUNT SPECIATION EVENTS:
sub count_speciations
{
my $speciations = $_[0];
my @speciations = split(/\,/,$speciations);
my $num = 0;
my %SEEN = ();
my $i;
my $speciation;
for ($i = 0; $i <= $#speciations; $i++)
{
$speciation = $speciations[$i];
if (!($SEEN{$speciation}))
{
$num++;
$SEEN{$speciation} = 1;
}
}
return($num);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment