Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Perl script to read a Newick tree file from TreeFam, and print out the orthologs of a particular input gene based on the tree.
#!/usr/bin/perl
#---------------------------------------------------------------------------------------------#
#
# A Perl script to read a Newick tree file from Treefam, and
# print out the orthologues of a particular input gene.
#
# Avril Coghlan. 6-Dec-04.
# avril.coghlan@ucd.ie
#
# For the Treefam project.
#
# The command-line format id:
# % perl <get_orths_from_newick_tree.pl> <tree> <mygene>
# where tree is the tree file,
# mygene is the gene of interest.
#
#---------------------------------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 2)
{
print "Usage of get_orths_from_newick_tree.pl\n\n";
print "perl get_orths_from_newick_tree.pl <tree> <mygene>\n";
print "where <tree> is the tree file,\n";
print " <mygene> is the gene of interest.\n";
print "For example, >perl get_orths_from_newick_tree.pl /nfs/treefam/work/families/TF300307/full.nhx\n";
print "KO1C8.10.1_CAEEL\n";
exit;
}
# FIND THE NAME OF THE INPUT TREE FILE:
$tree_file = $ARGV[0];
# FIND THE NAME OF THE GENE OF INTEREST:
$mygene = $ARGV[1];
#------------------------------------------------------------------#
# PARAMETERS:
$PRINT_BRACKET_DETAILS = 0; # PRINTS OUT EXTRA DETAILS OF THE RUN.
$PRINT_FORK_DETAILS = 0;
# READ IN THE NEWICK TREE:
$tree = read_newick_tree($tree_file);
# FIND THE BIFURCATIONS IN THE TREE:
&find_tree_bifurcations($tree);
# GO THROUGH EACH OF THE BIFURCATIONS AND SEE WHICH BRANCHES THEY SPLIT INTO.
# EACH BIFURCATION IS A NODE WHICH SPLITS TO TWO THINGS, EITHER TWO MORE
# NODES, OR ONE SEQUENCE AND ONE NODE, OR TWO SEQUENCES.
# EACH NODE OR SEQUENCE HAS A PARENT NODE FROM WHICH IT SPLIT OFF,
# AND EACH PARENT NODE HAS TWO CHILD NODES.
&find_what_nodes_split_into;
# FIND THE ORTHOLOGUES FOR EACH SEQUENCE IN THE TREE:
&find_orthologues;
print STDERR "FINISHED.\n";
#---------------------------------------------------------------------------------------------#
sub find_connecting_nodes
{
# FIND THE NODES THAT CONNECT TO A CERTAIN INPUT NODE:
my $node = $_[0];
my @temp;
my @nodes;
my $connecting;
if ($PARENT{$node})
{
$connecting = $PARENT{$node};
if ($connecting ne $node && !($SEEN{$connecting}))
{
@nodes = (@nodes,$connecting);
}
}
if ($node =~ /NODE/) { @temp = split(/NODE/,$node); $node = $temp[1];}
if ($SPLITS_INTO{$node})
{
$connecting = $SPLITS_INTO{$node};
@temp = split(/\,/,$connecting);
if ($temp[0] ne $node && !($SEEN{$temp[0]}))
{
@nodes = (@nodes,$temp[0]);
}
if ($temp[1] ne $node && !($SEEN{$temp[1]}))
{
@nodes = (@nodes,$temp[1]);
}
}
return @nodes;
}
#---------------------------------------------------------------------------------------------#
sub check_if_node_above_outgroup
{
# CHECK IF A CERTAIN NODE IS ABOVE THE OUTGROUP:
my $node = $_[0];
my @seqs;
my $no_seqs;
my $i;
my $node_above_outgroup = 0;
# FIND THE SEQUENCES IN THE CLADE BELOW $node:
@seqs = find_seqs_in_clade($node);
$no_seqs = $#seqs + 1;
for ($i = 0; $i < $no_seqs; $i++)
{
if ($seqs[$i] =~ /YEAST/ || $seqs[$i] =~ /SCHPO/ || $seqs[$i] =~ /ARATH/)
{
$node_above_outgroup = 1;
}
}
return $node_above_outgroup;
}
#---------------------------------------------------------------------------------------------#
sub check_if_node_is_outgroup
{
# CHECK IF A CERTAIN NODE IS ONLY ABOVE OUTGROUP SEQUENCES:
my $node = $_[0];
my @seqs;
my $no_seqs;
my $i;
my $node_is_outgroup = 1;
# FIND THE SEQUENCES IN THE CLADE BELOW $node:
@seqs = find_seqs_in_clade($node);
$no_seqs = $#seqs + 1;
for ($i = 0; $i < $no_seqs; $i++)
{
if (!($seqs[$i] =~ /YEAST/ || $seqs[$i] =~ /SCHPO/ || $seqs[$i] =~ /ARATH/))
{
$node_is_outgroup = 0;
}
}
return $node_is_outgroup;
}
#---------------------------------------------------------------------------------------------#
sub find_orthologues
{
# GO THROUGH EACH SEQUENCE IN THE TREE AND FIND ITS ORTHOLOGUES:
my $no_sequences = $#sequences + 1;
my $i;
my $sequence;
my @nodes;
my $node;
my $found_outgroup;
my $no_nodes;
my $node1;
my $node2;
my $node1_above_outgroup;
my $node2_above_outgroup;
my $node1_is_outgroup;
my $node2_is_outgroup;
my @seqs;
my $no_seqs;
my $j;
my $found_orth;
for ($i = 0; $i < $no_sequences; $i++)
{
$sequence = $sequences[$i];
$found_orth = 0;
if ($sequence ne $mygene) { goto FOUND_OUTGROUP;}
if ($sequence =~ /\_YEAST/ || $sequence =~ /\_SCHPO/ || $sequence =~ /\_ARATH/) { goto FOUND_OUTGROUP;}
print "----------------------------------------------------------\n";
print "FINDING ORTHOLOGUES OF $sequence ...\n";
# FIND THE ORTHOLOGUES FOR $sequence IN THE TREE, BY MOVING UP THE TREE
# NODE BY NODE, UPWARDS FROM $sequence:
$node = $sequence;
$found_outgroup = 0;
%SEEN = ();
while($found_outgroup == 0)
{
# FIND WHICH NODES ARE CONNECTED TO NODE $node;
$SEEN{$node} = 1;
@nodes = find_connecting_nodes($node);
# FIND WHAT SEQUENCES ARE BELOW EACH NODE THAT CONNECTS TO $node:
$no_nodes = $#nodes + 1;
if ($no_nodes == 1)
{
$node = $nodes[0];
}
elsif ($no_nodes == 2)
{
# WE WANT TO CHECK WHETHER EACH OF THE NODES BELOW $node CONTAINS
# THE OUTGROUP OR NOT:
$node1 = $nodes[0];
$node2 = $nodes[1];
# CHECK IF $node1 OR $node2 CONTAINS ONLY OUTGROUP SEQUENCES:
$node1_is_outgroup = check_if_node_is_outgroup($node1);
$node2_is_outgroup = check_if_node_is_outgroup($node2);
# CHECK IF $node1 OR $node2 IS ABOVE THE OUTGROUP:
$node1_above_outgroup = check_if_node_above_outgroup($node1);
$node2_above_outgroup = check_if_node_above_outgroup($node2);
if ($node1_above_outgroup == 1 && $node2_above_outgroup == 0)
{
# ALL THE SEQUENCES UNDER $node2 ARE ORTHOLOGUES OF $sequence:
$found_orth = 1;
@seqs = find_seqs_in_clade($node2);
$no_seqs = $#seqs + 1;
for ($j = 0; $j < $no_seqs; $j++) { print "$sequence ---> $seqs[$j]\n";}
$node = $node1;
}
elsif ($node1_above_outgroup == 0 && $node2_above_outgroup == 1)
{
# ALL THE SEQUENCES UNDER $node1 ARE ORTHOLOGUES OF $sequence:
$found_orth = 1;
@seqs = find_seqs_in_clade($node1);
$no_seqs = $#seqs + 1;
for ($j = 0; $j < $no_seqs; $j++) { print "$sequence ---> $seqs[$j]\n";}
$node = $node2;
}
elsif ($node1_above_outgroup == 1 && $node2_above_outgroup == 1)
{
# THE TREE HAS MORE THAN ONE OUTGROUP:
$found_outgroup = 1;
goto FOUND_OUTGROUP;
}
else
{
# xxx NO OUTGROUP IN THE TREE - WE NEED TO FIX THIS.
print STDERR "ERROR: node1_above $node1_above_outgroup node2_above $node2_above_outgroup.\n";
exit;
}
if ($node1_is_outgroup == 1 || $node2_is_outgroup == 1) { $found_outgroup = 1; goto FOUND_OUTGROUP;}
}
elsif ($no_nodes == 3)
{
print STDERR "ERROR: 3 nodes below $node.\n"; exit;
}
else
{
# IF THERE IS NO OUTGROUP IN THE TREE - WE NEED TO FIX THIS xxx
print STDERR "ERROR: $no_nodes below $node.\n"; exit;
}
}
FOUND_OUTGROUP:
if ($found_orth == 0 && $sequence eq $mygene) { print "Did not find any orthologue for $sequence.\n";}
if ($i == $#sequences) { goto END;}
}
END:
}
#---------------------------------------------------------------------------------------------#
sub find_seqs_in_clade
{
my $node = $_[0];
my @temp = split(/\s+/,$node);
if ($#temp > 0) { $node = $temp[1];}
my @nodes;
my $no_nodes;
my $i;
my $j;
my @seqs;
my @new_nodes;
my $no_new_nodes;
my $added = 0;
@nodes = (@nodes,$node);
my %CHECKED = ();
# CHECK IF $node IS A SEQUENCE:
if (!($node =~ /NODE/))
{
@seqs = (@seqs,$node);
return @seqs;
}
FIND_SEQS_UNDER_NODES:
$no_nodes = $#nodes + 1;
$added = 0;
for ($i = 0; $i < $no_nodes; $i++)
{
# FIND ALL THE NODES THAT $nodes[$i] CONNECTS TO:
if ($CHECKED{$nodes[$i]}) { goto NEXT_NODEI;}
@new_nodes = find_connecting_nodes($nodes[$i]);
# GO THROUGH ALL THE NODES THAT CONNECT TO $nodes[$i]:
$no_new_nodes = $#new_nodes + 1;
for ($j = 0; $j < $no_new_nodes; $j++)
{
if (!($new_nodes[$j] =~ /NODE/))
{
@seqs = (@seqs,$new_nodes[$j]);
}
else
{
@nodes = (@nodes,$new_nodes[$j]);
$added = 1;
}
}
$CHECKED{$nodes[$i]} = 1;
NEXT_NODEI:
}
if ($added == 1) { goto FIND_SEQS_UNDER_NODES;}
return @seqs;
}
#---------------------------------------------------------------------------------------------#
# WE HAVE FOUND A COMMA IN THE NEWICK TREE. NOW LOOK TO THE RIGHT FOR THE NEAREST ) AND TO
# THE LEFT FOR THE NEAREST ( AND FIND THE REGION (CONTAINING ,) BRACKETED BY ().
sub find_bracketed_region
{
my $i = $_[0]; # POSITION OF THE COMMA OF INTEREST.
my $len = $_[1]; # LENGTH OF THE WHOLE TREE.
my $tree = $_[2];
my $j;
my $char;
my $bracket_start;
my $bracket_end;
my $bracket_len;
my $bracketed;
my $num_brackets;
my $x;
my $look_as_far;
my $bracket_num;
# THE INDEX OF THE COMMA IN THE NEWICK TREE IS CHARACTER $i.
#------------------------------------------------------------------------------------------#
# LOOK FOR THE INDEX OF THE FIRST ( BEFORE THE COMMA:
for ($j = $i; $j >= 0; $j--)
{
$char = substr($tree,$j,1);
if ($char eq '(' && !($USED_BRACKET{$j}))
{
$bracket_start = $j; # THE INDEX OF THE PREVIOUS ( BEFORE THE COMMA.
goto FOUND_STARTING_BRACKET;
}
}
FOUND_STARTING_BRACKET:
#------------------------------------------------------------------------------------------#
# LOOK FOR THE INDEX OF THE FIRST ) AFTER THE COMMA:
for ($j = $i; $j < $len; $j++)
{
$char = substr($tree,$j,1);
if ($char eq ')' && !($USED_BRACKET{$j}))
{
$bracket_end = $j; # THE INDEX OF THE NEXT ) AFTER THE COMMA.
goto FOUND_ENDING_BRACKET;
}
}
FOUND_ENDING_BRACKET:
#------------------------------------------------------------------------------------------#
# FIND THE LENGTH OF THE REGION ENCLOSED BY THE PARENTHESES ():
# (xxxx)
# 123456
# eg., $bracket_len = 6 - 1 + 1 = 6, ie., $bracket_len INCLUDES THE ().
$bracket_len = $bracket_end - $bracket_start + 1;
if ($bracket_len < 0)
{
goto COMMA_NOT_OKAY;
}
#------------------------------------------------------------------------------------------#
# FIND THE REGION ENCLOSED BY THE PARENTHESES:
$bracketed = substr($tree,$bracket_start,$bracket_len);
# COUNT THE NUMBER OF BRACKETS IN THE BRACKETED REGION: IF THERE ARE MORE )s THAN (s SOMETHING IS WRONG:
$num_brackets = 0;
for ($x = 0; $x < $bracket_len; $x++)
{
$char = substr($bracketed,$x,1);
if ($char eq '(') { $num_brackets++;}
elsif ($char eq ')') { $num_brackets--;}
if ($num_brackets < 0) # THERE ARE MORE )s THAN (s, SOMETHING IS WRONG.
{
goto COMMA_NOT_OKAY;
}
}
#------------------------------------------------------------------------------------------#
# THE BRACKETED REGION SHOULD NOT CONTAIN ANY BRACKETS THAT WE HAVE NOT ALREADY EXAMINED, AS
# WE WANT TO LOOK AT INTERNAL BIFURCATIONS, AND WORK UPWARDS TOWARDS THE ROOT OF THE TREE:
$look_as_far = $bracket_len - 1; # LOOK UP TO JUST BEFORE THE END ) BRACKET.
for ($x = 1; $x < $look_as_far; $x++)
{
$char = substr($bracketed,$x,1);
if ($char eq '(' || $char eq ')')
{
# FIND THE INDEX OF THIS BRACKET IN THE WHOLE TREE:
$bracket_num = $x + $bracket_start;
if (!($USED_BRACKET{$bracket_num})) # THERE IS AN INTERNAL BRACKET THAT HAS NOT YET BEEN EXAMINED.
{
goto COMMA_NOT_OKAY;
}
}
}
#------------------------------------------------------------------------------------------#
# IF THERE IS AN EVEN NUMBER OF BRACKETS AND WE HAVE NOT ALREADY GOT THIS BRACKETED REGION,
# THEN ADD THIS BRACKETED REGION INTO OUR ARRAY @bracketeds OF BRACKETED REGIONS:
if (!($HAVE_BRACKETED{$bracketed}) && $num_brackets == 0)
{
if ($PRINT_BRACKET_DETAILS == 1)
{
print "The bracketed bit is $bracketed.\n";
}
$USED_BRACKET{$bracket_start} = 1;
$USED_BRACKET{$bracket_end} = 1;
$USED_COMMA{$i} = 1;
$HAVE_BRACKETED{$bracketed} = 1;
(@bracketeds) = (@bracketeds,$bracketed);
}
COMMA_NOT_OKAY:
}
#---------------------------------------------------------------------------------------------#
# NIBBLE FORWARD AND BACK TO THE FIRST INSTANCES OF THE STRING $thechar.
sub nibble_to_characters
{
my $name = $_[0];
my $thechar = $_[1];
if ($name =~ /$thechar/)
{
$name =~ s/$thechar//g;
}
return $name;
}
#---------------------------------------------------------------------------------------------#
# TAKE OFF STARTING OR ENDING SINGLE CHARACTERS OF $thechar.
sub nibble_off_characters
{
my $name = $_[0];
my $thechar = $_[1];
my $len;
my $index;
my $x;
my $char;
$len = length($name);
# TAKE OFF DASHES AT THE START:
$index = 0;
for ($x = 0; $x < $len; $x++)
{
$char= substr($name,$x,1);
if ($char eq $thechar) { $index = $x + 1; }
else { goto FOUND_START;}
}
FOUND_START:
$name = substr($name,$index,$len-$index);
# TAKE OFF DASHES AT THE END:
$len = length($name);
$index = $len;
for ($x = $len-1; $x >= 0; $x--)
{
$char= substr($name,$x,1);
if ($char eq $thechar) { $index = $x-1; }
else { goto FOUND_END; }
}
FOUND_END:
$name = substr($name,0,$index);
return $name;
}
#---------------------------------------------------------------------------------------------#
# GO THROUGH EACH OF THE BIFURCATIONS AND SEE WHICH BRANCHES THEY SPLIT INTO.
# EACH BIFURCATION IS A NODE WHICH SPLITS TO TWO THINGS, EITHER TWO MORE
# NODES, OR ONE SEQUENCE AND ONE NODE, OR TWO SEQUENCES. EACH NODE OR SEQUENCE
# HAS A PARENT NODE FROM WHICH IT SPLIT OFF, AND EACH PARENT NODE HAS TWO CHILD NODES.
sub find_what_nodes_split_into
{
%SPLITS_INTO = (); # HASH TABLE TO RECORD WHICH NODES A NODE SPLITS INTO.
%PARENT = (); # HASH TABLE TO RECORD THE PARENT NODE OF EACH NODE.
%SISTER = (); # HASH TABLE TO RECORD THE SISTER NODE OF EACH NODE.
my %HAVE = ();
my %CNT = ();
%BRANCHLENGTH = (); # THE BRANCHLENGTH.
my $i;
my $num_bracketeds;
my $no_loops;
my $bracketed;
my @temp;
my $num_things;
my $first;
my $second;
my $branchlength;
my $last_letter;
my $first_one;
my $second_one;
my $inside_node;
my $inside;
my $x;
my $y;
my $start;
my $len;
my $j;
$num_bracketeds = $#bracketeds + 1;
$no_loops = 0;
for ($i = $num_bracketeds - 1; $i > -1; $i--)
{
if ($PRINT_FORK_DETAILS == 1) {print "=========================================== \n";}
$bracketed = $bracketeds[$i];
#---------------------------------------------------------------------------------------#
# SEE WHICH TWO THINGS THIS NODE SPLITS INTO:
RETRY:
$no_loops++;
if ($no_loops > 500) { print "ERROR: $tree_file: have looped $no_loops times now.\n"; exit;}
@temp = split(/,/,$bracketed);
$num_things = $#temp + 1;
if ($num_things == 2) # THIS NODE SPLITS UP INTO TWO SEQUENCES:
{
$first = $temp[0];
$second = $temp[1];
$first = nibble_off_characters($first,"(");
$first = nibble_off_characters($first,"_");
@temp = split(/:/,$first);
if ($temp[0] =~ /[A-Z]/ || $temp[0] =~ /[a-z]/) { $first = $temp[0]; $branchlength = $temp[1];}
elsif ($temp[1] =~ /[A-Z]/ || $temp[1] =~ /[a-z]/) { $first = $temp[1]; $branchlength = $temp[0];}
$first =~ tr/[a-z]/[A-Z]/;
$last_letter = substr($first,length($first)-1,1);
if ($last_letter =~ /[A-Z]/ && !($first =~ /_/)) { chop($first);}
@temp = split(/\s+/,$first);
if ($#temp > 0) { $first = $temp[1];}
# TAKE OFF ENDING CHARACTERS THAT ARE NOT NUMBERS:
if ($first =~ /NODE/)
{
$first = nibble_off_letters($first);
}
if ($HAVE{$first}) { if (!($CNT{$first})){$CNT{$first} = 2;} else { $CNT{$first}++;} $first = $first.$CNT{$first};}
$HAVE{$first}= 1;
$BRANCHLENGTH{$first}= $branchlength;
$second = nibble_off_characters($second,")");
$second = nibble_off_characters($second,"_");
@temp = split(/:/,$second);
if ($temp[0] =~ /[A-Z]/ || $temp[0] =~ /[a-z]/) { $second = $temp[0]; $branchlength = $temp[1];}
elsif ($temp[1] =~ /[A-Z]/ || $temp[1] =~ /[a-z]/) { $second = $temp[1]; $branchlength = $temp[0];}
$second =~ tr/[a-z]/[A-Z]/;
$last_letter = substr($second,length($second)-1,1);
if ($last_letter =~ /[A-Z]/ && !($second =~ /_/)) { chop($second);}
@temp = split(/\s+/,$second);
if ($#temp > 0) { $second = $temp[1];}
# TAKE OFF ENDING CHARACTERS THAT ARE NOT NUMBERS:
if ($second =~ /NODE/)
{
$second = nibble_off_letters($second);
}
if ($HAVE{$second}) { if (!($CNT{$second})){$CNT{$second} = 2;} else { $CNT{$second}++;} $second = $second.$CNT{$second};}
$HAVE{$second} = 1;
$BRANCHLENGTH{$second} = $branchlength;
$first_one = $first;
$second_one = $second;
$SPLITS_INTO{$i} = $first_one.",".$second_one; # NODE $i SPLITS INTO $first_one AND $second_one.
$PARENT{$first_one} = "NODE".$i;
$PARENT{$second_one} = "NODE".$i;
$SISTER{$first_one} = $second_one;
$SISTER{$second_one} = $first_one;
$BRANCHLENGTH{$first_one} = $BRANCHLENGTH{$first};
$BRANCHLENGTH{$second_one}= $BRANCHLENGTH{$second};
if ($PRINT_FORK_DETAILS == 1)
{
print "Node i $i ($PARENT{$first_one}) splits into $first_one ($BRANCHLENGTH{$first_one}) and $second_one ($BRANCHLENGTH{$second_one})\n";
}
#---------------------------------------------------------------------------------------------#
# CHECK WHETHER THE NODE SPLITS INTO SEQUENCES OR INTO MORE NODES:
if (!($first =~ /node/) && !($first =~ /NODE/))
{
@sequences = (@sequences,$first);
}
if (!($second =~ /node/) && !($second =~ /NODE/))
{
@sequences = (@sequences,$second);
}
} # END OF IF THIS NODE SPLITS UP INTO TWO SEQUENCES.
else # THERE ARE OTHER NODES INSIDE THIS NODE.
{
# CHECK IF ANY OTHER NODES ARE INSIDE THIS NODE, AND IF THERE ARE,
# DELETE THEM FROM THE OUTER BRACKETED REGION.
for ($j = $num_bracketeds-1; $j >=0; $j--)
{
if ($j != $i)
{
$inside_node= $bracketeds[$j];
# CHECK WHETHER $inside_node IS WITHIN $bracketed:
$inside = index($bracketed,$inside_node);
if ($inside != -1) # $inside_node IS WITHIN $bracketed:
{
# FIND THE LENGTH OF $inside_node (WE CANNOT USE THE length() FUNCTION AS BRACKETS CONFUSE IT):
$x = 0;
while(substr($inside_node,$x,1) ne '')
{
$x++;
}
# FIND THE LENGTH OF $bracketed:
$y = 0;
while(substr($bracketed,$y,1) ne '')
{
$y++;
}
# FIND THE START OF $inside_node IN $bracketed:
$start = $x + $inside;
# FIND THE LENGTH OF $inside_node IN $bracketed:
$len = $y - $x - ($inside - 1);
# MAKE $bracketed BE A SUBSTRING OF ITSELF WITH $inside_node DELETED:
$bracketed = substr($bracketed,0,$inside)."NODE$j".substr($bracketed,$start,$len);
}
}
}
goto RETRY;
} # END OF IF THERE ARE OTHER NODES INSIDE THIS NODE.
if ($i == 0) { goto JUMP_OUT_OF_LOOP;}
} # END OF LOOKING THROUGH EACH OF THE BIFURCATIONS TO SEE WHAT THEY SPLIT INTO.
JUMP_OUT_OF_LOOP:
}
#---------------------------------------------------------------------------------------------#
sub nibble_off_letters
{
# REMOVE ANY LETTERS AT THE END OF A STRING:
my $string = $_[0];
my $new = "";
my @temp;
my $i;
my $char;
@temp = split(/NODE/,$string);
$string = $temp[1];
$length = length($string);
for ($i = 1; $i <= $length; $i++)
{
$char = substr($string,$i-1,1);
if ($char =~ /[0-9]/)
{
$new = $new.$char;
}
else
{
$new = "NODE".$new;
return $new;
}
}
$new = "NODE".$new;
return $new;
}
#---------------------------------------------------------------------------------------------#
# READ IN THE NEWICK TREE:
sub read_newick_tree
{
my $tree_file = $_[0];
my $line;
my $tree;
open(TREE,"$tree_file") || die "Cannot open $tree_file";
$tree = '';
while(<TREE>)
{
$line = $_;
chomp($line);
$tree = $tree.$line;
}
close(TREE);
return $tree;
}
#---------------------------------------------------------------------------------------------#
# FIND THE BIFURCATIONS IN THE TREE:
sub find_tree_bifurcations
{
# THE TREE IS STORED IN A VARIABLE $tree:
my $tree = $_[0];
my $len;
my $char;
my $round;
%USED_BRACKET = (); # HASH TABLE TO SAY WHETHER WE HAVE EXAMINED A PARTICULAR BRACKET YET.
%USED_COMMA = (); # HASH TABLE TO SAY WHETHER WE HAVE EXAMINED A PARTICULAR COMMA YET.
%HAVE_BRACKETED = ();
my $i;
$len = length($tree);
$char = '';
$round = 0;
#------------------------------------------------------------------------------------------#
RESTART:
$round++;
if ($PRINT_BRACKET_DETAILS == 1)
{
print "\n======================================================\n";
print "ROUND $round of finding tree bifurcations:\n\n";
}
if ($round > 100) { print STDERR "ERROR: $tree_file: have reached round $round.\n\n"; exit;}
# GO THROUGH THE TREE TO FIND COMMAS, AS COMMAS ARE BIFURCATIONS IN THE TREE. WE JUST
# GO THROUGH THE TREE FROM LEFT TO RIGHT:
for ($i = 0; $i < $len; $i++)
{
$char = substr($tree,$i,1);
if ($char eq ',') # THERE IS A BIFURCATION HERE.
{
# LOOK TO THE RIGHT OF THE , FOR THE NEAREST ) AND TO THE LEFT OF THE
# , FOR THE NEAREST ( AND FIND THE REGION BRACKETED BY THESE BRACKETS, ie., (reg,ion), WHERE
# THE REGION CONTAINS THE COMMA:
if (!($USED_COMMA{$i})) # IF WE HAVE NOT FOUND THE REGION AROUND THIS COMMA SO FAR:
{
&find_bracketed_region($i,$len,$tree);
}
}
}
#..........................................................................................#
# NOW WE HAVE LOOKED THROUGH THE TREE FROM LEFT TO RIGHT LOOKING FOR COMMAS. WE MAY NOT
# HAVE FULLY EXAMINED ALL COMMAS YET HOWEVER, AS SOME MAY HAVE BEEN REJECTED IN THE
# SUBROUTINE find_bracketed_region. SO NOW WE SEARCH FOR ANY COMMAS (BIFURCATIONS) LEFT
# UNTIL ALL COMMAS HAVE BEEN ANALYSED:
for ($i = 0; $i < $len; $i++)
{
$char = substr($tree,$i,1);
if ($char eq ')' || $char eq '(')
{
if (!($USED_BRACKET{$i})) # THERE IS A BRACKET (AND SO A COMMA) WHICH HAS NOT BE ANALYSED YET.
{
goto RESTART;
}
}
}
#------------------------------------------------------------------------------------------#
}
#---------------------------------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment