Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 16:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5065796 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5065796 to your computer and use it in GitHub Desktop.
Perl script that connects to the TreeFam mysql database, and parses the trees using Bioperl.
#!/usr/local/bin/perl
#
# Perl script parse_treefam_bioperl.pl
# Written by Avril Coghlan (alc@sanger.ac.uk).
# 23-Feb-06.
#
# For the TreeFam project.
#
# This perl script connects to the MYSQL database of TreeFam families and
# parses the trees using Bioperl.
#
# The command-line format is:
# % perl <parse_treefam_bioperl.pl> tree_type db
# where tree_type is the type of tree to look at (CLEAN/FULL/SEED/all),
# db is the TreeFam database to look at (A/B/all).
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 2)
{
print "Usage of parse_treefam_bioperl.pl <tree_type> <db>\n\n";
print "where <tree_type> is the type of tree to look at (CLEAN/FULL/SEED/all),\n";
print " <db> is the TreeFam database to look at (A/B/all).\n";
print "perl -w parse_treefam_bioperl.pl\n";
print "For example, >perl -w parse_treefam_bioperl.pl SEED A\n";
exit;
}
# DECLARE MYSQL USERNAME AND HOST:
use DBI;
use lib "/nfs/disk100/wormpub2/Avril/downloadnov05/bioperl/bioperl-1.5.1/";
use Bio::TreeIO;
use IO::String;
# FIND THE TYPE OF TREE TO LOOK AT:
$tree_type = $ARGV[0];
if ($tree_type ne 'CLEAN' && $tree_type ne 'FULL' && $tree_type ne 'SEED' && $tree_type ne 'all')
{
print STDERR "ERROR: tree_type should be CLEAN/FULL/SEED/all.\n";
exit;
}
# FIND THE TREEFAM DATABASE TO LOOK AT:
$db = $ARGV[1];
if ($db ne 'A' && $db ne 'B')
{
print STDERR "ERROR: db should be A/B/all.\n";
exit;
}
#------------------------------------------------------------------#
# GET THE TREEFAM-A TREES FROM THE MYSQL DATABASE:
$dbh = DBI->connect("dbi:mysql:treefam_2:db.treefam.org:3308", 'anonymous', '') || return;
$table_w = 'trees';
# THIS TABLE HAS THE ACCESSION NUMBER, TYPE OF TREE (SEED/FULL/WHOLE/CLEAN) and the TREE ITSELF (NEWICK FORMAT).
$st = "SELECT AC, TYPE, TREE from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array) {
$AC = $array[0]; # eg., TF101001
$TYPE = $array[1]; # eg., SEED
$TREE = $array[2]; # THE TREE IN NEWICK FORMAT.
# NOW READ IN THE NEWICK FORMAT TREE:
if ($AC eq '') { print STDERR "ERROR: AC is $AC.\n"; exit;}
if ($TYPE eq '') { print STDERR "ERROR: TYPE is $TYPE.\n"; exit;}
# CHECK THAT THIS IS THE TYPE OF TREE (CLEAN/FULL/SEED/all) THAT WE WANT TO LOOK AT:
if ($TYPE eq $tree_type || $tree_type eq 'all')
{
# CHECK THAT THIS TREE COMES FROM THE TREEFAM DATABASE THAT WE WANT TO LOOK AT (A/B/all):
if ((substr($AC,0,3) eq 'TF1' && $db eq 'A') || (substr($AC,0,3) eq 'TF3' && $db eq 'B') || $db eq 'all')
{
# CHECK THE NHX FORMAT OF THE TREE IS FINE:
$TREE = &check_nhx_format($TREE,$AC,$TYPE);
# PARSE THE TREE USING THE BIOPERL TREE PARSER:
if ($TREE ne 'none') # IF THE FORMAT OF THE TREE IS OK.
{
$parsed = &read_tree($TREE,$AC,$TYPE);
if ($parsed == 1)
{
print STDERR "Tree $AC $TYPE parsed correctly...\n";
}
}
}
}
}
}
$rc = $dbh->disconnect();
$rc = "";
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# SUBROUTINE TO CHECK THE NHX FORMAT OF THE TREE, TO SEE IF IT WILL
# BE ABLE TO BE READ BY THE BIOPERL TREE PARSER. SPECIFICALLY, THIS
# CHECKS TO SEE IF THE TREE CONTAINS ANY INCORRECT SQUARE BRACKETS, ie.
# [&&NHX] (WITHOUT ANY TAG INFORMATION).
# 23-FEB-06.
sub check_nhx_format
{
my $tree = $_[0]; #> THE PHYLOGENETIC TREE.
my $AC = $_[1]; #> THE TREEFAM ACCESSION NUMBER.
my $TYPE = $_[2]; #> THE TYPE OF TREEFAM TREE.
my $length = length($tree); #> LENGTH OF $tree.
my $i; #>
my $take = 1; #>
my $char; #> A CHARACTER IN $tree.
my $bracket = ""; #>
my $new_tree = ""; #>
my $seen_semicolon = 0; #> SAYS WHETHER THERE IS A ; AT THE END OF THE TREE.
# LOOK AT THE TREE $tree, CHARACTER-BY-CHARACTER:
for ($i = 0; $i < $length; $i++)
{
$char = substr($tree,$i,1);
# CHECK WHETHER THIS IS A SEMICOLON:
if ($char eq ';') { $seen_semicolon = 1;}
# RECORD THE CHARACTERS WITHIN SQUARE BRACKETS:
if ($take == 0 || $char eq '[' || $char eq ']') { $bracket = $bracket.$char;}
# CHECK IF THIS IS THE START OR END OF A REGION WITHIN SQUARE BRACKETS:
if ($char eq '[') # THE START OF A NHX TAG.
{
$take = 0;
}
elsif ($char eq ']') # THE END OF A NHX TAG.
{
$take = 1;
# CHECK IF THERE IS TAG INFORMATION WITHIN THE BRACKETED REGION:
if ($bracket eq '[&&NHX]')
{
$bracket = '[&&NHX:S=Hello]';
}
$new_tree = $new_tree.$bracket;
$bracket = "";
}
# CHECK IF THIS IS A REGION OUTSIDE SQUARE BRACKETS:
if ($take == 1 && $char ne ']')
{
$new_tree = $new_tree.$char;
}
}
# CHECK WHETHER THE TREE ENDED EARLY:
if ($seen_semicolon == 0)
{
print STDERR "ERROR: tree $AC $TYPE is truncated.\n";
print "ERROR: tree $AC $TYPE is truncated.\n";
return "none";
}
return $new_tree;
}
#------------------------------------------------------------------#
# SUBROUTINE TO READ IN THE NEWICK FORMAT TREE, AND PARSE IT USING BIOPERL.
# 23-FEB-06.
sub read_tree
{
my $tree = $_[0]; #> THE TREE IN NEWICK FORMAT.
my $AC = $_[1]; #> THE ACCESSION NUMBER FOR THE TREEFAM FAMILY.
my $TYPE = $_[2]; #> THE TYPE OF TREE (SEED/FULL/CLEAN).
my $treefh; #>
my $input; #>
my $new_tree = ''; #>
my $rootnode; #>
my $parent; #>
my %PARENT = (); #> HASH TABLE OF THE PARENTS OF NODES.
my %SPLITS_INTO = (); #> HASH TABLE OF THE DAUGHTERS OF NODES.
my %SEQ = (); #> HASH TABLE OF SEQUENCE NAMES FOR LEAVES OF THE TREE.
my $node; #>
my @nodes; #>
my $seq; #>
my $PRINT_FORK_DETAILS = 0; #> IF 1, PRINT DETAILS OF THE BRANCHING OF THE TREE.
my $nodes; #>
my $i; #>
# PARSE THE TREE USING BIOPERL. THE TREE IS IN EXTENDED NEWICK FORMAT.
$treefh = new IO::String($tree);
$input = new Bio::TreeIO(-fh => $treefh, -format => "nhx");
eval { $new_tree = $input->next_tree };
if ($@)
{
print STDERR "ERROR: not able to parse tree $AC $TYPE.\n";
print "ERROR: not able to parse tree $AC $TYPE.\n";
return 0;
}
if ($new_tree eq '')
{
print STDERR "ERROR: not able to parse tree $AC $TYPE.\n";
print "ERROR: not able to parse tree $AC $TYPE.\n";
return 0;
}
# GET THE ROOT NODE OF THE TREE:
$rootnode = $new_tree->get_root_node;
# GET ALL THE INTERNAL NODES IN THE TREE, AND RECORD THEIR PARENT NODES IN THE HASH
# TABLE %PARENT, AND THEIR DESCENDANTS IN THE HASH TABLE %SPLITS_INTO:
@nodes = $new_tree->get_nodes();
foreach $node(@nodes)
{
if ($node ne $rootnode)
{
$parent = $node->ancestor;
# RECORD THAT NODE $parent IS THE PARENT NODE OF $node:
$PARENT{$node} = $parent;
# RECORD THAT $parent HAS $node AS A DESCENDANT:
if (!($SPLITS_INTO{$parent})) { $SPLITS_INTO{$parent} = $node; }
else { $SPLITS_INTO{$parent} = $SPLITS_INTO{$parent}.",".$node;}
# IF THIS IS A LEAF, CHECK WHETHER THIS IS AN OUTGROUP SEQUENCE:
if ($node->is_Leaf)
{
$seq = $node->id();
$SEQ{$node} = $seq;
}
}
}
# SEE IF WE WANT TO PRINT OUT DETAILS OF THE BRANCHING OF THE TREE:
if ($PRINT_FORK_DETAILS == 1)
{
foreach $parent (keys %SPLITS_INTO)
{
print "$parent splits into ";
$nodes = $SPLITS_INTO{$parent};
@nodes = split(/\,/,$nodes);
for ($i = 0; $i <= $#nodes; $i++)
{
$node = $nodes[$i];
if ($SEQ{$node}) { print "$SEQ{$node}, ";}
else { print "$node, "; }
}
print "\n";
}
}
return 1;
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment