Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Perl script to retrieve all multiple alignments for animal gene families from the TreeFam database
#!/usr/local/bin/perl
=head1 NAME
get_treefam_alns.pl
=head1 SYNOPSIS
get_treefam_alns.pl treefam_version output outputdir
=head1 DESCRIPTION
This script finds all the alignments from a particular version (<treefam_version>) of
the TreeFam mysql database. The alignments are written in output file <output>, with
"#END" between each pair of alignments.
=head1 VERSION
Perl script last edited 19-Oct-2012.
=head1 CONTACT
alc@sanger.ac.uk (Avril Coghlan)
=cut
#
# Perl script get_treefam_alns.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 19-Oct-12.
# Last edited 19-Oct-2012.
# SCRIPT SYNOPSIS: get_treefam_alns.pl: get the full alignments for all families from a particular version of TreeFam (<= version 8)
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
use strict;
use warnings;
use DBI;
my $num_args = $#ARGV + 1;
if ($num_args != 3)
{
print "Usage of get_treefam_alns.pl\n\n";
print "perl get_treefam_alns.pl <treefam_version> <output> <outputdir>\n";
print "where <treefam_version> is the version of TreeFam to use (eg. 7),\n";
print " <output> is the output file of alignments,\n";
print " <outputdir> is the directory for writing output files\n";
print "For example, >perl get_treefam_alns.pl 7 myalns\n";
print "/nfs/users/nfs_a/alc/Documents/StrongyloidesTreeFam\n";
exit;
}
# FIND THE TREEFAM VERSION TO USE:
my $treefam_version = $ARGV[0];
# FIND THE OUTPUT FILE NAME:
my $output = $ARGV[1];
# FIND THE DIRECTORY TO WRITE OUTPUT FILES TO:
my $outputdir = $ARGV[2];
#------------------------------------------------------------------#
# TEST SUBROUTINES:
my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING.
&test_print_error;
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
&run_main_program($treefam_version,$output,$outputdir);
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
sub run_main_program
{
my $version = $_[0]; # THE TREEFAM VERSION TO USE.
my $output = $_[1]; # THE OUTPUT FILE.
my $outputdir = $_[2]; # DIRECTORY TO WRITE OUTPUT FILES TO.
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR.
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR.
my $familiesA; # ARRAY OF FAMILIES OF TYPE 'A'
my $familiesB; # ARRAY OF FAMILIES OF TYPE 'B'
# GET A LIST OF ALL THE TREEFAM-A FAMILIES:
($familiesA,$errorcode,$errormsg) = &get_family_list($version,"A");
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# GET A LIST OF ALL THE TREEFAM-B FAMILIES:
($familiesB,$errorcode,$errormsg) = &get_family_list($version,"B");
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# RETRIEVE THE ALIGNMENTS FOR THE FAMILIES FROM THE DATABASE:
($errorcode,$errormsg) = &get_alns($familiesA,$familiesB,$version,$output,$outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
}
#------------------------------------------------------------------#
# RETRIEVE THE ALIGNMENTS FOR THE FAMILIES FROM THE DATABASE:
# SUBROUTINE SYNOPSIS: get_alns(): retrieve alignments for all TreeFam families from the database
sub get_alns
{
my $familiesA = $_[0]; # ARRAY OF TREEFAM-A FAMILIES
my $familiesB = $_[1]; # ARRAY OF TREEFAM-B FAMILIES
my $version = $_[2]; # VERSION OF TREEFAM TO USE
my $output = $_[3]; # OUTPUT FILE
my $outputdir = $_[4]; # DIRECTORY FOR WRITING OUTPUT FILES TO
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $num_familiesA; # NUMBER OF TREEFAM-A FAMILIES
my $num_familiesB; # NUMBER OF TREEFAM-B FAMILIES
my $i; #
my $family; # TREEFAM FAMILY
my $table_w; # TABLE THAT WE WANT FROM THE DATABASE
my $st; #
my $sth; #
my @array; #
my $rv; #
my $database; # TREEFAM DATABASE TO CONNECT TO
my $dbh; #
my $id; # IDENTIFIER FOR A GENE IN A TREEFAM FAMILY
my $disp_id; # DISPLAY IDENTIFIER FOR A GENE IN A TREEFAM FAMILY
my $cigar; # ALIGNMENT CIGAR FOR A GENE IN A TREEFAM FAMILY
# OPEN THE OUTPUT FILE:
open(OUTPUT,">$output") || die "ERROR: get_hmms: cannot open $output\n";
# CONNECT TO THE DATABASE AND GET ALL FAMILIES OF THIS TYPE:
$database = "treefam_".$version;
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return;
# GET THE ALIGNMENT FOR EACH FAMILY FROM THE DATABASE:
$table_w = "aa_full_align";
$num_familiesA = @$familiesA;
$num_familiesB = @$familiesB;
for ($i = 1; $i <= $num_familiesA; $i++)
{
$family = $familiesA->[($i-1)];
print OUTPUT "$family\n";
print STDERR "Getting alignment for $family...\n";
$st = "SELECT ID, DISP_ID, CIGAR from $table_w WHERE AC=\'$family\'";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$id = $array[0];
$disp_id = $array[1];
$cigar = $array[2];
print OUTPUT "$disp_id $cigar\n";
}
}
else
{
print STDERR "WARNING: no alignment for $family\n";
print OUTPUT "WARNING: no alignment for $family\n";
}
print OUTPUT "#END\n";
}
for ($i = 1; $i <= $num_familiesB; $i++)
{
$family = $familiesB->[($i-1)];
print OUTPUT "$family\n";
print STDERR "Getting alignment for $family...\n";
$st = "SELECT ID, DISP_ID, CIGAR from $table_w WHERE AC=\'$family\'";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$id = $array[0];
$disp_id = $array[1];
$cigar = $array[2];
print OUTPUT "$disp_id $cigar\n";
}
}
else
{
print STDERR "WARNING: no alignment for $family\n";
print OUTPUT "WARNING: no alignment for $family\n";
}
print OUTPUT "#END\n";
}
close(OUTPUT);
return($errorcode,$errormsg);
}
#------------------------------------------------------------------#
# GET A LIST OF ALL THE TREEFAM FAMILIES OF A PARTICULAR TYPE ("A" OR "B"):
# SUBROUTINE SYNOPSIS: get_family_list(): get a list of all TreeFam families of type A/B
sub get_family_list
{
my $version = $_[0]; # VERSION OF TREEFAM THAT WE WANT TO GET FAMILIES FOR, eg. 8
my $type = $_[1]; # TYPE OF FAMILY ("A" OR "B")
my $dbh; #
my $database; # VERSION OF THE TREEFAM DATABASE, eg. treefam_8
my $table_w; # THE DATABASE TABLE OF INTEREST
my $st; #
my $sth; #
my $rv; #
my @array; #
my $AC; # TREEFAM ACCESSION FOR A FAMILY
my @families; # ARRAY OF TREEFAM FAMILIES
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
if ($type ne 'A' && $type ne 'B')
{
$errormsg = "ERROR: get_family_list: type $type\n";
$errorcode = 1; # ERRORCODE=1
return(\@families,$errorcode,$errormsg);
}
$database = "treefam_".$version;
# CONNECT TO THE DATABASE AND GET ALL FAMILIES OF THIS TYPE:
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return(\@families,$errorcode,$errormsg);
# GET ALL THE FAMILIES OF TYPE $type:
if ($type eq 'A') { $table_w = "familyA"; }
elsif ($type eq 'B') { $table_w = "familyB";}
$st = "SELECT AC from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$AC = $array[0];
@families = (@families,$AC);
if ($#families % 100 == 0) { print STDERR "Read $#families families now (of type $type) ...\n"; }
}
}
$dbh->disconnect();
return(\@families,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &print_error
sub test_print_error
{
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR
($errormsg,$errorcode) = &print_error(45,45,1);
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message','My error message',1);
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;}
($errormsg,$errorcode) = &print_error('none',45,1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message', 0, 1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;}
}
#------------------------------------------------------------------#
# PRINT OUT AN ERROR MESSAGE AND EXIT.
sub print_error
{
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED.
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED.
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/)
{
if ($called_from_test == 1)
{
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n";
exit;
}
}
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/))
{
if ($called_from_test == 1)
{
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n";
exit;
}
}
if ($errormsg eq 'none' || $errorcode == 0)
{
if ($called_from_test == 1)
{
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n";
exit;
}
}
else
{
print STDERR "$errormsg";
exit;
}
return($errormsg,$errorcode);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment