Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Last active October 12, 2015 15:07
Show Gist options
  • Save avrilcoghlan/4045234 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/4045234 to your computer and use it in GitHub Desktop.
Perl script to retrieve all multiple alignments for animal gene families from the TreeFam database
#!/usr/local/bin/perl
=head1 NAME
get_treefam_alns.pl
=head1 SYNOPSIS
get_treefam_alns.pl treefam_version output outputdir
=head1 DESCRIPTION
This script finds all the alignments from a particular version (<treefam_version>) of
the TreeFam mysql database. The alignments are written in output file <output>, with
"#END" between each pair of alignments.
=head1 VERSION
Perl script last edited 19-Oct-2012.
=head1 CONTACT
alc@sanger.ac.uk (Avril Coghlan)
=cut
#
# Perl script get_treefam_alns.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 19-Oct-12.
# Last edited 19-Oct-2012.
# SCRIPT SYNOPSIS: get_treefam_alns.pl: get the full alignments for all families from a particular version of TreeFam (<= version 8)
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
use strict;
use warnings;
use DBI;
my $num_args = $#ARGV + 1;
if ($num_args != 3)
{
print "Usage of get_treefam_alns.pl\n\n";
print "perl get_treefam_alns.pl <treefam_version> <output> <outputdir>\n";
print "where <treefam_version> is the version of TreeFam to use (eg. 7),\n";
print " <output> is the output file of alignments,\n";
print " <outputdir> is the directory for writing output files\n";
print "For example, >perl get_treefam_alns.pl 7 myalns\n";
print "/nfs/users/nfs_a/alc/Documents/StrongyloidesTreeFam\n";
exit;
}
# FIND THE TREEFAM VERSION TO USE:
my $treefam_version = $ARGV[0];
# FIND THE OUTPUT FILE NAME:
my $output = $ARGV[1];
# FIND THE DIRECTORY TO WRITE OUTPUT FILES TO:
my $outputdir = $ARGV[2];
#------------------------------------------------------------------#
# TEST SUBROUTINES:
my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING.
&test_print_error;
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
&run_main_program($treefam_version,$output,$outputdir);
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
sub run_main_program
{
my $version = $_[0]; # THE TREEFAM VERSION TO USE.
my $output = $_[1]; # THE OUTPUT FILE.
my $outputdir = $_[2]; # DIRECTORY TO WRITE OUTPUT FILES TO.
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR.
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR.
my $familiesA; # ARRAY OF FAMILIES OF TYPE 'A'
my $familiesB; # ARRAY OF FAMILIES OF TYPE 'B'
# GET A LIST OF ALL THE TREEFAM-A FAMILIES:
($familiesA,$errorcode,$errormsg) = &get_family_list($version,"A");
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# GET A LIST OF ALL THE TREEFAM-B FAMILIES:
($familiesB,$errorcode,$errormsg) = &get_family_list($version,"B");
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# RETRIEVE THE ALIGNMENTS FOR THE FAMILIES FROM THE DATABASE:
($errorcode,$errormsg) = &get_alns($familiesA,$familiesB,$version,$output,$outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
}
#------------------------------------------------------------------#
# RETRIEVE THE ALIGNMENTS FOR THE FAMILIES FROM THE DATABASE:
# SUBROUTINE SYNOPSIS: get_alns(): retrieve alignments for all TreeFam families from the database
sub get_alns
{
my $familiesA = $_[0]; # ARRAY OF TREEFAM-A FAMILIES
my $familiesB = $_[1]; # ARRAY OF TREEFAM-B FAMILIES
my $version = $_[2]; # VERSION OF TREEFAM TO USE
my $output = $_[3]; # OUTPUT FILE
my $outputdir = $_[4]; # DIRECTORY FOR WRITING OUTPUT FILES TO
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $num_familiesA; # NUMBER OF TREEFAM-A FAMILIES
my $num_familiesB; # NUMBER OF TREEFAM-B FAMILIES
my $i; #
my $family; # TREEFAM FAMILY
my $table_w; # TABLE THAT WE WANT FROM THE DATABASE
my $st; #
my $sth; #
my @array; #
my $rv; #
my $database; # TREEFAM DATABASE TO CONNECT TO
my $dbh; #
my $id; # IDENTIFIER FOR A GENE IN A TREEFAM FAMILY
my $disp_id; # DISPLAY IDENTIFIER FOR A GENE IN A TREEFAM FAMILY
my $cigar; # ALIGNMENT CIGAR FOR A GENE IN A TREEFAM FAMILY
# OPEN THE OUTPUT FILE:
open(OUTPUT,">$output") || die "ERROR: get_hmms: cannot open $output\n";
# CONNECT TO THE DATABASE AND GET ALL FAMILIES OF THIS TYPE:
$database = "treefam_".$version;
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return;
# GET THE ALIGNMENT FOR EACH FAMILY FROM THE DATABASE:
$table_w = "aa_full_align";
$num_familiesA = @$familiesA;
$num_familiesB = @$familiesB;
for ($i = 1; $i <= $num_familiesA; $i++)
{
$family = $familiesA->[($i-1)];
print OUTPUT "$family\n";
print STDERR "Getting alignment for $family...\n";
$st = "SELECT ID, DISP_ID, CIGAR from $table_w WHERE AC=\'$family\'";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$id = $array[0];
$disp_id = $array[1];
$cigar = $array[2];
print OUTPUT "$disp_id $cigar\n";
}
}
else
{
print STDERR "WARNING: no alignment for $family\n";
print OUTPUT "WARNING: no alignment for $family\n";
}
print OUTPUT "#END\n";
}
for ($i = 1; $i <= $num_familiesB; $i++)
{
$family = $familiesB->[($i-1)];
print OUTPUT "$family\n";
print STDERR "Getting alignment for $family...\n";
$st = "SELECT ID, DISP_ID, CIGAR from $table_w WHERE AC=\'$family\'";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$id = $array[0];
$disp_id = $array[1];
$cigar = $array[2];
print OUTPUT "$disp_id $cigar\n";
}
}
else
{
print STDERR "WARNING: no alignment for $family\n";
print OUTPUT "WARNING: no alignment for $family\n";
}
print OUTPUT "#END\n";
}
close(OUTPUT);
return($errorcode,$errormsg);
}
#------------------------------------------------------------------#
# GET A LIST OF ALL THE TREEFAM FAMILIES OF A PARTICULAR TYPE ("A" OR "B"):
# SUBROUTINE SYNOPSIS: get_family_list(): get a list of all TreeFam families of type A/B
sub get_family_list
{
my $version = $_[0]; # VERSION OF TREEFAM THAT WE WANT TO GET FAMILIES FOR, eg. 8
my $type = $_[1]; # TYPE OF FAMILY ("A" OR "B")
my $dbh; #
my $database; # VERSION OF THE TREEFAM DATABASE, eg. treefam_8
my $table_w; # THE DATABASE TABLE OF INTEREST
my $st; #
my $sth; #
my $rv; #
my @array; #
my $AC; # TREEFAM ACCESSION FOR A FAMILY
my @families; # ARRAY OF TREEFAM FAMILIES
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
if ($type ne 'A' && $type ne 'B')
{
$errormsg = "ERROR: get_family_list: type $type\n";
$errorcode = 1; # ERRORCODE=1
return(\@families,$errorcode,$errormsg);
}
$database = "treefam_".$version;
# CONNECT TO THE DATABASE AND GET ALL FAMILIES OF THIS TYPE:
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return(\@families,$errorcode,$errormsg);
# GET ALL THE FAMILIES OF TYPE $type:
if ($type eq 'A') { $table_w = "familyA"; }
elsif ($type eq 'B') { $table_w = "familyB";}
$st = "SELECT AC from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$AC = $array[0];
@families = (@families,$AC);
if ($#families % 100 == 0) { print STDERR "Read $#families families now (of type $type) ...\n"; }
}
}
$dbh->disconnect();
return(\@families,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &print_error
sub test_print_error
{
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR
($errormsg,$errorcode) = &print_error(45,45,1);
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message','My error message',1);
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;}
($errormsg,$errorcode) = &print_error('none',45,1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message', 0, 1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;}
}
#------------------------------------------------------------------#
# PRINT OUT AN ERROR MESSAGE AND EXIT.
sub print_error
{
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED.
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED.
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/)
{
if ($called_from_test == 1)
{
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n";
exit;
}
}
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/))
{
if ($called_from_test == 1)
{
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n";
exit;
}
}
if ($errormsg eq 'none' || $errorcode == 0)
{
if ($called_from_test == 1)
{
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n";
exit;
}
}
else
{
print STDERR "$errormsg";
exit;
}
return($errormsg,$errorcode);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment