Last active
October 12, 2015 15:07
-
-
Save avrilcoghlan/4045234 to your computer and use it in GitHub Desktop.
Perl script to retrieve all multiple alignments for animal gene families from the TreeFam database
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
=head1 NAME | |
get_treefam_alns.pl | |
=head1 SYNOPSIS | |
get_treefam_alns.pl treefam_version output outputdir | |
=head1 DESCRIPTION | |
This script finds all the alignments from a particular version (<treefam_version>) of | |
the TreeFam mysql database. The alignments are written in output file <output>, with | |
"#END" between each pair of alignments. | |
=head1 VERSION | |
Perl script last edited 19-Oct-2012. | |
=head1 CONTACT | |
alc@sanger.ac.uk (Avril Coghlan) | |
=cut | |
# | |
# Perl script get_treefam_alns.pl | |
# Written by Avril Coghlan (alc@sanger.ac.uk) | |
# 19-Oct-12. | |
# Last edited 19-Oct-2012. | |
# SCRIPT SYNOPSIS: get_treefam_alns.pl: get the full alignments for all families from a particular version of TreeFam (<= version 8) | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
use strict; | |
use warnings; | |
use DBI; | |
my $num_args = $#ARGV + 1; | |
if ($num_args != 3) | |
{ | |
print "Usage of get_treefam_alns.pl\n\n"; | |
print "perl get_treefam_alns.pl <treefam_version> <output> <outputdir>\n"; | |
print "where <treefam_version> is the version of TreeFam to use (eg. 7),\n"; | |
print " <output> is the output file of alignments,\n"; | |
print " <outputdir> is the directory for writing output files\n"; | |
print "For example, >perl get_treefam_alns.pl 7 myalns\n"; | |
print "/nfs/users/nfs_a/alc/Documents/StrongyloidesTreeFam\n"; | |
exit; | |
} | |
# FIND THE TREEFAM VERSION TO USE: | |
my $treefam_version = $ARGV[0]; | |
# FIND THE OUTPUT FILE NAME: | |
my $output = $ARGV[1]; | |
# FIND THE DIRECTORY TO WRITE OUTPUT FILES TO: | |
my $outputdir = $ARGV[2]; | |
#------------------------------------------------------------------# | |
# TEST SUBROUTINES: | |
my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING. | |
&test_print_error; | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
&run_main_program($treefam_version,$output,$outputdir); | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
sub run_main_program | |
{ | |
my $version = $_[0]; # THE TREEFAM VERSION TO USE. | |
my $output = $_[1]; # THE OUTPUT FILE. | |
my $outputdir = $_[2]; # DIRECTORY TO WRITE OUTPUT FILES TO. | |
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR. | |
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR. | |
my $familiesA; # ARRAY OF FAMILIES OF TYPE 'A' | |
my $familiesB; # ARRAY OF FAMILIES OF TYPE 'B' | |
# GET A LIST OF ALL THE TREEFAM-A FAMILIES: | |
($familiesA,$errorcode,$errormsg) = &get_family_list($version,"A"); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# GET A LIST OF ALL THE TREEFAM-B FAMILIES: | |
($familiesB,$errorcode,$errormsg) = &get_family_list($version,"B"); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# RETRIEVE THE ALIGNMENTS FOR THE FAMILIES FROM THE DATABASE: | |
($errorcode,$errormsg) = &get_alns($familiesA,$familiesB,$version,$output,$outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
} | |
#------------------------------------------------------------------# | |
# RETRIEVE THE ALIGNMENTS FOR THE FAMILIES FROM THE DATABASE: | |
# SUBROUTINE SYNOPSIS: get_alns(): retrieve alignments for all TreeFam families from the database | |
sub get_alns | |
{ | |
my $familiesA = $_[0]; # ARRAY OF TREEFAM-A FAMILIES | |
my $familiesB = $_[1]; # ARRAY OF TREEFAM-B FAMILIES | |
my $version = $_[2]; # VERSION OF TREEFAM TO USE | |
my $output = $_[3]; # OUTPUT FILE | |
my $outputdir = $_[4]; # DIRECTORY FOR WRITING OUTPUT FILES TO | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $num_familiesA; # NUMBER OF TREEFAM-A FAMILIES | |
my $num_familiesB; # NUMBER OF TREEFAM-B FAMILIES | |
my $i; # | |
my $family; # TREEFAM FAMILY | |
my $table_w; # TABLE THAT WE WANT FROM THE DATABASE | |
my $st; # | |
my $sth; # | |
my @array; # | |
my $rv; # | |
my $database; # TREEFAM DATABASE TO CONNECT TO | |
my $dbh; # | |
my $id; # IDENTIFIER FOR A GENE IN A TREEFAM FAMILY | |
my $disp_id; # DISPLAY IDENTIFIER FOR A GENE IN A TREEFAM FAMILY | |
my $cigar; # ALIGNMENT CIGAR FOR A GENE IN A TREEFAM FAMILY | |
# OPEN THE OUTPUT FILE: | |
open(OUTPUT,">$output") || die "ERROR: get_hmms: cannot open $output\n"; | |
# CONNECT TO THE DATABASE AND GET ALL FAMILIES OF THIS TYPE: | |
$database = "treefam_".$version; | |
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return; | |
# GET THE ALIGNMENT FOR EACH FAMILY FROM THE DATABASE: | |
$table_w = "aa_full_align"; | |
$num_familiesA = @$familiesA; | |
$num_familiesB = @$familiesB; | |
for ($i = 1; $i <= $num_familiesA; $i++) | |
{ | |
$family = $familiesA->[($i-1)]; | |
print OUTPUT "$family\n"; | |
print STDERR "Getting alignment for $family...\n"; | |
$st = "SELECT ID, DISP_ID, CIGAR from $table_w WHERE AC=\'$family\'"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$id = $array[0]; | |
$disp_id = $array[1]; | |
$cigar = $array[2]; | |
print OUTPUT "$disp_id $cigar\n"; | |
} | |
} | |
else | |
{ | |
print STDERR "WARNING: no alignment for $family\n"; | |
print OUTPUT "WARNING: no alignment for $family\n"; | |
} | |
print OUTPUT "#END\n"; | |
} | |
for ($i = 1; $i <= $num_familiesB; $i++) | |
{ | |
$family = $familiesB->[($i-1)]; | |
print OUTPUT "$family\n"; | |
print STDERR "Getting alignment for $family...\n"; | |
$st = "SELECT ID, DISP_ID, CIGAR from $table_w WHERE AC=\'$family\'"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$id = $array[0]; | |
$disp_id = $array[1]; | |
$cigar = $array[2]; | |
print OUTPUT "$disp_id $cigar\n"; | |
} | |
} | |
else | |
{ | |
print STDERR "WARNING: no alignment for $family\n"; | |
print OUTPUT "WARNING: no alignment for $family\n"; | |
} | |
print OUTPUT "#END\n"; | |
} | |
close(OUTPUT); | |
return($errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# GET A LIST OF ALL THE TREEFAM FAMILIES OF A PARTICULAR TYPE ("A" OR "B"): | |
# SUBROUTINE SYNOPSIS: get_family_list(): get a list of all TreeFam families of type A/B | |
sub get_family_list | |
{ | |
my $version = $_[0]; # VERSION OF TREEFAM THAT WE WANT TO GET FAMILIES FOR, eg. 8 | |
my $type = $_[1]; # TYPE OF FAMILY ("A" OR "B") | |
my $dbh; # | |
my $database; # VERSION OF THE TREEFAM DATABASE, eg. treefam_8 | |
my $table_w; # THE DATABASE TABLE OF INTEREST | |
my $st; # | |
my $sth; # | |
my $rv; # | |
my @array; # | |
my $AC; # TREEFAM ACCESSION FOR A FAMILY | |
my @families; # ARRAY OF TREEFAM FAMILIES | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
if ($type ne 'A' && $type ne 'B') | |
{ | |
$errormsg = "ERROR: get_family_list: type $type\n"; | |
$errorcode = 1; # ERRORCODE=1 | |
return(\@families,$errorcode,$errormsg); | |
} | |
$database = "treefam_".$version; | |
# CONNECT TO THE DATABASE AND GET ALL FAMILIES OF THIS TYPE: | |
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return(\@families,$errorcode,$errormsg); | |
# GET ALL THE FAMILIES OF TYPE $type: | |
if ($type eq 'A') { $table_w = "familyA"; } | |
elsif ($type eq 'B') { $table_w = "familyB";} | |
$st = "SELECT AC from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
@families = (@families,$AC); | |
if ($#families % 100 == 0) { print STDERR "Read $#families families now (of type $type) ...\n"; } | |
} | |
} | |
$dbh->disconnect(); | |
return(\@families,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &print_error | |
sub test_print_error | |
{ | |
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR | |
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR | |
($errormsg,$errorcode) = &print_error(45,45,1); | |
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;} | |
($errormsg,$errorcode) = &print_error('My error message','My error message',1); | |
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;} | |
($errormsg,$errorcode) = &print_error('none',45,1); | |
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;} | |
($errormsg,$errorcode) = &print_error('My error message', 0, 1); | |
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;} | |
} | |
#------------------------------------------------------------------# | |
# PRINT OUT AN ERROR MESSAGE AND EXIT. | |
sub print_error | |
{ | |
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED. | |
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED. | |
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT | |
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; | |
exit; | |
} | |
} | |
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/)) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; | |
exit; | |
} | |
} | |
if ($errormsg eq 'none' || $errorcode == 0) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; | |
exit; | |
} | |
} | |
else | |
{ | |
print STDERR "$errormsg"; | |
exit; | |
} | |
return($errormsg,$errorcode); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment