Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Last active October 12, 2015 15:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/4045251 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/4045251 to your computer and use it in GitHub Desktop.
Perl script to translate TreeFam cigar-format alignments to fasta-format alignments
#!/usr/local/bin/perl
=head1 NAME
translate_treefam_cigars_to_alns.pl
=head1 SYNOPSIS
translate_treefam_cigars_to_alns.pl treefam_version output outputdir cigars alntype start_with
=head1 DESCRIPTION
This script reads in a file with cigar-format TreeFam alignments (<cigars>), for
a particular version of the TreeFam database, and translates these into fasta
format alignments. The fasta-format alignments are written in the output file
<output>, with "#END" between each pair of alignments. The argument <alntype>
says whether "full" or "seed" alignments should be used. If the <start_with>
argument is not "none", then <start_with> should be the family to start with (used
when a job has crashed, to restart at this family).
=head1 VERSION
Perl script last edited 22-Oct-2012.
=head1 CONTACT
alc@sanger.ac.uk (Avril Coghlan)
=cut
#
# Perl script translate_treefam_cigars_to_alns.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 22-Oct-12.
# Last edited 22-Oct-2012.
# SCRIPT SYNOPSIS: translate_treefam_cigars_to_alns.pl: reads a file of cigar-format TreeFam alignments, and converts them to fasta format
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
use strict;
use warnings;
use DBI;
my $num_args = $#ARGV + 1;
if ($num_args != 6)
{
print "Usage of translate_treefam_cigars_to_alns.pl\n\n";
print "perl translate_treefam_cigars_to_alns.pl <treefam_version> <output> <outputdir> <cigars> <alntype> <start_with>\n";
print "where <treefam_version> is the version of TreeFam to use (eg. 7),\n";
print " <output> is the output file of alignments,\n";
print " <outputdir> is the directory for writing output files,\n";
print " <cigars> is the file with cigar-format alignments,\n";
print " <start_with> is the family to start with ('none' or a family name)\n";
print "For example, >perl translate_treefam_cigars_to_alns.pl 7 myalns\n";
print "/nfs/users/nfs_a/alc/Documents/StrongyloidesTreeFam alignments full TF105184\n";
exit;
}
# FIND THE TREEFAM VERSION TO USE:
my $treefam_version = $ARGV[0];
# FIND THE OUTPUT FILE NAME:
my $output = $ARGV[1];
# FIND THE DIRECTORY TO WRITE OUTPUT FILES TO:
my $outputdir = $ARGV[2];
# FIND THE FILE WITH CIGAR-FORMAT ALIGNMENTS:
my $cigars = $ARGV[3];
# FIND THE TYPE OF ALIGNMENT TO USE:
my $alntype = $ARGV[4];
# FIND THE FAMILY TO START WITH:
my $start_with = $ARGV[5];
# TEST SUBROUTINES:
my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING.
my $PRINT_WARNINGS = 0; # SAYS WHETHER TO PRINT WARNINGS ABOUT MISSING DATA.
&test_print_error;
&test_get_fasta_aln_for_seq;
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
&run_main_program($treefam_version,$output,$outputdir,$cigars,$alntype,$start_with);
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
sub run_main_program
{
my $version = $_[0]; # THE TREEFAM VERSION TO USE.
my $output = $_[1]; # THE OUTPUT FILE.
my $outputdir = $_[2]; # DIRECTORY TO WRITE OUTPUT FILES TO.
my $cigars = $_[3]; # THE FILE WITH TREEFAM CIGAR-FORMAT ALIGNMENTS.
my $alntype = $_[4]; # FIND THE TYPE OF ALIGNMENT TO USE
my $start_with = $_[5]; # FAMILY TO START WITH
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR.
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR.
# READ IN THE CIGAR-FORMAT ALIGNMENTS FOR THE FAMILIES:
($errorcode,$errormsg) = &read_cigar_alignments($cigars,$version,$output,$outputdir,$alntype,$start_with);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
}
#------------------------------------------------------------------#
# READ IN THE CIGAR-FORMAT ALIGNMENTS FOR THE FAMILIES:
sub read_cigar_alignments
{
my $cigars = $_[0]; # THE FILE WITH THE CIGAR-FORMAT ALIGNMENTS
my $version = $_[1]; # VERSION OF THE TREEFAM DATABASE TO USE
my $output = $_[2]; # OUTPUT FILE TO WRITE TO
my $outputdir = $_[3]; # OUTPUT DIRECTORY TO WRITE FILES TO
my $alntype = $_[4]; # TYPE OF ALIGNMENT TO USE (FULL/SEED)
my $start_with = $_[5]; # FAMILY TO START WITH
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $line; #
my @temp; #
my $family = "none";# THE TREEFAM FAMILY
my @seqs; # NAMES OF SEQUENCES IN A TREEFAM FAMILY
my $seq; # NAME OF A SEQUENCE IN A FAMILY
my $cigar; # CIGAR STRING FOR A SEQUENCE
my %CIGAR = (); # HASH TABLE TO RECORD CIGAR STRINGS FOR SEQUENCES
my $mfa; # FASTA FORMAT ALIGNMENT
my $SEQ; # HASH TABLE FOR STORING SEQUENCES OF PROTEINS IN A FAMILY
my $ID; # HASH TABLE FOR STORING IDs OF PROTEINS IN A FAMILY
my %DISCARD = (); # RECORDS WHICH SEQUENCES TO DISCARD FROM A FAMILY
my $mfa_line_length = "none";# LENGTH OF THE ENTRIES IN THE MULTIPLE ALIGNMENT
my $found_start_with = 0; # SAYS WHETHER WE'VE SEEN FAMILY $start_with ALREADY
# OPEN THE OUTPUT FILE:
$output = $outputdir."/".$output;
open(OUTPUT,">$output") || die "ERROR: read_cigar_alignments: cannot open $output\n";
# READ IN THE CIGAR-FORMAT ALIGNMENTS:
open(CIGARS,"$cigars") || die "ERROR: read_cigar_alignments: cannot open $cigars\n";
while(<CIGARS>)
{
$line = $_;
chomp $line;
@temp = split(/\s+/,$line);
if (substr($line,0,2) eq 'TF' && $#temp == 0)
{
$family = $temp[0];
if ($family eq $start_with || $start_with eq 'none') { $found_start_with = 1;}
if ($found_start_with == 0) { goto NEXT_FAMILY;}
print STDERR "Looking at family $family...\n";
%CIGAR = ();
%DISCARD = ();
$mfa_line_length = "none";
undef(@seqs);
print OUTPUT "$family\n";
}
elsif ($#temp == 1 && $found_start_with == 1)
{
$seq = $temp[0];
$cigar = $temp[1];
@seqs = (@seqs,$seq);
if ($CIGAR{$seq})
{
# IN SOME CASES TWO SEQUENCES WERE STORED UNDER THE SAME NAME eg. SpPLK1_STRPU in TF101089 HAS
# TWO ENTRIES, BUT THEY SHOULD BE SpPLK1_F2_STRPU AND SpPLK1_STRPU. WE CAN'T TELL THEM APART,
# SO DISCARD BOTH
$DISCARD{$seq} = 1;
}
else
{
$CIGAR{$seq} = $cigar;
}
}
elsif (substr($line,0,4) eq '#END' && $#temp == 0 && $found_start_with == 1)
{
# GET THE IDENTIFIERS FOR THE SEQUENCES IN THE FAMILY:
($ID,$errorcode,$errormsg) = &get_ids_for_family($family,$version,\%DISCARD,$alntype);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# GET THE SEQUENCES FOR THE PROTEINS IN THE FAMILY:
($SEQ,$errorcode,$errormsg) = &get_seqs_for_family(\@seqs,$version,$ID,\%DISCARD,$alntype);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# CONVERT THE CIGAR FORMAT ALIGNMENT TO A FASTA-FORMAT ALIGNMENT:
($mfa,$errorcode,$errormsg) = &convert_cigar_to_fasta(\%CIGAR,$SEQ,\@seqs,\%DISCARD,$alntype);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
if ($mfa eq 'none')
{
print STDERR "WARNING: could not get alignment for family $family\n";
}
else
{
if ($mfa_line_length eq 'none') { $mfa_line_length = length($mfa);}
else
{
if ($mfa_line_length != length($mfa))
{
$errormsg = "ERROR: read_cigar_alignments: length of $mfa is not expected length for family $family\n";
$errorcode= 16; # ERRORCODE=16
return($errorcode,$errormsg);
}
}
# PRINT THE MULTIPLE ALIGNMENT TO THE OUTPUT FILE:
print OUTPUT "$mfa";
print OUTPUT "#END\n";
}
}
NEXT_FAMILY:
}
close(CIGARS);
close(OUTPUT);
return($errorcode,$errormsg);
}
#------------------------------------------------------------------#
# CONVERT THE CIGAR FORMAT ALIGNMENT TO A FASTA-FORMAT ALIGNMENT:
# SUBROUTINE SYNOPSIS: convert_cigar_to_fasta(): converts a cigar-format alignment to fasta format
sub convert_cigar_to_fasta
{
my $CIGAR = $_[0]; # THE HASH TABLE WITH THE CIGAR ALIGNMENT FOR EACH SEQUENCE
my $SEQ = $_[1]; # THE HASH TABLE WITH THE SEQUENCE FOR EACH GENE
my $seqs = $_[2]; # ARRAY WITH SEQUENCES IN THE FAMILY
my $DISCARD = $_[3]; # HASH TABLE WITH SEQUENCES TO DISCARD FROM THE FAMILY
my $alntype = $_[4]; # TYPE OF ALIGNMENT BEING USED (SEED/FULL)
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS "none" IF THERE IS NO ERROR
my $mfa = ""; # FASTA FORMAT ALIGNMENT FOR THE FAMILY
my $no_seqs; # NUMBER OF SEQUENCES IN THE FAMILY
my $seq; # A SEQUENCE IN THE FAMILY
my $i; #
my $sequence; # PROTEIN SEQUENCE FOR $seq
my $cigar; # CIGAR FORMAT ALIGNMENT FOR $seq
my $fasta; # FASTA FORMAT ALIGNMENT FOR $seq
$no_seqs = @$seqs;
for ($i = 1; $i <= $no_seqs; $i++)
{
$seq = $seqs->[($i-1)];
if (!($DISCARD->{$seq}))
{
# GET THE SEQUENCE FOR $seq:
if (!($SEQ->{$seq}))
{
if ($alntype eq 'full') # WE ARE USING THE FULL ALIGNMENTS SO SHOULD HAVE SEQUENCES FOR ALL GENES
{
$errormsg = "ERROR: convert_cigar_to_fasta: do not know sequence for $seq\n";
$errorcode = 9; # ERRORCODE=9
return($mfa,$errorcode,$errormsg);
}
elsif ($alntype eq 'seed') # WE ARE USING THE SEED ALIGNMENTS SO MAY BE MISSING SEQUENCES FOR SOME GENES
{
if ($PRINT_WARNINGS == 1) { print STDERR "WARNING: convert_cigar_to_fasta: do not know sequence for $seq in seed alignment\n"; }
}
}
else
{
$sequence = $SEQ->{$seq};
# GET THE CIGAR FOR $seq:
if (!($CIGAR->{$seq}))
{
$errormsg = "ERROR: convert_cigar_to_fasta: do not know cigar for $seq\n";
$errorcode = 10; # ERRORCODE=10
return($mfa,$errorcode,$errormsg)
}
$cigar = $CIGAR->{$seq};
# CONVERT THE CIGAR FOR $seq TO A FASTA FORMAT ALIGNMENT:
($fasta,$errorcode,$errormsg) = &get_fasta_aln_for_seq($sequence,$cigar);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
if ($fasta eq 'none')
{
$mfa = "none";
return($mfa,$errorcode,$errormsg);
}
# ADD THE FASTA FORMAT ALIGNMENT FOR $seq TO $mfa:
$mfa = $mfa.">$seq\n";
$mfa = $mfa."$fasta\n";
}
}
}
return($mfa,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &get_fasta_aln_for_seq
sub test_get_fasta_aln_for_seq
{
my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR
my $fasta; # FASTA FORMAT ALIGNMENT FOR A SEQUENCE
($fasta,$errorcode,$errormsg) = &get_fasta_aln_for_seq("ABCDEF","3D6M");
if ($fasta ne '---ABCDEF' || $errorcode != 0) { print STDERR "ERROR: test_get_fasta_aln_for_seq: failed test1\n";}
($fasta,$errorcode,$errormsg) = &get_fasta_aln_for_seq("ABCDEF","1D6M1D");
if ($fasta ne '-ABCDEF-' || $errorcode != 0) { print STDERR "ERROR: test_get_fasta_aln_for_seq: failed test2\n";}
($fasta,$errorcode,$errormsg) = &get_fasta_aln_for_seq("ABCDEF","3M3D3M");
if ($fasta ne 'ABC---DEF' || $errorcode != 0) { print STDERR "ERROR: test_get_fasta_aln_for_seq: failed test3\n";}
($fasta,$errorcode,$errormsg) = &get_fasta_aln_for_seq("ABCDEF","3M3I3M");
if ($errorcode != 14) { print STDERR "ERROR: test_get_fasta_aln_for_seq: failed test4\n";}
($fasta,$errorcode,$errormsg) = &get_fasta_aln_for_seq("ABCDEF","3M");
if ($errorcode != 0 || $fasta ne 'none') { print STDERR "ERROR: test_get_fasta_aln_for_seq: failed test5\n";}
($fasta,$errorcode,$errormsg) = &get_fasta_aln_for_seq("ABCDEF","20M");
if ($errorcode != 0 || $fasta ne 'none') { print STDERR "ERROR: test_get_fasta_aln_for_seq: failed test6\n";}
}
#------------------------------------------------------------------#
# CONVERT THE CIGAR FOR A SEQUENCE TO A FASTA FORMAT ALIGNMENT:
sub get_fasta_aln_for_seq
{
my $sequence = $_[0]; # PROTEIN SEQUENCE FOR A TREEFAM GENE
my $cigar = $_[1]; # CIGAR FORMAT ALIGNMENT FOR THE SEQUENCE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS "none" IF THERE IS NO ERROR
my $fasta = ""; # FASTA FORMAT ALIGNMENT FOR THE SEQUENCE
my @tmp; #
my @tmp2; #
my $i; #
my $cigar_type; # TYPE OF ELEMENT IN THE CIGAR STRING
my $cigar_len; # LENGTH OF ELEMENT IN THE CIGAR STRING
my $j; #
my $sequence_index; # INDEX IN THE SEQUENCE $sequence
my $aa; # AN AMINO ACID IN SEQUENCE $sequence
my $sequence_length; # LENGTH OF $sequence
my $last_letter; # LAST LETTER OF $sequence
# REMOVE * FROM THE END OF $sequence:
$last_letter = substr($sequence,length($sequence)-1,1);
if ($last_letter eq '*') { chop($sequence);}
# REMOVE . FROM THE END OF $sequence (OCCURS FOR SOME SPECIES):
$last_letter = substr($sequence,length($sequence)-1,1);
if ($last_letter eq '.') { chop($sequence);}
@tmp = split(/\d+/,$cigar);
@tmp2 = split(/\D+/,$cigar);
# LOOK AT EACH ELEMENT IN THE CIGAR STRING
$sequence_index = -1;
$sequence_length = length($sequence);
for ($i = 0; $i <= $#tmp2; $i++)
{
$cigar_type = $tmp[($i+1)];
$cigar_len = $tmp2[$i];
if ($cigar_type eq 'D')
{
for ($j = 1; $j <= $cigar_len; $j++)
{
$fasta = $fasta."-";
}
}
elsif ($cigar_type eq 'M')
{
for ($j = 1; $j <= $cigar_len; $j++)
{
$aa = "*";
while($aa eq '*' || $aa eq '.') # IGNORE *s OR .s IN THE SEQUENCE (SOME SEQUENCES HAVE INTERNAL '*' OR '.'):
{
$sequence_index++;
if ($sequence_index > $sequence_length - 1)
{
$fasta = "none";
print STDERR "WARNING: get_fasta_aln_for_seq: sequence_length $sequence_length but sequence_index $sequence_index (sequence $sequence, cigar $cigar)\n";
return($fasta,$errorcode,$errormsg);
}
$aa = substr($sequence,$sequence_index,1);
}
$fasta = $fasta.$aa;
}
}
else
{
$errormsg = "ERROR: get_fasta_aln_for_seq: cigar_type $cigar_type\n";
$errorcode = 14; # ERRORCODE=14
return($fasta,$errorcode,$errormsg);
}
}
if ($sequence_index != $sequence_length - 1) # THE CIGAR ALIGNMENT AND SEQUENCE ARE NOT COMPATIBLE:
{
$fasta = 'none';
print STDERR "WARNING: get_fasta_aln_for_seq: sequence_length $sequence_length but sequence_index $sequence_index (sequence $sequence, cigar $cigar)\n";
return($fasta,$errorcode,$errormsg);
}
return($fasta,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# GET THE IDENTIFIERS FOR THE SEQUENCES IN THE FAMILY:
# SUBROUTINE SYNOPSIS: get_ids_for_family(): get the IDs for the sequences in a TreeFam family
sub get_ids_for_family
{
my $family = $_[0]; # TREEFAM FAMILY IDENTIFIER
my $version = $_[1]; # VERSION OF THE TREEFAM DATABASE TO USE
my $DISCARD = $_[2]; # HASH TABLE OF SEQUENCES TO DISCARD
my $alntype = $_[3]; # TYPE OF ALIGNMENTS TO USE (SEED/FULL)
my %ID = (); # HASH TABLE TO STORE IDENTIFIERS FOR SEQUENCES
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $database; # DATABASE TO CONNECT TO
my $dbh; #
my $table_w; # DATABASE TABLE THAT WE WANT TO LOOK AT
my $st; #
my $sth; #
my $rv; #
my @array; #
my $id; # TAXONOMY ID.
my $disp_id; # DISPLAY ID.
# CONNECT TO THE DATABASE:
$database = "treefam_".$version;
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return;
# CHECK THAT WE ARE USING EITHER SEED OR FULL ALIGNMENTS:
if ($alntype ne 'seed' && $alntype ne 'full')
{
$errormsg = "ERROR: get_ids_for_family: alntype $alntype\n";
$errorcode = 16; # ERRORCODE=16
$dbh->disconnect();
return(\%ID,$errorcode,$errormsg);
}
# GET THE SEQUENCES FOR THEFAMILY FROM THE DATABASE:
if ($alntype eq 'full')
{
$table_w = "aa_full_align";
}
else
{
$table_w = "aa_seed_align";
}
$st = "SELECT ID, DISP_ID from $table_w WHERE AC=\'$family\'";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$id = $array[0];
$disp_id = $array[1];
if ($ID{$disp_id})
{
$errormsg = "ERROR: get_ids_for_family: already have id for $disp_id\n";
$errorcode = 5; # ERRORCODE=5
$dbh->disconnect();
return(\%ID,$errorcode,$errormsg);
}
if (!($DISCARD->{$disp_id}))
{
$ID{$disp_id} = $id;
}
}
}
else
{
print STDERR "WARNING: get_ids_for_family: no sequences found for family $family\n";
$dbh->disconnect();
return(\%ID,$errorcode,$errormsg);
}
$dbh->disconnect();
return(\%ID,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# GET THE SEQUENCES FOR THE PROTEINS IN THE FAMILY:
# SUBROUTINE SYNOPSIS: get_seqs_for_family(): get the sequences for the proteins in a TreeFam family
sub get_seqs_for_family
{
my $seqs = $_[0]; # POINTER TO ARRAY OF SEQUENCES IN A FAMILY
my $version = $_[1]; # VERSION OF THE TREEFAM DATABASE TO USE
my $ID = $_[2]; # HASH TABLE OF IDENTIFIERS FOR TREEFAM SEQUENCES IN THE FAMILY
my $DISCARD = $_[3]; # HASH TABLE OF SEQUENCES TO DISCARD
my $alntype = $_[4]; # TYPE OF ALIGNMENT TO USE (SEED/FULL)
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my %SEQ = (); # HASH TABLE FOR STORING SEQUENCES OF PROTEINS IN A FAMILY
my $no_seqs; # NUMBER OF SEQUENCES IN A FAMILY
my $seq; # SEQUENCE IN A FAMILY
my $database; # DATABASE THAT WE WANT TO CONNECT TO
my $dbh; #
my $table_w; # DATABASE TABLE THAT WE ARE INTERESTED IN
my $i; #
my $st; #
my $sth; #
my $rv; #
my @array; #
my $id; # TREEFAM IDENTIFIER FOR A SEQUENCE
my $sequence; # PROTEIN SEQUENCE FOR A GENE.
# CONNECT TO THE DATABASE AND GET ALL FAMILIES OF THIS TYPE:
$database = "treefam_".$version;
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return;
# GET THE ALIGNMENT FOR EACH FAMILY FROM THE DATABASE:
$table_w = "aa_seq";
$no_seqs = @$seqs;
for ($i = 1; $i <= $no_seqs; $i++)
{
$seq = $seqs->[($i-1)];
if (!($DISCARD->{$seq}))
{
if (!($ID->{$seq}))
{
$errormsg = "ERROR: get_seqs_for_family: do not know id for $seq\n";
$errorcode = 6; # ERRORCODE=6
$dbh->disconnect();
return(\%SEQ,$errorcode,$errormsg);
}
$id = $ID->{$seq};
$st = "SELECT SEQ from $table_w WHERE ID=\'$id\'";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$sequence = $array[0];
if ($SEQ{$seq})
{
$errormsg = "ERROR: get_seqs_for_family: already have sequence for $seq\n";
$errorcode= 7; # ERRORCODE=7
$dbh->disconnect();
return(\%SEQ,$errorcode,$errormsg);
}
$SEQ{$seq} = $sequence;
}
}
else
{
if ($alntype eq 'full') # WE ARE USING THE FULL ALIGNMENT SO SHOULD KNOW SEQUENCES OF ALL GENES
{
$errormsg = "ERROR: get_seqs_for_family: do not know sequence for $seq\n";
$errorcode = 8; # ERRORCODE=8
$dbh->disconnect();
return(\%SEQ,$errorcode,$errormsg);
}
else # WE ARE USING THE SEED ALIGNMENT SO MAY BE MISSING SEQUENCES OF SOME GENES
{
if ($PRINT_WARNINGS == 1) { print STDERR "WARNING: get_seqs_for_family: do not know sequence for $seq from seed alignment\n"; }
}
}
}
}
$dbh->disconnect();
return(\%SEQ,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &print_error
sub test_print_error
{
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR
($errormsg,$errorcode) = &print_error(45,45,1);
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message','My error message',1);
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;}
($errormsg,$errorcode) = &print_error('none',45,1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message', 0, 1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;}
}
#------------------------------------------------------------------#
# PRINT OUT AN ERROR MESSAGE AND EXIT.
sub print_error
{
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED.
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED.
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/)
{
if ($called_from_test == 1)
{
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n";
exit;
}
}
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/))
{
if ($called_from_test == 1)
{
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n";
exit;
}
}
if ($errormsg eq 'none' || $errorcode == 0)
{
if ($called_from_test == 1)
{
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n";
exit;
}
}
else
{
print STDERR "$errormsg";
exit;
}
return($errormsg,$errorcode);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment