Last active
October 12, 2015 15:08
-
-
Save avrilcoghlan/4045251 to your computer and use it in GitHub Desktop.
Perl script to translate TreeFam cigar-format alignments to fasta-format alignments
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
=head1 NAME | |
translate_treefam_cigars_to_alns.pl | |
=head1 SYNOPSIS | |
translate_treefam_cigars_to_alns.pl treefam_version output outputdir cigars alntype start_with | |
=head1 DESCRIPTION | |
This script reads in a file with cigar-format TreeFam alignments (<cigars>), for | |
a particular version of the TreeFam database, and translates these into fasta | |
format alignments. The fasta-format alignments are written in the output file | |
<output>, with "#END" between each pair of alignments. The argument <alntype> | |
says whether "full" or "seed" alignments should be used. If the <start_with> | |
argument is not "none", then <start_with> should be the family to start with (used | |
when a job has crashed, to restart at this family). | |
=head1 VERSION | |
Perl script last edited 22-Oct-2012. | |
=head1 CONTACT | |
alc@sanger.ac.uk (Avril Coghlan) | |
=cut | |
# | |
# Perl script translate_treefam_cigars_to_alns.pl | |
# Written by Avril Coghlan (alc@sanger.ac.uk) | |
# 22-Oct-12. | |
# Last edited 22-Oct-2012. | |
# SCRIPT SYNOPSIS: translate_treefam_cigars_to_alns.pl: reads a file of cigar-format TreeFam alignments, and converts them to fasta format | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
use strict; | |
use warnings; | |
use DBI; | |
my $num_args = $#ARGV + 1; | |
if ($num_args != 6) | |
{ | |
print "Usage of translate_treefam_cigars_to_alns.pl\n\n"; | |
print "perl translate_treefam_cigars_to_alns.pl <treefam_version> <output> <outputdir> <cigars> <alntype> <start_with>\n"; | |
print "where <treefam_version> is the version of TreeFam to use (eg. 7),\n"; | |
print " <output> is the output file of alignments,\n"; | |
print " <outputdir> is the directory for writing output files,\n"; | |
print " <cigars> is the file with cigar-format alignments,\n"; | |
print " <start_with> is the family to start with ('none' or a family name)\n"; | |
print "For example, >perl translate_treefam_cigars_to_alns.pl 7 myalns\n"; | |
print "/nfs/users/nfs_a/alc/Documents/StrongyloidesTreeFam alignments full TF105184\n"; | |
exit; | |
} | |
# FIND THE TREEFAM VERSION TO USE: | |
my $treefam_version = $ARGV[0]; | |
# FIND THE OUTPUT FILE NAME: | |
my $output = $ARGV[1]; | |
# FIND THE DIRECTORY TO WRITE OUTPUT FILES TO: | |
my $outputdir = $ARGV[2]; | |
# FIND THE FILE WITH CIGAR-FORMAT ALIGNMENTS: | |
my $cigars = $ARGV[3]; | |
# FIND THE TYPE OF ALIGNMENT TO USE: | |
my $alntype = $ARGV[4]; | |
# FIND THE FAMILY TO START WITH: | |
my $start_with = $ARGV[5]; | |
# TEST SUBROUTINES: | |
my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING. | |
my $PRINT_WARNINGS = 0; # SAYS WHETHER TO PRINT WARNINGS ABOUT MISSING DATA. | |
&test_print_error; | |
&test_get_fasta_aln_for_seq; | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
&run_main_program($treefam_version,$output,$outputdir,$cigars,$alntype,$start_with); | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
sub run_main_program | |
{ | |
my $version = $_[0]; # THE TREEFAM VERSION TO USE. | |
my $output = $_[1]; # THE OUTPUT FILE. | |
my $outputdir = $_[2]; # DIRECTORY TO WRITE OUTPUT FILES TO. | |
my $cigars = $_[3]; # THE FILE WITH TREEFAM CIGAR-FORMAT ALIGNMENTS. | |
my $alntype = $_[4]; # FIND THE TYPE OF ALIGNMENT TO USE | |
my $start_with = $_[5]; # FAMILY TO START WITH | |
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR. | |
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR. | |
# READ IN THE CIGAR-FORMAT ALIGNMENTS FOR THE FAMILIES: | |
($errorcode,$errormsg) = &read_cigar_alignments($cigars,$version,$output,$outputdir,$alntype,$start_with); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
} | |
#------------------------------------------------------------------# | |
# READ IN THE CIGAR-FORMAT ALIGNMENTS FOR THE FAMILIES: | |
sub read_cigar_alignments | |
{ | |
my $cigars = $_[0]; # THE FILE WITH THE CIGAR-FORMAT ALIGNMENTS | |
my $version = $_[1]; # VERSION OF THE TREEFAM DATABASE TO USE | |
my $output = $_[2]; # OUTPUT FILE TO WRITE TO | |
my $outputdir = $_[3]; # OUTPUT DIRECTORY TO WRITE FILES TO | |
my $alntype = $_[4]; # TYPE OF ALIGNMENT TO USE (FULL/SEED) | |
my $start_with = $_[5]; # FAMILY TO START WITH | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $line; # | |
my @temp; # | |
my $family = "none";# THE TREEFAM FAMILY | |
my @seqs; # NAMES OF SEQUENCES IN A TREEFAM FAMILY | |
my $seq; # NAME OF A SEQUENCE IN A FAMILY | |
my $cigar; # CIGAR STRING FOR A SEQUENCE | |
my %CIGAR = (); # HASH TABLE TO RECORD CIGAR STRINGS FOR SEQUENCES | |
my $mfa; # FASTA FORMAT ALIGNMENT | |
my $SEQ; # HASH TABLE FOR STORING SEQUENCES OF PROTEINS IN A FAMILY | |
my $ID; # HASH TABLE FOR STORING IDs OF PROTEINS IN A FAMILY | |
my %DISCARD = (); # RECORDS WHICH SEQUENCES TO DISCARD FROM A FAMILY | |
my $mfa_line_length = "none";# LENGTH OF THE ENTRIES IN THE MULTIPLE ALIGNMENT | |
my $found_start_with = 0; # SAYS WHETHER WE'VE SEEN FAMILY $start_with ALREADY | |
# OPEN THE OUTPUT FILE: | |
$output = $outputdir."/".$output; | |
open(OUTPUT,">$output") || die "ERROR: read_cigar_alignments: cannot open $output\n"; | |
# READ IN THE CIGAR-FORMAT ALIGNMENTS: | |
open(CIGARS,"$cigars") || die "ERROR: read_cigar_alignments: cannot open $cigars\n"; | |
while(<CIGARS>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\s+/,$line); | |
if (substr($line,0,2) eq 'TF' && $#temp == 0) | |
{ | |
$family = $temp[0]; | |
if ($family eq $start_with || $start_with eq 'none') { $found_start_with = 1;} | |
if ($found_start_with == 0) { goto NEXT_FAMILY;} | |
print STDERR "Looking at family $family...\n"; | |
%CIGAR = (); | |
%DISCARD = (); | |
$mfa_line_length = "none"; | |
undef(@seqs); | |
print OUTPUT "$family\n"; | |
} | |
elsif ($#temp == 1 && $found_start_with == 1) | |
{ | |
$seq = $temp[0]; | |
$cigar = $temp[1]; | |
@seqs = (@seqs,$seq); | |
if ($CIGAR{$seq}) | |
{ | |
# IN SOME CASES TWO SEQUENCES WERE STORED UNDER THE SAME NAME eg. SpPLK1_STRPU in TF101089 HAS | |
# TWO ENTRIES, BUT THEY SHOULD BE SpPLK1_F2_STRPU AND SpPLK1_STRPU. WE CAN'T TELL THEM APART, | |
# SO DISCARD BOTH | |
$DISCARD{$seq} = 1; | |
} | |
else | |
{ | |
$CIGAR{$seq} = $cigar; | |
} | |
} | |
elsif (substr($line,0,4) eq '#END' && $#temp == 0 && $found_start_with == 1) | |
{ | |
# GET THE IDENTIFIERS FOR THE SEQUENCES IN THE FAMILY: | |
($ID,$errorcode,$errormsg) = &get_ids_for_family($family,$version,\%DISCARD,$alntype); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# GET THE SEQUENCES FOR THE PROTEINS IN THE FAMILY: | |
($SEQ,$errorcode,$errormsg) = &get_seqs_for_family(\@seqs,$version,$ID,\%DISCARD,$alntype); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# CONVERT THE CIGAR FORMAT ALIGNMENT TO A FASTA-FORMAT ALIGNMENT: | |
($mfa,$errorcode,$errormsg) = &convert_cigar_to_fasta(\%CIGAR,$SEQ,\@seqs,\%DISCARD,$alntype); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
if ($mfa eq 'none') | |
{ | |
print STDERR "WARNING: could not get alignment for family $family\n"; | |
} | |
else | |
{ | |
if ($mfa_line_length eq 'none') { $mfa_line_length = length($mfa);} | |
else | |
{ | |
if ($mfa_line_length != length($mfa)) | |
{ | |
$errormsg = "ERROR: read_cigar_alignments: length of $mfa is not expected length for family $family\n"; | |
$errorcode= 16; # ERRORCODE=16 | |
return($errorcode,$errormsg); | |
} | |
} | |
# PRINT THE MULTIPLE ALIGNMENT TO THE OUTPUT FILE: | |
print OUTPUT "$mfa"; | |
print OUTPUT "#END\n"; | |
} | |
} | |
NEXT_FAMILY: | |
} | |
close(CIGARS); | |
close(OUTPUT); | |
return($errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# CONVERT THE CIGAR FORMAT ALIGNMENT TO A FASTA-FORMAT ALIGNMENT: | |
# SUBROUTINE SYNOPSIS: convert_cigar_to_fasta(): converts a cigar-format alignment to fasta format | |
sub convert_cigar_to_fasta | |
{ | |
my $CIGAR = $_[0]; # THE HASH TABLE WITH THE CIGAR ALIGNMENT FOR EACH SEQUENCE | |
my $SEQ = $_[1]; # THE HASH TABLE WITH THE SEQUENCE FOR EACH GENE | |
my $seqs = $_[2]; # ARRAY WITH SEQUENCES IN THE FAMILY | |
my $DISCARD = $_[3]; # HASH TABLE WITH SEQUENCES TO DISCARD FROM THE FAMILY | |
my $alntype = $_[4]; # TYPE OF ALIGNMENT BEING USED (SEED/FULL) | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS "none" IF THERE IS NO ERROR | |
my $mfa = ""; # FASTA FORMAT ALIGNMENT FOR THE FAMILY | |
my $no_seqs; # NUMBER OF SEQUENCES IN THE FAMILY | |
my $seq; # A SEQUENCE IN THE FAMILY | |
my $i; # | |
my $sequence; # PROTEIN SEQUENCE FOR $seq | |
my $cigar; # CIGAR FORMAT ALIGNMENT FOR $seq | |
my $fasta; # FASTA FORMAT ALIGNMENT FOR $seq | |
$no_seqs = @$seqs; | |
for ($i = 1; $i <= $no_seqs; $i++) | |
{ | |
$seq = $seqs->[($i-1)]; | |
if (!($DISCARD->{$seq})) | |
{ | |
# GET THE SEQUENCE FOR $seq: | |
if (!($SEQ->{$seq})) | |
{ | |
if ($alntype eq 'full') # WE ARE USING THE FULL ALIGNMENTS SO SHOULD HAVE SEQUENCES FOR ALL GENES | |
{ | |
$errormsg = "ERROR: convert_cigar_to_fasta: do not know sequence for $seq\n"; | |
$errorcode = 9; # ERRORCODE=9 | |
return($mfa,$errorcode,$errormsg); | |
} | |
elsif ($alntype eq 'seed') # WE ARE USING THE SEED ALIGNMENTS SO MAY BE MISSING SEQUENCES FOR SOME GENES | |
{ | |
if ($PRINT_WARNINGS == 1) { print STDERR "WARNING: convert_cigar_to_fasta: do not know sequence for $seq in seed alignment\n"; } | |
} | |
} | |
else | |
{ | |
$sequence = $SEQ->{$seq}; | |
# GET THE CIGAR FOR $seq: | |
if (!($CIGAR->{$seq})) | |
{ | |
$errormsg = "ERROR: convert_cigar_to_fasta: do not know cigar for $seq\n"; | |
$errorcode = 10; # ERRORCODE=10 | |
return($mfa,$errorcode,$errormsg) | |
} | |
$cigar = $CIGAR->{$seq}; | |
# CONVERT THE CIGAR FOR $seq TO A FASTA FORMAT ALIGNMENT: | |
($fasta,$errorcode,$errormsg) = &get_fasta_aln_for_seq($sequence,$cigar); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
if ($fasta eq 'none') | |
{ | |
$mfa = "none"; | |
return($mfa,$errorcode,$errormsg); | |
} | |
# ADD THE FASTA FORMAT ALIGNMENT FOR $seq TO $mfa: | |
$mfa = $mfa.">$seq\n"; | |
$mfa = $mfa."$fasta\n"; | |
} | |
} | |
} | |
return($mfa,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &get_fasta_aln_for_seq | |
sub test_get_fasta_aln_for_seq | |
{ | |
my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR | |
my $fasta; # FASTA FORMAT ALIGNMENT FOR A SEQUENCE | |
($fasta,$errorcode,$errormsg) = &get_fasta_aln_for_seq("ABCDEF","3D6M"); | |
if ($fasta ne '---ABCDEF' || $errorcode != 0) { print STDERR "ERROR: test_get_fasta_aln_for_seq: failed test1\n";} | |
($fasta,$errorcode,$errormsg) = &get_fasta_aln_for_seq("ABCDEF","1D6M1D"); | |
if ($fasta ne '-ABCDEF-' || $errorcode != 0) { print STDERR "ERROR: test_get_fasta_aln_for_seq: failed test2\n";} | |
($fasta,$errorcode,$errormsg) = &get_fasta_aln_for_seq("ABCDEF","3M3D3M"); | |
if ($fasta ne 'ABC---DEF' || $errorcode != 0) { print STDERR "ERROR: test_get_fasta_aln_for_seq: failed test3\n";} | |
($fasta,$errorcode,$errormsg) = &get_fasta_aln_for_seq("ABCDEF","3M3I3M"); | |
if ($errorcode != 14) { print STDERR "ERROR: test_get_fasta_aln_for_seq: failed test4\n";} | |
($fasta,$errorcode,$errormsg) = &get_fasta_aln_for_seq("ABCDEF","3M"); | |
if ($errorcode != 0 || $fasta ne 'none') { print STDERR "ERROR: test_get_fasta_aln_for_seq: failed test5\n";} | |
($fasta,$errorcode,$errormsg) = &get_fasta_aln_for_seq("ABCDEF","20M"); | |
if ($errorcode != 0 || $fasta ne 'none') { print STDERR "ERROR: test_get_fasta_aln_for_seq: failed test6\n";} | |
} | |
#------------------------------------------------------------------# | |
# CONVERT THE CIGAR FOR A SEQUENCE TO A FASTA FORMAT ALIGNMENT: | |
sub get_fasta_aln_for_seq | |
{ | |
my $sequence = $_[0]; # PROTEIN SEQUENCE FOR A TREEFAM GENE | |
my $cigar = $_[1]; # CIGAR FORMAT ALIGNMENT FOR THE SEQUENCE | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS "none" IF THERE IS NO ERROR | |
my $fasta = ""; # FASTA FORMAT ALIGNMENT FOR THE SEQUENCE | |
my @tmp; # | |
my @tmp2; # | |
my $i; # | |
my $cigar_type; # TYPE OF ELEMENT IN THE CIGAR STRING | |
my $cigar_len; # LENGTH OF ELEMENT IN THE CIGAR STRING | |
my $j; # | |
my $sequence_index; # INDEX IN THE SEQUENCE $sequence | |
my $aa; # AN AMINO ACID IN SEQUENCE $sequence | |
my $sequence_length; # LENGTH OF $sequence | |
my $last_letter; # LAST LETTER OF $sequence | |
# REMOVE * FROM THE END OF $sequence: | |
$last_letter = substr($sequence,length($sequence)-1,1); | |
if ($last_letter eq '*') { chop($sequence);} | |
# REMOVE . FROM THE END OF $sequence (OCCURS FOR SOME SPECIES): | |
$last_letter = substr($sequence,length($sequence)-1,1); | |
if ($last_letter eq '.') { chop($sequence);} | |
@tmp = split(/\d+/,$cigar); | |
@tmp2 = split(/\D+/,$cigar); | |
# LOOK AT EACH ELEMENT IN THE CIGAR STRING | |
$sequence_index = -1; | |
$sequence_length = length($sequence); | |
for ($i = 0; $i <= $#tmp2; $i++) | |
{ | |
$cigar_type = $tmp[($i+1)]; | |
$cigar_len = $tmp2[$i]; | |
if ($cigar_type eq 'D') | |
{ | |
for ($j = 1; $j <= $cigar_len; $j++) | |
{ | |
$fasta = $fasta."-"; | |
} | |
} | |
elsif ($cigar_type eq 'M') | |
{ | |
for ($j = 1; $j <= $cigar_len; $j++) | |
{ | |
$aa = "*"; | |
while($aa eq '*' || $aa eq '.') # IGNORE *s OR .s IN THE SEQUENCE (SOME SEQUENCES HAVE INTERNAL '*' OR '.'): | |
{ | |
$sequence_index++; | |
if ($sequence_index > $sequence_length - 1) | |
{ | |
$fasta = "none"; | |
print STDERR "WARNING: get_fasta_aln_for_seq: sequence_length $sequence_length but sequence_index $sequence_index (sequence $sequence, cigar $cigar)\n"; | |
return($fasta,$errorcode,$errormsg); | |
} | |
$aa = substr($sequence,$sequence_index,1); | |
} | |
$fasta = $fasta.$aa; | |
} | |
} | |
else | |
{ | |
$errormsg = "ERROR: get_fasta_aln_for_seq: cigar_type $cigar_type\n"; | |
$errorcode = 14; # ERRORCODE=14 | |
return($fasta,$errorcode,$errormsg); | |
} | |
} | |
if ($sequence_index != $sequence_length - 1) # THE CIGAR ALIGNMENT AND SEQUENCE ARE NOT COMPATIBLE: | |
{ | |
$fasta = 'none'; | |
print STDERR "WARNING: get_fasta_aln_for_seq: sequence_length $sequence_length but sequence_index $sequence_index (sequence $sequence, cigar $cigar)\n"; | |
return($fasta,$errorcode,$errormsg); | |
} | |
return($fasta,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# GET THE IDENTIFIERS FOR THE SEQUENCES IN THE FAMILY: | |
# SUBROUTINE SYNOPSIS: get_ids_for_family(): get the IDs for the sequences in a TreeFam family | |
sub get_ids_for_family | |
{ | |
my $family = $_[0]; # TREEFAM FAMILY IDENTIFIER | |
my $version = $_[1]; # VERSION OF THE TREEFAM DATABASE TO USE | |
my $DISCARD = $_[2]; # HASH TABLE OF SEQUENCES TO DISCARD | |
my $alntype = $_[3]; # TYPE OF ALIGNMENTS TO USE (SEED/FULL) | |
my %ID = (); # HASH TABLE TO STORE IDENTIFIERS FOR SEQUENCES | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $database; # DATABASE TO CONNECT TO | |
my $dbh; # | |
my $table_w; # DATABASE TABLE THAT WE WANT TO LOOK AT | |
my $st; # | |
my $sth; # | |
my $rv; # | |
my @array; # | |
my $id; # TAXONOMY ID. | |
my $disp_id; # DISPLAY ID. | |
# CONNECT TO THE DATABASE: | |
$database = "treefam_".$version; | |
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return; | |
# CHECK THAT WE ARE USING EITHER SEED OR FULL ALIGNMENTS: | |
if ($alntype ne 'seed' && $alntype ne 'full') | |
{ | |
$errormsg = "ERROR: get_ids_for_family: alntype $alntype\n"; | |
$errorcode = 16; # ERRORCODE=16 | |
$dbh->disconnect(); | |
return(\%ID,$errorcode,$errormsg); | |
} | |
# GET THE SEQUENCES FOR THEFAMILY FROM THE DATABASE: | |
if ($alntype eq 'full') | |
{ | |
$table_w = "aa_full_align"; | |
} | |
else | |
{ | |
$table_w = "aa_seed_align"; | |
} | |
$st = "SELECT ID, DISP_ID from $table_w WHERE AC=\'$family\'"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$id = $array[0]; | |
$disp_id = $array[1]; | |
if ($ID{$disp_id}) | |
{ | |
$errormsg = "ERROR: get_ids_for_family: already have id for $disp_id\n"; | |
$errorcode = 5; # ERRORCODE=5 | |
$dbh->disconnect(); | |
return(\%ID,$errorcode,$errormsg); | |
} | |
if (!($DISCARD->{$disp_id})) | |
{ | |
$ID{$disp_id} = $id; | |
} | |
} | |
} | |
else | |
{ | |
print STDERR "WARNING: get_ids_for_family: no sequences found for family $family\n"; | |
$dbh->disconnect(); | |
return(\%ID,$errorcode,$errormsg); | |
} | |
$dbh->disconnect(); | |
return(\%ID,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# GET THE SEQUENCES FOR THE PROTEINS IN THE FAMILY: | |
# SUBROUTINE SYNOPSIS: get_seqs_for_family(): get the sequences for the proteins in a TreeFam family | |
sub get_seqs_for_family | |
{ | |
my $seqs = $_[0]; # POINTER TO ARRAY OF SEQUENCES IN A FAMILY | |
my $version = $_[1]; # VERSION OF THE TREEFAM DATABASE TO USE | |
my $ID = $_[2]; # HASH TABLE OF IDENTIFIERS FOR TREEFAM SEQUENCES IN THE FAMILY | |
my $DISCARD = $_[3]; # HASH TABLE OF SEQUENCES TO DISCARD | |
my $alntype = $_[4]; # TYPE OF ALIGNMENT TO USE (SEED/FULL) | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my %SEQ = (); # HASH TABLE FOR STORING SEQUENCES OF PROTEINS IN A FAMILY | |
my $no_seqs; # NUMBER OF SEQUENCES IN A FAMILY | |
my $seq; # SEQUENCE IN A FAMILY | |
my $database; # DATABASE THAT WE WANT TO CONNECT TO | |
my $dbh; # | |
my $table_w; # DATABASE TABLE THAT WE ARE INTERESTED IN | |
my $i; # | |
my $st; # | |
my $sth; # | |
my $rv; # | |
my @array; # | |
my $id; # TREEFAM IDENTIFIER FOR A SEQUENCE | |
my $sequence; # PROTEIN SEQUENCE FOR A GENE. | |
# CONNECT TO THE DATABASE AND GET ALL FAMILIES OF THIS TYPE: | |
$database = "treefam_".$version; | |
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return; | |
# GET THE ALIGNMENT FOR EACH FAMILY FROM THE DATABASE: | |
$table_w = "aa_seq"; | |
$no_seqs = @$seqs; | |
for ($i = 1; $i <= $no_seqs; $i++) | |
{ | |
$seq = $seqs->[($i-1)]; | |
if (!($DISCARD->{$seq})) | |
{ | |
if (!($ID->{$seq})) | |
{ | |
$errormsg = "ERROR: get_seqs_for_family: do not know id for $seq\n"; | |
$errorcode = 6; # ERRORCODE=6 | |
$dbh->disconnect(); | |
return(\%SEQ,$errorcode,$errormsg); | |
} | |
$id = $ID->{$seq}; | |
$st = "SELECT SEQ from $table_w WHERE ID=\'$id\'"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$sequence = $array[0]; | |
if ($SEQ{$seq}) | |
{ | |
$errormsg = "ERROR: get_seqs_for_family: already have sequence for $seq\n"; | |
$errorcode= 7; # ERRORCODE=7 | |
$dbh->disconnect(); | |
return(\%SEQ,$errorcode,$errormsg); | |
} | |
$SEQ{$seq} = $sequence; | |
} | |
} | |
else | |
{ | |
if ($alntype eq 'full') # WE ARE USING THE FULL ALIGNMENT SO SHOULD KNOW SEQUENCES OF ALL GENES | |
{ | |
$errormsg = "ERROR: get_seqs_for_family: do not know sequence for $seq\n"; | |
$errorcode = 8; # ERRORCODE=8 | |
$dbh->disconnect(); | |
return(\%SEQ,$errorcode,$errormsg); | |
} | |
else # WE ARE USING THE SEED ALIGNMENT SO MAY BE MISSING SEQUENCES OF SOME GENES | |
{ | |
if ($PRINT_WARNINGS == 1) { print STDERR "WARNING: get_seqs_for_family: do not know sequence for $seq from seed alignment\n"; } | |
} | |
} | |
} | |
} | |
$dbh->disconnect(); | |
return(\%SEQ,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &print_error | |
sub test_print_error | |
{ | |
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR | |
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR | |
($errormsg,$errorcode) = &print_error(45,45,1); | |
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;} | |
($errormsg,$errorcode) = &print_error('My error message','My error message',1); | |
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;} | |
($errormsg,$errorcode) = &print_error('none',45,1); | |
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;} | |
($errormsg,$errorcode) = &print_error('My error message', 0, 1); | |
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;} | |
} | |
#------------------------------------------------------------------# | |
# PRINT OUT AN ERROR MESSAGE AND EXIT. | |
sub print_error | |
{ | |
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED. | |
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED. | |
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT | |
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; | |
exit; | |
} | |
} | |
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/)) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; | |
exit; | |
} | |
} | |
if ($errormsg eq 'none' || $errorcode == 0) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; | |
exit; | |
} | |
} | |
else | |
{ | |
print STDERR "$errormsg"; | |
exit; | |
} | |
return($errormsg,$errorcode); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment