Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created February 28, 2013 13:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/5056813 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5056813 to your computer and use it in GitHub Desktop.
Perl script that, for a TreeFam family, gets sequences from the database, and builds an alignment & HMM
#!/usr/local/bin/perl
=head1 NAME
make_aln_and_hmm_for_treefam_family.pl
=head1 SYNOPSIS
make_aln_and_hmm_for_treefam_family.pl treefam_version output outputdir family hmmer_bin aln_output map_output
=head1 DESCRIPTION
For a TreeFam family of interest (<family>), this script reads in the list of sequences
that belongs to the family from the TreeFam database, and builds a multiple sequence
alignment for the family using MUSCLE, and then builds a HMM for the alignment using
HMMER. The HMM is written in the output file <output>. <hmmer_bin> is the directory
containing the hmmer executables. The multiple sequence alignment is written to file
<aln_output>. The hmmbuild map output is put into <map_output>.
=head1 VERSION
Perl script last edited 25-Oct-2012.
=head1 CONTACT
alc@sanger.ac.uk (Avril Coghlan)
=cut
#
# Perl script make_aln_and_hmm_for_treefam_family.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 25-Oct-12.
# Last edited 25-Oct-2012.
# SCRIPT SYNOPSIS: make_aln_and_hmm_for_treefam_family.pl: for a TreeFam family, gets sequences from the database, and builds an alignment & HMM
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
use strict;
use warnings;
use DBI;
use Devel::Size qw(size total_size);
my $num_args = $#ARGV + 1;
if ($num_args != 7)
{
print "Usage of make_aln_and_hmm_for_treefam_family.pl\n\n";
print "perl make_aln_and_hmm_for_treefam_family.pl <treefam_version> <output> <outputdir> <family> <hmmer_bin> <aln_output> <map_output>\n";
print "where <treefam_version> is the version of TreeFam to use (eg. 7),\n";
print " <output> is the output file of HMMs,\n";
print " <outputdir> is the directory for writing output files,\n";
print " <family> is the family of interest,\n";
print " <hmmer_bin> is the directory containing the hmmer executable,\n";
print " <aln_output> is the output multiple sequence alignment\n";
print "For example, >perl make_aln_and_hmm_for_treefam_family.pl 7 myhmms\n";
print "/nfs/users/nfs_a/alc/Documents/StrongyloidesTreeFam TF101000\n";
print "/software/hmmer/bin/ myaln mymap\n";
exit;
}
# FIND THE TREEFAM VERSION TO USE:
my $treefam_version = $ARGV[0];
# FIND THE OUTPUT FILE NAME:
my $output = $ARGV[1];
# FIND THE DIRECTORY TO WRITE OUTPUT FILES TO:
my $outputdir = $ARGV[2];
# FIND THE FAMILY OF INTEREST:
my $family = $ARGV[3];
# FIND THE DIRECTORY WITH THE HMMER EXECUTABLES:
my $hmmer_bin = $ARGV[4];
# FIND THE NAME OF THE OUTPUT MULTIPLE SEQUENCE ALIGNMENT:
my $aln_output = $ARGV[5];
# FIND THE NAME OF THE FILE WITH THE hmmbuild MAP OUTPUT:
my $map_output = $ARGV[6];
# TEST SUBROUTINES:
my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING.
my $PRINT_WARNINGS = 0; # SAYS WHETHER TO PRINT WARNINGS ABOUT MISSING DATA.
&test_print_error;
&test_make_hmm($outputdir);
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
&run_main_program($treefam_version,$output,$outputdir,$family,$hmmer_bin,$aln_output,$map_output);
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
sub run_main_program
{
my $version = $_[0]; ## THE TREEFAM VERSION TO USE.
my $output = $_[1]; ## THE OUTPUT FILE.
my $outputdir = $_[2]; ## DIRECTORY TO WRITE OUTPUT FILES TO.
my $family = $_[3]; ## THE FAMILY OF INTEREST
my $hmmer_build = $_[4]; ## DIRECTORY CONTAINING THE HMMER EXECUTABLES
my $aln_output = $_[5]; ## THE MULTIPLE SEQUENCE ALIGNMENT FILE NAME
my $map_output = $_[6]; ## THE hmmbuild MAP OUTPUT
my $errorcode; ## RETURNED AS 0 IF THERE IS NO ERROR.
my $errormsg; ## RETURNED AS 'none' IF THERE IS NO ERROR.
my $fasta; ## FASTA FILE OF SEQUENCES IN THE FAMILY
my $genelist; ## FILE WITH IDENTIFIERS OF SEQUENCES FROM SPECIES THAT WERE ARE INTERESTED IN
my $genelist2; ## SHORTER FILE WITH IDENTIFIERS OF SEQUENCES FROM SPECIES THAT WERE ARE INTERESTED IN
# MAKE A FILE WITH THE IDENTIFIERS OF SEQUENCES FROM SPECIES THAT WE ARE INTERESTED IN:
($genelist,$genelist2,$errorcode,$errormsg) = &make_genelist($version,$outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# GET THE SEQUENCES FOR THE FAMILY AND WRITE THEM TO A FASTA FILE:
($fasta,$errorcode,$errormsg) = &make_fasta_of_seqs($version,$outputdir,$family,$genelist,$genelist2);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# USE MUSCLE TO MAKE AN ALIGNMENT FOR THE FAMILY:
($errorcode,$errormsg) = &make_aln($fasta,$aln_output);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# USE HMMER TO MAKE A HMM FOR THE FAMILY:
($errorcode,$errormsg) = &make_hmm($aln_output,$hmmer_bin,$output,$map_output);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
}
#------------------------------------------------------------------#
# USE MUSCLE TO MAKE AN ALIGNMENT FOR THE FAMILY:
# SUBROUTINE SYNOPSIS: make_aln(): builds a multiple alignment for a TreeFam family, given a fasta file
sub make_aln
{
my $fasta = $_[0]; # FASTA FILE
my $mfa = $_[1]; # THE MULTIPLE SEQUENCE ALIGNMENT FILE
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES
my $cmd; # COMMAND TO RUN
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
# CALL MUSCLE TO MAKE A MULTIPLE ALIGNMENT
$cmd = "/software/pubseq/bin/muscle -in $fasta -out $mfa >& /dev/null";
system "$cmd";
# DELETE THE TEMPORARY FASTA FILE:
system "rm -f $fasta";
return($errorcode,$errormsg);
}
#------------------------------------------------------------------#
# GET THE SEQUENCES FOR A FAMILY AND WRITE THEM TO A FASTA FILE:
# SUBROUTINE SYNOPSIS: make_fasta_of_seqs(): retrieve sequences for a TreeFam family and write them to a fasta file
sub make_fasta_of_seqs
{
my $version = $_[0]; # VERSION OF THE TREEFAM DATABASE TO USE
my $outputdir = $_[1]; # DIRECTORY FOR WRITING OUTPUT FILES TO
my $family = $_[2]; # FAMILY OF INTEREST
my $genelist = $_[3]; # FILE WITH GENES FROM THE SPECIES WE WANT TO TAKE
my $genelist2 = $_[4]; # SHORTER FILE WITH GENES FROM THE SPECIES WE WANT
my $fasta; # FASTA FILE FOR THE FAMILY
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES
my $database; # TREEFAM DATABASE TO USE
my $dbh; #
my $st; #
my $sth; #
my $rv; #
my @array; #
my @ids; # ARRAY OF IDs TO TAKE
my $on_genelist; # SAYS WHETHER AN ID IS ON $genelist
my $line; #
my @temp; #
my $no_ids; # NUMBER OF IDs IN @ids
my $id; # AN IDENTIFIER FOR A SEQUENCE
my $i; #
my $seq; # SEQUENCE FOR AN ID.
my $table_w; # DATABASE TABLE OF INTEREST
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
# OPEN A NEW FILE FOR WRITING THE SEQUENCES:
$random_number = rand();
$fasta = $outputdir."/tmp".$random_number;
open(FASTA,">$fasta") || die "ERROR: make_fasta_of_seqs: cannot open $fasta\n";
# CONNECT TO THE DATABASE AND GET ALL SEQUENCES FOR THE FAMILY:
$database = "treefam_".$version;
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return;
# GET THE SEQUENCES FROM THE FAMILY FROM THE DATABASE:
$table_w = "fam_genes";
$st = "SELECT ID from $table_w WHERE AC=\'$family\'";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$id = $array[0];
# CHECK IF $id APPEARS ON $genelist:
$on_genelist = 0;
open(TMP,"grep $id $genelist |");
while(<TMP>)
{
$line = $_;
chomp $line;
if ($line eq $id) { $on_genelist = 1;}
}
close(TMP);
if ($on_genelist == 1)
{
@ids = (@ids,$id);
}
}
}
# IF THERE ARE MORE THAN 100 SEQUENCES, ONLY TAKE THE SEQUENCES FROM A FEW SPECIES:
if ($#ids >= 100)
{
$no_ids = $#ids + 1;
for ($i = 1; $i <= $no_ids; $i++)
{
$id = $ids[($i-1)];
# CHECK IF THIS IS IN THE SHORTER FILE OF IDs FROM SPECIES TO TAKE:
# CHECK IF $id APPEARS ON $genelist2:
$on_genelist = 0;
open(TMP,"grep $id $genelist2 |");
while(<TMP>)
{
$line = $_;
chomp $line;
if ($line eq $id) { $on_genelist = 1;}
}
close(TMP);
if ($on_genelist == 0) # RECORD THAT THIS $id IS NOT IN $genelist2:
{
$ids[$i] = "NA";
}
}
}
# IF THERE ARE <=5 SEQUENCES, TAKE ALL THE SEQUENCES FROM THE FAMILY:
if ($#ids <= 5)
{
@ids = ();
$table_w = "fam_genes";
$st = "SELECT ID from $table_w WHERE AC=\'$family\'";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$id = $array[0];
@ids = (@ids,$id);
}
}
}
# NOW GET THE SEQUENCES FOR THE IDs:
$table_w = "aa_seq";
$no_ids = $#ids + 1;
for ($i = 1; $i <= $no_ids; $i++)
{
$id = $ids[($i-1)];
if ($id ne 'NA')
{
$st = "SELECT SEQ from $table_w WHERE ID=\'$id\'";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$seq = $array[0];
# REMOVE *s FROM THE SEQUENCE:
$seq =~ s/\*//g;
# REMOVE .s FROM THE SEQUENCE:
$seq =~ s/\*//g;
print FASTA ">$id\n";
print FASTA "$seq\n";
}
}
}
}
close(FASTA);
# REMOVE THE GENELIST FILE:
system "rm -f $genelist";
system "rm -f $genelist2";
$dbh->disconnect();
return($fasta,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# MAKE A FILE WITH THE IDENTIFIERS OF SEQUENCES FROM SPECIES THAT WE ARE INTERESTED IN:
sub make_genelist
{
my $version = $_[0]; # VERSION OF THE TREEFAM DATABASE TO USE
my $outputdir = $_[1]; # DIRECTORY FOR WRITING OUTPUT FILES TO
my $genelist; # FILE FOR WRITING THE LIST OF SEQUENCES TO
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES
my $database; # TREEFAM DATABASE TO CONNECT TO
my $dbh; #
my $table_w; # TABLE TO QUERY FROM THE DATABASE
my @species; # ARRAY OF SPECIES WE ARE INTERESTED IN
my $num_species; # NUMBER OF SPECIES WE ARE INTERESTED IN
my $species; # A SPECIES WE ARE INTERESTED IN
my $i; #
my $st; #
my $sth; #
my $rv; #
my @array; #
my $id; # IDENTIFIER FOR SEQUENCE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = 'none';# RETURNED AS 'none' IF THERE IS NO ERROR
my $genelist2; # A SECOND FILE OF SPECIES TO TAKE, WITH FEWER SPECIES
# OPEN A NEW FILE FOR WRITING THE LIST OF SEQUENCES:
$random_number = rand();
$genelist = $outputdir."/tmp".$random_number;
open(GENELIST,">$genelist") || die "ERROR: make_genelist: cannot open $genelist\n";
$random_number = rand();
$genelist2 = $outputdir."/tmp".$random_number;
open(GENELIST2,">$genelist2") || die "ERROR: make_genelist2: cannot open $genelist2\n";
# CONNECT TO THE DATABASE AND GET ALL SEQUENCES OF PARTICULAR SPECIES:
$database = "treefam_".$version;
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return;
# GET THE SEQUENCES FROM CERTAIN SPECIES FROM THE DATABASE:
$table_w = "genes";
@species = ("6239", # CAENORHABDITIS ELEGANS
"6238", # CAENOBHABDITIS BRIGGSAE
"6279", # BRUGIA MALAYI
"7227", # DROSOPHILA MELANOGASTER
"7234", # DROSOPHILA PERSIMILIS
"7165", # ANOPHELES GAMBIAE
"7159", # AEDES AEGYPTI
"6183", # SCHISTOSOMA MANSONI
"45351",# NEMATOSTELLA VECTENSIS
"7719", # CIONA INTESTINALIS
"9606", # HUMAN
"9598", # CHIMP
"9544", # MACAQUE
"10116",# RAT
"10090",# MOUSE
"9913", # COW
"9615", # DOG
"13616",# MONODELPHIS DOMESTICA
"9031", # CHICKEN
"7955", # ZEBRAFISH
"8364", # XENOPUS TROPICALIS
"99883",# TETRAODON NIGROVIRIDIS
"69293",# GASTEROSTEUS ACULEATUS (STICKLEBACK)
"4932", # SACCHAROMYCES CEREVISIAE
"4896", # SCHIZOSACCHAROMYCES POMBE
"3702", # ARABIDOPSIS THALIANA
"7668", # STRONGYLOCENTROTUS PURPURATUS
"44689");# DICTYOSTELIUM DISCOIDEUM
$num_species = $#species + 1;
for ($i = 1; $i <= $num_species; $i++)
{
$species = $species[($i-1)];
$st = "SELECT ID from $table_w WHERE TAX_ID=\'$species\'";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$id = $array[0];
print GENELIST "$id\n";
if ($species eq '6329' || $species eq '7227' || $species eq '6183' || # C. ELEGANS, D. MELANOGASTER, S. MANSONI
$species eq '45351'|| $species eq '7719' || $species eq '9606' || # N. VECTENSIS, C. INTESTINALIS, HUMAN
$species eq '4932' || # S. CEREVISIAE
$species eq '3702' || $species eq '7668' || $species eq '44689') # A. THALIANA, S. PURPURATUS, D. DICOIDEUM
{
print GENELIST2 "$id\n";
}
}
}
}
close(GENELIST);
close(GENELIST2);
$dbh->disconnect();
return($genelist,$genelist2,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &make_hmm
sub test_make_hmm
{
my $outputdir = $_[0]; ## DIRECTORY FOR WRITING OUTPUT FILES TO
my $errorcode; ## RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR
my $errormsg; ## RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR
my $random_number; ## RANDOM NUMBER FOR TEMPORARY FILE NAME
my $mfafile; ## FILE FOR WRITING MULTIPLE ALIGNMENT TO
my $hmmfile; ## FILE FOR WRITING THE HMM TO
my $map_output; ## hmmbuild MAP OUTPUT FILE
$random_number = rand();
$mfafile = $outputdir."/tmp".$random_number;
$random_number = rand();
$hmmfile = $outputdir."/tmp".$random_number;
$random_number = rand();
$map_output = $outputdir."/tmp".$random_number;
open(MFAFILE,">$mfafile") || die "ERROR: cannot open $mfafile\n";
print MFAFILE ">seq1\nACGTACC\n";
print MFAFILE ">seq2\n-CCTACC\n";
print MFAFILE ">seq3\nAC-TACC\n";
close(MFAFILE);
($errorcode,$errormsg) = &make_hmm($mfafile,"/software/hmmer/bin/",$hmmfile,$map_output);
if ($errorcode != 0) { print STDERR "ERROR: test_make_hmm: failed test1\n"; exit;}
system "rm -f $mfafile";
system "rm -f $hmmfile";
system "rm -f $map_output";
}
#------------------------------------------------------------------#
# MAKE A HMM FOR THIS FAMILY:
# SUBROUTINE SYNOPSIS: make_hmm(): build a HMM for a TreeFam family, given a multiple alignment
sub make_hmm
{
my $mfafile = $_[0]; ## FILE CONTAINING THE MULTIPLE ALIGNMENT FOR THE FAMILY
my $hmmer_bin = $_[1]; ## DIRECTORY CONTAINING THE HMMER EXECUTABLES
my $output = $_[2]; ## NAME OF THE OUTPUT HMM FILE
my $map_output = $_[3]; ## NAME OF THE HMMBUILD MAP OUTPUT FILE
my $errorcode = 0; ## RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";## RETURNED AS 'none' IF THERE IS NO ERROR
my $cmd; ## COMMAND TO RUN
# RUN HMMER2 TO MAKE THE HMM:
$cmd = $hmmer_bin."/hmmbuild -o $map_output $output $mfafile >& /dev/null";
system "$cmd";
$cmd = $hmmer_bin."/hmmcalibrate $output >& /dev/null";
system "$cmd";
return($errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &print_error
sub test_print_error
{
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR
($errormsg,$errorcode) = &print_error(45,45,1);
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message','My error message',1);
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;}
($errormsg,$errorcode) = &print_error('none',45,1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message', 0, 1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;}
}
#------------------------------------------------------------------#
# PRINT OUT AN ERROR MESSAGE AND EXIT.
sub print_error
{
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED.
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED.
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/)
{
if ($called_from_test == 1)
{
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n";
exit;
}
}
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/))
{
if ($called_from_test == 1)
{
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n";
exit;
}
}
if ($errormsg eq 'none' || $errorcode == 0)
{
if ($called_from_test == 1)
{
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n";
exit;
}
}
else
{
print STDERR "$errormsg";
exit;
}
return($errormsg,$errorcode);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment