Last active
November 22, 2017 02:11
-
-
Save avrilcoghlan/4017341 to your computer and use it in GitHub Desktop.
Perl script to run GeneWise, comparing a file of multiple of HMMs to a fasta file of multiple sequences
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
=head1 NAME | |
run_genewisedb.pl | |
=head1 SYNOPSIS | |
run_genewisedb.pl input_fasta input_hmms output outputdir spliceflat parameterfile | |
where input_fasta is the input fasta file of scaffolds, | |
input_hmms is the input file of HMMs, | |
output is the genewise output file, | |
outputdir is the output directory for writing output files, | |
spliceflat says whether to use the -splice flat option (yes/no), | |
parameterfile is the name of the intron parameter file (none if there is none). | |
=head1 DESCRIPTION | |
This script takes an input fasta file of scaffolds (<input_fasta>) and an | |
input file of HMMs (<input_hmms>) and runs genewise on the scaffolds, using | |
one HMM at a time. The output is then written in the output file <output>. | |
Before running this perl script you need to run: | |
setenv WISECONFIGDIR /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/ | |
=head1 VERSION | |
Perl script last edited 31-Oct-2012. | |
=head1 CONTACT | |
alc@sanger.ac.uk (Avril Coghlan) | |
=cut | |
# | |
# Perl script run_genewisedb.pl | |
# Written by Avril Coghlan (alc@sanger.ac.uk) | |
# 31-Oct-12. | |
# Last edited 31-Oct-2012. | |
# SCRIPT SYNOPSIS: run_genewisedb.pl: runs GeneWise using TreeFam HMMs, on an input fasta file of scaffolds. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
use strict; | |
use warnings; | |
my $num_args = $#ARGV + 1; | |
if ($num_args != 6) | |
{ | |
print "Usage of run_genewisedb.pl\n\n"; | |
print "perl run_genewisedb.pl <input_fasta> <input_hmms> <output> <outputdir> <spliceflat> <parameterfile>\n"; | |
print "where <input_fasta> is the input fasta file,\n"; | |
print " <input_hmms> is the input file of HMMs,\n"; | |
print " <output> is the genewise output file,\n"; | |
print " <outputdir> is the output directory for writing output files,\n"; | |
print " <spliceflat> says whether to use the -splice flat option (yes/no),\n"; | |
print " <parameterfile> is the name of the intron parameter file (none if there is none)\n"; | |
print "For example, >perl run_genewisedb.pl /lustre/scratch108/parasites/es9/50HGgenebuild/brpah/genome_small.fa\n"; | |
print "/lustre/scratch108/parasites/es9/GeneWise/TreeFam8.hmm genewise_output\n"; | |
print "/nfs/users/nfs_a/alc/Documents/GeneWise50Helminths yes mygenestat.stat\n"; | |
print "NOTE: Before running this perl script, you need to run:\n"; | |
print "setenv WISECONFIGDIR /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/\n"; | |
exit; | |
} | |
# FIND THE PATH TO THE INPUT FASTA FILE: | |
my $input_fasta = $ARGV[0]; | |
# FIND THE INPUT FILE OF HMMs: | |
my $input_hmms = $ARGV[1]; | |
# FIND THE GENEWISE OUTPUT FILE: | |
my $output = $ARGV[2]; | |
# FIND THE DIRECTORY TO USE FOR OUTPUT FILES: | |
my $outputdir = $ARGV[3]; | |
# FIND OUT WHETHER WE WANT TO USE THE -splice flat OPTION IN GENEWISE: | |
my $spliceflat = $ARGV[4]; | |
# FIND THE NAME OF THE INTRON PARAMETER FILE: | |
my $parameterfile = $ARGV[5]; | |
#------------------------------------------------------------------# | |
# TEST SUBROUTINES: | |
my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING. | |
&test_print_error; | |
&test_write_fasta_file($outputdir); | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
&run_main_program($outputdir,$input_fasta,$input_hmms,$output,$spliceflat,$parameterfile); | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
sub run_main_program | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN. | |
my $input_fasta = $_[1]; # THE INPUT FASTA FILE | |
my $input_hmms = $_[2]; # THE INPUT FILE OF HMMs | |
my $output = $_[3]; # THE OUTPUT FILE NAME | |
my $spliceflat = $_[4]; # SAYS WHETHER WE WANT TO USE THE -splice flat OPTION IN GENEWISE | |
my $parameterfile = $_[5]; # THE NAME OF THE INTRON PARAMETER FILE | |
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR. | |
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR. | |
# CHECK THAT THE WISECONFIGDIR IS SET CORRECTLY: | |
($errorcode,$errormsg) = &check_wiseconfigdir; | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# RUN GENEWISE WITH ONE SCAFFOLD/CONTIG AT A TIME, ON ONE HMM AT A TIME: | |
($errorcode,$errormsg) = &run_genewise_on_scaffolds($outputdir,$input_fasta,$input_hmms,$output,$spliceflat,$parameterfile); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
} | |
#------------------------------------------------------------------# | |
# RUN GENEWISE WITH ONE SCAFFOLD/CONTIG AT A TIME, ON ONE HMM AT A TIME: | |
sub run_genewise_on_scaffolds | |
{ | |
my $outputdir = $_[0]; # THE DIRECTORY FOR WRITING OUTPUT FILES | |
my $input_fasta = $_[1]; # THE INPUT FASTA FILE OF SCAFFOLDS | |
my $input_hmms = $_[2]; # THE INPUT FILE OF HMMs | |
my $output = $_[3]; # THE OUTPUT FILE | |
my $spliceflat = $_[4]; # SAYS WHETHER TO USE THE SPLICE FLAT OPTION | |
my $parameterfile = $_[5]; # THE INTRON PARAMETER FILE NAME | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $line; # | |
my @temp; # | |
my $name; # NAME OF A SCAFFOLD | |
my $seq = ""; # SEQUENCE OF A SCAFFOLD | |
my $seq_fasta; # FASTA FILE FOR SEQUENCE OF A SCAFFOLD | |
# OPEN THE OUTPUT FILE: | |
$output = $outputdir."/".$output; | |
open(OUTPUT,">$output") || die "ERROR: run_genewise: cannot open $output\n"; | |
close(OUTPUT); | |
# READ IN THE INPUT FASTA FILE AND TAKE ONE SCAFFOLD AT A TIME: | |
open(INPUTFASTA,"$input_fasta") || die "ERROR: run_genewise_on_scaffolds: cannot open $input_fasta\n"; | |
while(<INPUTFASTA>) | |
{ | |
$line = $_; | |
chomp $line; | |
if (substr($line,0,1) eq '>') | |
{ | |
# WRITE THE SEQUENCE TO A TEMPORARY FASTA FILE: | |
if ($seq ne '') | |
{ | |
($seq_fasta,$errorcode,$errormsg) = &write_fasta_file($seq,$name,$outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# RUN GENEWISE ON $seq_fasta, ONE HMM AT A TIME: | |
($errorcode,$errormsg) = &run_genewise($outputdir,$seq_fasta,$input_hmms,$output,$spliceflat,$parameterfile,$name); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# DELETE TEMPORARY FILE: | |
system "rm -f $seq_fasta"; | |
} | |
@temp = split(/\s+/,$line); | |
$name = $temp[0]; | |
$name = substr($name,1,length($name)-1); | |
} | |
else | |
{ | |
$seq = $seq.$line; | |
} | |
} | |
close(INPUTFASTA); | |
# WRITE THE SEQUENCE TO A TEMPORARY FASTA FILE: | |
if ($seq ne '') | |
{ | |
($seq_fasta,$errorcode,$errormsg) = &write_fasta_file($seq,$name,$outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# RUN GENEWISE ON $seq_fasta: | |
($errorcode,$errormsg) = &run_genewise($outputdir,$seq_fasta,$input_hmms,$output,$spliceflat,$parameterfile,$name); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# DELETE TEMPORARY FILE: | |
system "rm -f $seq_fasta"; | |
} | |
return($errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &write_fasta_file | |
sub test_write_fasta_file | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES TO | |
my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR | |
my $fasta; # FASTA FILE | |
my $expected_fasta; # EXPECTED CONTENTS OF FASTA FILE | |
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES | |
my $differences; # DIFFERENCES BETWEEN $fasta AND $expected_fasta | |
my $length_differences; # LENGTH OF $differences | |
my $line; # | |
($fasta,$errorcode,$errormsg) = &write_fasta_file("AGCTAGCT","myseq",$outputdir); | |
if ($errorcode != 0) { print STDERR "ERROR: test_write_fasta_file: failed test1\n"; exit;} | |
$random_number = rand(); | |
$expected_fasta = $outputdir."/tmp".$random_number; | |
open(EXPECTED,">$expected_fasta") || die "ERROR: write_fasta_file: cannot open $expected_fasta\n"; | |
print EXPECTED ">myseq\n"; | |
print EXPECTED "AGCTAGCT\n"; | |
close(EXPECTED); | |
$differences = ""; | |
open(TEMP,"diff $fasta $expected_fasta |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0) { print STDERR "ERROR: test_write_fasta_file: failed test1 (files $fasta $expected_fasta)\n"; exit;} | |
system "rm -f $fasta"; | |
system "rm -f $expected_fasta"; | |
} | |
#------------------------------------------------------------------# | |
# WRITE THE SEQUENCE TO A TEMPORARY FASTA FILE: | |
sub write_fasta_file | |
{ | |
my $seq = $_[0]; # SEQUENCE | |
my $name = $_[1]; # NAME OF THE SEQUENCE | |
my $outputdir = $_[2]; # DIRECTORY FOR WRITING OUTPUT FILES TO | |
my $fasta; # FASTA FILE | |
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
$random_number = rand(); | |
$fasta = $outputdir."/tmp".$random_number; | |
open(FASTA,">$fasta") || die "ERROR: write_fasta_file: cannot open $fasta\n"; | |
print FASTA ">$name\n"; | |
print FASTA "$seq\n"; | |
close(FASTA); | |
return($fasta,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# CHECK THAT THE WISECONFIGDIR IS SET CORRECTLY: | |
sub check_wiseconfigdir | |
{ | |
my $wiseconfigdir = "none";# THE WISECONFIGDIR | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
open(TMP,"echo \$WISECONFIGDIR |"); | |
while(<TMP>) | |
{ | |
$wiseconfigdir = $_; | |
chomp($wiseconfigdir); | |
} | |
close(TMP); | |
if ($wiseconfigdir ne '/nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/') | |
{ | |
$errormsg = "ERROR: check_wiseconfigdir: wiseconfigdir $wiseconfigdir\nBefore running this perl script, you must run:\nsetenv WISECONFIGDIR /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/\n"; | |
$errorcode = 6; # ERRORCODE=6 | |
return($errorcode,$errormsg); | |
} | |
return($errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# RUN GENEWISE ON ONE HMM AT A TIME: | |
sub run_genewise | |
{ | |
my $outputdir = $_[0]; # DIRECTORY FOR WRITING OUTPUT FILES TO | |
my $input_fasta = $_[1]; # THE INPUT FASTA FILE | |
my $input_hmms = $_[2]; # THE INPUT FILE OF HMMs | |
my $output = $_[3]; # THE OUTPUT FILE NAME | |
my $spliceflat = $_[4]; # SAYS WHETHER WE WANT TO USE THE -splice flat OPTION IN GENEWISE | |
my $parameterfile = $_[5]; # THE NAME OF THE INTRON PARAMETER FILE | |
my $scaffoldname = $_[6]; # THE NAME OF THE SCAFFOLD | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR. | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR. | |
my $line; # | |
my $hmm = ""; # A HMM | |
my $hmmname = "none";# HMM NAME | |
my @temp; # | |
# READ IN ONE HMM AT A TIME FROM THE INPUT FILE OF HMMs: | |
open(HMMS,"$input_hmms") || die "ERROR: run_genewise: cannot open $input_hmms\n"; | |
while(<HMMS>) | |
{ | |
$line = $_; | |
$hmm = $hmm.$line; | |
if (substr($line,0,2) eq '//') | |
{ | |
# CHECK WE HAVE A NAME FOR THE HMM: | |
if ($hmmname eq 'none') | |
{ | |
$errormsg = "ERROR: run_genewise: do not have name for hmm $hmm\n"; | |
$errorcode = 3; # ERRORCODE=3 | |
return($errorcode,$errormsg); | |
} | |
# RUN GENEWISE FOR THIS HMM: | |
($errorcode,$errormsg) = &run_genewise_on_hmm($hmm,$outputdir,$input_fasta,$output,$hmmname,$spliceflat,$parameterfile,$scaffoldname); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
$hmm = ""; | |
$hmmname = "none"; | |
} | |
if (substr($line,0,4) eq 'NAME') | |
{ | |
@temp = split(/\s+/,$line); | |
$hmmname = $temp[1]; | |
chomp($hmmname); | |
} | |
} | |
close(HMMS); | |
if ($hmm ne '') | |
{ | |
$errormsg = "ERROR: run_genewise: at end of file hmm should be empty\n"; | |
$errorcode = 1; # ERRORCODE=1 | |
return($errorcode,$errormsg); | |
} | |
return($errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# READ IN ONE HMM AT A TIME FROM THE INPUT FILE OF HMMs: | |
sub run_genewise_on_hmm | |
{ | |
my $hmm = $_[0]; # THE HMM | |
my $outputdir = $_[1]; # DIRECTORY TO WRITE OUTPUT FILES TO | |
my $input_fasta = $_[2]; # INPUT FASTA FILE | |
my $output = $_[3]; # FILE FOR GENEWISE OUTPUT | |
my $hmmname = $_[4]; # NAME OF THE HMM | |
my $spliceflat = $_[5]; # SAYS WHETHER WE WANT TO USE THE -splice flat OPTION IN GENEWISE | |
my $parameterfile = $_[6]; # THE NAME OF THE INTRON PARAMETER FILE | |
my $scaffoldname = $_[7]; # NAME OF THE SCAFFOLD | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR. | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR. | |
my $random_number; # RANDOM NUMBER FOR TEMPORARY FILE NAME | |
my $hmmfile; # FILE CONTAINING THE HMM | |
my $genewise; # TEMPORARY FILE FOR THE GENEWISE OUTPUT FOR THIS HMM | |
my $cmd; # THE GENEWISE COMMAND TO RUN | |
my $genewise_exe = "/nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/src/bin/genewise"; # wise 2.4.1, can use -genestats | |
# CHECK THE HMM LOOKS OK: | |
if (substr($hmm,0,8) ne 'HMMER2.0' && substr($hmm,length($hmm)-2,2) ne '//') | |
{ | |
$errormsg = "ERROR: run_genewise_on_hmm: hmm $hmm\n"; | |
$errorcode = 2; # ERRORCODE=2 | |
return($errorcode,$errormsg); | |
} | |
# CHECK THE SPLICEFLAT OPTION IS 'yes' OR 'no': | |
if ($spliceflat ne 'yes' && $spliceflat ne 'no') | |
{ | |
$errormsg = "ERROR: run_genewise_on_hmm: spliceflat is $spliceflat (should be yes or no)\n"; | |
$errorcode = 4; # ERRORCODE=4 | |
return($errorcode,$errormsg); | |
} | |
# WRITE THE HMM TO A FILE: | |
$random_number = rand(); | |
$hmmfile = $outputdir."/tmp".$random_number; | |
open(HMMFILE,">$hmmfile") || die "ERROR: run_genewise_on_hmm: cannot open $hmmfile\n"; | |
print HMMFILE "$hmm"; | |
close(HMMFILE); | |
# WRITE THE FAMILY NAME IN THE OUTPUT FILE: | |
system "echo $hmmname >> $output"; | |
# RUN GENEWISE: | |
$random_number = rand(); | |
$genewise = $outputdir."/tmp".$random_number; | |
print STDERR "Scaffold $scaffoldname: running genewise for HMM $hmmname ...\n"; | |
if ($parameterfile ne 'none' && $spliceflat eq 'no') | |
{ | |
$cmd = "$genewise_exe $hmmfile $input_fasta -gff -hmmer -genestats $parameterfile -nosplice_gtag > $genewise"; | |
} | |
elsif ($parameterfile eq 'none' && $spliceflat eq 'yes') | |
{ | |
$cmd = "$genewise_exe $hmmfile $input_fasta -gff -hmmer -splice_gtag > $genewise"; | |
} | |
elsif ($parameterfile eq 'none' && $spliceflat eq 'no') | |
{ | |
$cmd = "$genewise_exe $hmmfile $input_fasta -gff -hmmer -nosplice_gtag > $genewise"; # USES THE gene.stats FILE THAT COMES WITH GENEWISE, /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/gene.stat | |
} | |
elsif ($parameterfile ne 'none' && $spliceflat eq 'yes') | |
{ | |
$errormsg = "ERROR: run_genewise_on_hmm: cannot have spliceflat $spliceflat and parameterfile $parameterfile\n"; | |
$errorcode = 5; # ERRORCODE=5 | |
return($errorcode,$errormsg); | |
} | |
system "$cmd"; | |
# CONCATENATE THIS GENEWISE OUTPUT ONTO THE MAIN OUTPUT FILE OF GENEWISE OUTPUT: | |
sleep(1); | |
system "cat $genewise"; | |
system "cat $genewise >> $output"; | |
# DELETE TEMPORARY FILES: | |
system "rm -f $hmmfile"; | |
system "rm -f $genewise"; | |
return($errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &print_error | |
sub test_print_error | |
{ | |
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR | |
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR | |
($errormsg,$errorcode) = &print_error(45,45,1); | |
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;} | |
($errormsg,$errorcode) = &print_error('My error message','My error message',1); | |
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;} | |
($errormsg,$errorcode) = &print_error('none',45,1); | |
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;} | |
($errormsg,$errorcode) = &print_error('My error message', 0, 1); | |
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;} | |
} | |
#------------------------------------------------------------------# | |
# PRINT OUT AN ERROR MESSAGE AND EXIT. | |
sub print_error | |
{ | |
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED. | |
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED. | |
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT | |
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; | |
exit; | |
} | |
} | |
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/)) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; | |
exit; | |
} | |
} | |
if ($errormsg eq 'none' || $errorcode == 0) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; | |
exit; | |
} | |
} | |
else | |
{ | |
print STDERR "$errormsg"; | |
exit; | |
} | |
return($errormsg,$errorcode); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment