Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Last active November 22, 2017 02:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save avrilcoghlan/4017341 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/4017341 to your computer and use it in GitHub Desktop.
Perl script to run GeneWise, comparing a file of multiple of HMMs to a fasta file of multiple sequences
#!/usr/local/bin/perl
=head1 NAME
run_genewisedb.pl
=head1 SYNOPSIS
run_genewisedb.pl input_fasta input_hmms output outputdir spliceflat parameterfile
where input_fasta is the input fasta file of scaffolds,
input_hmms is the input file of HMMs,
output is the genewise output file,
outputdir is the output directory for writing output files,
spliceflat says whether to use the -splice flat option (yes/no),
parameterfile is the name of the intron parameter file (none if there is none).
=head1 DESCRIPTION
This script takes an input fasta file of scaffolds (<input_fasta>) and an
input file of HMMs (<input_hmms>) and runs genewise on the scaffolds, using
one HMM at a time. The output is then written in the output file <output>.
Before running this perl script you need to run:
setenv WISECONFIGDIR /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/
=head1 VERSION
Perl script last edited 31-Oct-2012.
=head1 CONTACT
alc@sanger.ac.uk (Avril Coghlan)
=cut
#
# Perl script run_genewisedb.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 31-Oct-12.
# Last edited 31-Oct-2012.
# SCRIPT SYNOPSIS: run_genewisedb.pl: runs GeneWise using TreeFam HMMs, on an input fasta file of scaffolds.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
use strict;
use warnings;
my $num_args = $#ARGV + 1;
if ($num_args != 6)
{
print "Usage of run_genewisedb.pl\n\n";
print "perl run_genewisedb.pl <input_fasta> <input_hmms> <output> <outputdir> <spliceflat> <parameterfile>\n";
print "where <input_fasta> is the input fasta file,\n";
print " <input_hmms> is the input file of HMMs,\n";
print " <output> is the genewise output file,\n";
print " <outputdir> is the output directory for writing output files,\n";
print " <spliceflat> says whether to use the -splice flat option (yes/no),\n";
print " <parameterfile> is the name of the intron parameter file (none if there is none)\n";
print "For example, >perl run_genewisedb.pl /lustre/scratch108/parasites/es9/50HGgenebuild/brpah/genome_small.fa\n";
print "/lustre/scratch108/parasites/es9/GeneWise/TreeFam8.hmm genewise_output\n";
print "/nfs/users/nfs_a/alc/Documents/GeneWise50Helminths yes mygenestat.stat\n";
print "NOTE: Before running this perl script, you need to run:\n";
print "setenv WISECONFIGDIR /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/\n";
exit;
}
# FIND THE PATH TO THE INPUT FASTA FILE:
my $input_fasta = $ARGV[0];
# FIND THE INPUT FILE OF HMMs:
my $input_hmms = $ARGV[1];
# FIND THE GENEWISE OUTPUT FILE:
my $output = $ARGV[2];
# FIND THE DIRECTORY TO USE FOR OUTPUT FILES:
my $outputdir = $ARGV[3];
# FIND OUT WHETHER WE WANT TO USE THE -splice flat OPTION IN GENEWISE:
my $spliceflat = $ARGV[4];
# FIND THE NAME OF THE INTRON PARAMETER FILE:
my $parameterfile = $ARGV[5];
#------------------------------------------------------------------#
# TEST SUBROUTINES:
my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING.
&test_print_error;
&test_write_fasta_file($outputdir);
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
&run_main_program($outputdir,$input_fasta,$input_hmms,$output,$spliceflat,$parameterfile);
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
sub run_main_program
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN.
my $input_fasta = $_[1]; # THE INPUT FASTA FILE
my $input_hmms = $_[2]; # THE INPUT FILE OF HMMs
my $output = $_[3]; # THE OUTPUT FILE NAME
my $spliceflat = $_[4]; # SAYS WHETHER WE WANT TO USE THE -splice flat OPTION IN GENEWISE
my $parameterfile = $_[5]; # THE NAME OF THE INTRON PARAMETER FILE
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR.
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR.
# CHECK THAT THE WISECONFIGDIR IS SET CORRECTLY:
($errorcode,$errormsg) = &check_wiseconfigdir;
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# RUN GENEWISE WITH ONE SCAFFOLD/CONTIG AT A TIME, ON ONE HMM AT A TIME:
($errorcode,$errormsg) = &run_genewise_on_scaffolds($outputdir,$input_fasta,$input_hmms,$output,$spliceflat,$parameterfile);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
}
#------------------------------------------------------------------#
# RUN GENEWISE WITH ONE SCAFFOLD/CONTIG AT A TIME, ON ONE HMM AT A TIME:
sub run_genewise_on_scaffolds
{
my $outputdir = $_[0]; # THE DIRECTORY FOR WRITING OUTPUT FILES
my $input_fasta = $_[1]; # THE INPUT FASTA FILE OF SCAFFOLDS
my $input_hmms = $_[2]; # THE INPUT FILE OF HMMs
my $output = $_[3]; # THE OUTPUT FILE
my $spliceflat = $_[4]; # SAYS WHETHER TO USE THE SPLICE FLAT OPTION
my $parameterfile = $_[5]; # THE INTRON PARAMETER FILE NAME
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $line; #
my @temp; #
my $name; # NAME OF A SCAFFOLD
my $seq = ""; # SEQUENCE OF A SCAFFOLD
my $seq_fasta; # FASTA FILE FOR SEQUENCE OF A SCAFFOLD
# OPEN THE OUTPUT FILE:
$output = $outputdir."/".$output;
open(OUTPUT,">$output") || die "ERROR: run_genewise: cannot open $output\n";
close(OUTPUT);
# READ IN THE INPUT FASTA FILE AND TAKE ONE SCAFFOLD AT A TIME:
open(INPUTFASTA,"$input_fasta") || die "ERROR: run_genewise_on_scaffolds: cannot open $input_fasta\n";
while(<INPUTFASTA>)
{
$line = $_;
chomp $line;
if (substr($line,0,1) eq '>')
{
# WRITE THE SEQUENCE TO A TEMPORARY FASTA FILE:
if ($seq ne '')
{
($seq_fasta,$errorcode,$errormsg) = &write_fasta_file($seq,$name,$outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# RUN GENEWISE ON $seq_fasta, ONE HMM AT A TIME:
($errorcode,$errormsg) = &run_genewise($outputdir,$seq_fasta,$input_hmms,$output,$spliceflat,$parameterfile,$name);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# DELETE TEMPORARY FILE:
system "rm -f $seq_fasta";
}
@temp = split(/\s+/,$line);
$name = $temp[0];
$name = substr($name,1,length($name)-1);
}
else
{
$seq = $seq.$line;
}
}
close(INPUTFASTA);
# WRITE THE SEQUENCE TO A TEMPORARY FASTA FILE:
if ($seq ne '')
{
($seq_fasta,$errorcode,$errormsg) = &write_fasta_file($seq,$name,$outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# RUN GENEWISE ON $seq_fasta:
($errorcode,$errormsg) = &run_genewise($outputdir,$seq_fasta,$input_hmms,$output,$spliceflat,$parameterfile,$name);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# DELETE TEMPORARY FILE:
system "rm -f $seq_fasta";
}
return($errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &write_fasta_file
sub test_write_fasta_file
{
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES TO
my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR
my $fasta; # FASTA FILE
my $expected_fasta; # EXPECTED CONTENTS OF FASTA FILE
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES
my $differences; # DIFFERENCES BETWEEN $fasta AND $expected_fasta
my $length_differences; # LENGTH OF $differences
my $line; #
($fasta,$errorcode,$errormsg) = &write_fasta_file("AGCTAGCT","myseq",$outputdir);
if ($errorcode != 0) { print STDERR "ERROR: test_write_fasta_file: failed test1\n"; exit;}
$random_number = rand();
$expected_fasta = $outputdir."/tmp".$random_number;
open(EXPECTED,">$expected_fasta") || die "ERROR: write_fasta_file: cannot open $expected_fasta\n";
print EXPECTED ">myseq\n";
print EXPECTED "AGCTAGCT\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $fasta $expected_fasta |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_write_fasta_file: failed test1 (files $fasta $expected_fasta)\n"; exit;}
system "rm -f $fasta";
system "rm -f $expected_fasta";
}
#------------------------------------------------------------------#
# WRITE THE SEQUENCE TO A TEMPORARY FASTA FILE:
sub write_fasta_file
{
my $seq = $_[0]; # SEQUENCE
my $name = $_[1]; # NAME OF THE SEQUENCE
my $outputdir = $_[2]; # DIRECTORY FOR WRITING OUTPUT FILES TO
my $fasta; # FASTA FILE
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
$random_number = rand();
$fasta = $outputdir."/tmp".$random_number;
open(FASTA,">$fasta") || die "ERROR: write_fasta_file: cannot open $fasta\n";
print FASTA ">$name\n";
print FASTA "$seq\n";
close(FASTA);
return($fasta,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# CHECK THAT THE WISECONFIGDIR IS SET CORRECTLY:
sub check_wiseconfigdir
{
my $wiseconfigdir = "none";# THE WISECONFIGDIR
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
open(TMP,"echo \$WISECONFIGDIR |");
while(<TMP>)
{
$wiseconfigdir = $_;
chomp($wiseconfigdir);
}
close(TMP);
if ($wiseconfigdir ne '/nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/')
{
$errormsg = "ERROR: check_wiseconfigdir: wiseconfigdir $wiseconfigdir\nBefore running this perl script, you must run:\nsetenv WISECONFIGDIR /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/\n";
$errorcode = 6; # ERRORCODE=6
return($errorcode,$errormsg);
}
return($errorcode,$errormsg);
}
#------------------------------------------------------------------#
# RUN GENEWISE ON ONE HMM AT A TIME:
sub run_genewise
{
my $outputdir = $_[0]; # DIRECTORY FOR WRITING OUTPUT FILES TO
my $input_fasta = $_[1]; # THE INPUT FASTA FILE
my $input_hmms = $_[2]; # THE INPUT FILE OF HMMs
my $output = $_[3]; # THE OUTPUT FILE NAME
my $spliceflat = $_[4]; # SAYS WHETHER WE WANT TO USE THE -splice flat OPTION IN GENEWISE
my $parameterfile = $_[5]; # THE NAME OF THE INTRON PARAMETER FILE
my $scaffoldname = $_[6]; # THE NAME OF THE SCAFFOLD
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR.
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR.
my $line; #
my $hmm = ""; # A HMM
my $hmmname = "none";# HMM NAME
my @temp; #
# READ IN ONE HMM AT A TIME FROM THE INPUT FILE OF HMMs:
open(HMMS,"$input_hmms") || die "ERROR: run_genewise: cannot open $input_hmms\n";
while(<HMMS>)
{
$line = $_;
$hmm = $hmm.$line;
if (substr($line,0,2) eq '//')
{
# CHECK WE HAVE A NAME FOR THE HMM:
if ($hmmname eq 'none')
{
$errormsg = "ERROR: run_genewise: do not have name for hmm $hmm\n";
$errorcode = 3; # ERRORCODE=3
return($errorcode,$errormsg);
}
# RUN GENEWISE FOR THIS HMM:
($errorcode,$errormsg) = &run_genewise_on_hmm($hmm,$outputdir,$input_fasta,$output,$hmmname,$spliceflat,$parameterfile,$scaffoldname);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
$hmm = "";
$hmmname = "none";
}
if (substr($line,0,4) eq 'NAME')
{
@temp = split(/\s+/,$line);
$hmmname = $temp[1];
chomp($hmmname);
}
}
close(HMMS);
if ($hmm ne '')
{
$errormsg = "ERROR: run_genewise: at end of file hmm should be empty\n";
$errorcode = 1; # ERRORCODE=1
return($errorcode,$errormsg);
}
return($errorcode,$errormsg);
}
#------------------------------------------------------------------#
# READ IN ONE HMM AT A TIME FROM THE INPUT FILE OF HMMs:
sub run_genewise_on_hmm
{
my $hmm = $_[0]; # THE HMM
my $outputdir = $_[1]; # DIRECTORY TO WRITE OUTPUT FILES TO
my $input_fasta = $_[2]; # INPUT FASTA FILE
my $output = $_[3]; # FILE FOR GENEWISE OUTPUT
my $hmmname = $_[4]; # NAME OF THE HMM
my $spliceflat = $_[5]; # SAYS WHETHER WE WANT TO USE THE -splice flat OPTION IN GENEWISE
my $parameterfile = $_[6]; # THE NAME OF THE INTRON PARAMETER FILE
my $scaffoldname = $_[7]; # NAME OF THE SCAFFOLD
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR.
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR.
my $random_number; # RANDOM NUMBER FOR TEMPORARY FILE NAME
my $hmmfile; # FILE CONTAINING THE HMM
my $genewise; # TEMPORARY FILE FOR THE GENEWISE OUTPUT FOR THIS HMM
my $cmd; # THE GENEWISE COMMAND TO RUN
my $genewise_exe = "/nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/src/bin/genewise"; # wise 2.4.1, can use -genestats
# CHECK THE HMM LOOKS OK:
if (substr($hmm,0,8) ne 'HMMER2.0' && substr($hmm,length($hmm)-2,2) ne '//')
{
$errormsg = "ERROR: run_genewise_on_hmm: hmm $hmm\n";
$errorcode = 2; # ERRORCODE=2
return($errorcode,$errormsg);
}
# CHECK THE SPLICEFLAT OPTION IS 'yes' OR 'no':
if ($spliceflat ne 'yes' && $spliceflat ne 'no')
{
$errormsg = "ERROR: run_genewise_on_hmm: spliceflat is $spliceflat (should be yes or no)\n";
$errorcode = 4; # ERRORCODE=4
return($errorcode,$errormsg);
}
# WRITE THE HMM TO A FILE:
$random_number = rand();
$hmmfile = $outputdir."/tmp".$random_number;
open(HMMFILE,">$hmmfile") || die "ERROR: run_genewise_on_hmm: cannot open $hmmfile\n";
print HMMFILE "$hmm";
close(HMMFILE);
# WRITE THE FAMILY NAME IN THE OUTPUT FILE:
system "echo $hmmname >> $output";
# RUN GENEWISE:
$random_number = rand();
$genewise = $outputdir."/tmp".$random_number;
print STDERR "Scaffold $scaffoldname: running genewise for HMM $hmmname ...\n";
if ($parameterfile ne 'none' && $spliceflat eq 'no')
{
$cmd = "$genewise_exe $hmmfile $input_fasta -gff -hmmer -genestats $parameterfile -nosplice_gtag > $genewise";
}
elsif ($parameterfile eq 'none' && $spliceflat eq 'yes')
{
$cmd = "$genewise_exe $hmmfile $input_fasta -gff -hmmer -splice_gtag > $genewise";
}
elsif ($parameterfile eq 'none' && $spliceflat eq 'no')
{
$cmd = "$genewise_exe $hmmfile $input_fasta -gff -hmmer -nosplice_gtag > $genewise"; # USES THE gene.stats FILE THAT COMES WITH GENEWISE, /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/gene.stat
}
elsif ($parameterfile ne 'none' && $spliceflat eq 'yes')
{
$errormsg = "ERROR: run_genewise_on_hmm: cannot have spliceflat $spliceflat and parameterfile $parameterfile\n";
$errorcode = 5; # ERRORCODE=5
return($errorcode,$errormsg);
}
system "$cmd";
# CONCATENATE THIS GENEWISE OUTPUT ONTO THE MAIN OUTPUT FILE OF GENEWISE OUTPUT:
sleep(1);
system "cat $genewise";
system "cat $genewise >> $output";
# DELETE TEMPORARY FILES:
system "rm -f $hmmfile";
system "rm -f $genewise";
return($errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &print_error
sub test_print_error
{
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR
($errormsg,$errorcode) = &print_error(45,45,1);
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message','My error message',1);
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;}
($errormsg,$errorcode) = &print_error('none',45,1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message', 0, 1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;}
}
#------------------------------------------------------------------#
# PRINT OUT AN ERROR MESSAGE AND EXIT.
sub print_error
{
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED.
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED.
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/)
{
if ($called_from_test == 1)
{
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n";
exit;
}
}
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/))
{
if ($called_from_test == 1)
{
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n";
exit;
}
}
if ($errormsg eq 'none' || $errorcode == 0)
{
if ($called_from_test == 1)
{
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n";
exit;
}
}
else
{
print STDERR "$errormsg";
exit;
}
return($errormsg,$errorcode);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment