Last active
November 22, 2017 02:11
Revisions
-
avrilcoghlan revised this gist
Feb 28, 2013 . 1 changed file with 1 addition and 0 deletions.There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -37,6 +37,7 @@ =head1 CONTACT # Written by Avril Coghlan (alc@sanger.ac.uk) # 31-Oct-12. # Last edited 31-Oct-2012. # SCRIPT SYNOPSIS: run_genewisedb.pl: runs GeneWise using TreeFam HMMs, on an input fasta file of scaffolds. # #------------------------------------------------------------------# -
avrilcoghlan created this gist
Nov 5, 2012 .There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,519 @@ #!/usr/local/bin/perl =head1 NAME run_genewisedb.pl =head1 SYNOPSIS run_genewisedb.pl input_fasta input_hmms output outputdir spliceflat parameterfile where input_fasta is the input fasta file of scaffolds, input_hmms is the input file of HMMs, output is the genewise output file, outputdir is the output directory for writing output files, spliceflat says whether to use the -splice flat option (yes/no), parameterfile is the name of the intron parameter file (none if there is none). =head1 DESCRIPTION This script takes an input fasta file of scaffolds (<input_fasta>) and an input file of HMMs (<input_hmms>) and runs genewise on the scaffolds, using one HMM at a time. The output is then written in the output file <output>. Before running this perl script you need to run: setenv WISECONFIGDIR /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/ =head1 VERSION Perl script last edited 31-Oct-2012. =head1 CONTACT alc@sanger.ac.uk (Avril Coghlan) =cut # # Perl script run_genewisedb.pl # Written by Avril Coghlan (alc@sanger.ac.uk) # 31-Oct-12. # Last edited 31-Oct-2012. # #------------------------------------------------------------------# # CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: use strict; use warnings; my $num_args = $#ARGV + 1; if ($num_args != 6) { print "Usage of run_genewisedb.pl\n\n"; print "perl run_genewisedb.pl <input_fasta> <input_hmms> <output> <outputdir> <spliceflat> <parameterfile>\n"; print "where <input_fasta> is the input fasta file,\n"; print " <input_hmms> is the input file of HMMs,\n"; print " <output> is the genewise output file,\n"; print " <outputdir> is the output directory for writing output files,\n"; print " <spliceflat> says whether to use the -splice flat option (yes/no),\n"; print " <parameterfile> is the name of the intron parameter file (none if there is none)\n"; print "For example, >perl run_genewisedb.pl /lustre/scratch108/parasites/es9/50HGgenebuild/brpah/genome_small.fa\n"; print "/lustre/scratch108/parasites/es9/GeneWise/TreeFam8.hmm genewise_output\n"; print "/nfs/users/nfs_a/alc/Documents/GeneWise50Helminths yes mygenestat.stat\n"; print "NOTE: Before running this perl script, you need to run:\n"; print "setenv WISECONFIGDIR /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/\n"; exit; } # FIND THE PATH TO THE INPUT FASTA FILE: my $input_fasta = $ARGV[0]; # FIND THE INPUT FILE OF HMMs: my $input_hmms = $ARGV[1]; # FIND THE GENEWISE OUTPUT FILE: my $output = $ARGV[2]; # FIND THE DIRECTORY TO USE FOR OUTPUT FILES: my $outputdir = $ARGV[3]; # FIND OUT WHETHER WE WANT TO USE THE -splice flat OPTION IN GENEWISE: my $spliceflat = $ARGV[4]; # FIND THE NAME OF THE INTRON PARAMETER FILE: my $parameterfile = $ARGV[5]; #------------------------------------------------------------------# # TEST SUBROUTINES: my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING. &test_print_error; &test_write_fasta_file($outputdir); #------------------------------------------------------------------# # RUN THE MAIN PART OF THE CODE: &run_main_program($outputdir,$input_fasta,$input_hmms,$output,$spliceflat,$parameterfile); print STDERR "FINISHED.\n"; #------------------------------------------------------------------# # RUN THE MAIN PART OF THE CODE: sub run_main_program { my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN. my $input_fasta = $_[1]; # THE INPUT FASTA FILE my $input_hmms = $_[2]; # THE INPUT FILE OF HMMs my $output = $_[3]; # THE OUTPUT FILE NAME my $spliceflat = $_[4]; # SAYS WHETHER WE WANT TO USE THE -splice flat OPTION IN GENEWISE my $parameterfile = $_[5]; # THE NAME OF THE INTRON PARAMETER FILE my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR. my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR. # CHECK THAT THE WISECONFIGDIR IS SET CORRECTLY: ($errorcode,$errormsg) = &check_wiseconfigdir; if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } # RUN GENEWISE WITH ONE SCAFFOLD/CONTIG AT A TIME, ON ONE HMM AT A TIME: ($errorcode,$errormsg) = &run_genewise_on_scaffolds($outputdir,$input_fasta,$input_hmms,$output,$spliceflat,$parameterfile); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } } #------------------------------------------------------------------# # RUN GENEWISE WITH ONE SCAFFOLD/CONTIG AT A TIME, ON ONE HMM AT A TIME: sub run_genewise_on_scaffolds { my $outputdir = $_[0]; # THE DIRECTORY FOR WRITING OUTPUT FILES my $input_fasta = $_[1]; # THE INPUT FASTA FILE OF SCAFFOLDS my $input_hmms = $_[2]; # THE INPUT FILE OF HMMs my $output = $_[3]; # THE OUTPUT FILE my $spliceflat = $_[4]; # SAYS WHETHER TO USE THE SPLICE FLAT OPTION my $parameterfile = $_[5]; # THE INTRON PARAMETER FILE NAME my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR my $line; # my @temp; # my $name; # NAME OF A SCAFFOLD my $seq = ""; # SEQUENCE OF A SCAFFOLD my $seq_fasta; # FASTA FILE FOR SEQUENCE OF A SCAFFOLD # OPEN THE OUTPUT FILE: $output = $outputdir."/".$output; open(OUTPUT,">$output") || die "ERROR: run_genewise: cannot open $output\n"; close(OUTPUT); # READ IN THE INPUT FASTA FILE AND TAKE ONE SCAFFOLD AT A TIME: open(INPUTFASTA,"$input_fasta") || die "ERROR: run_genewise_on_scaffolds: cannot open $input_fasta\n"; while(<INPUTFASTA>) { $line = $_; chomp $line; if (substr($line,0,1) eq '>') { # WRITE THE SEQUENCE TO A TEMPORARY FASTA FILE: if ($seq ne '') { ($seq_fasta,$errorcode,$errormsg) = &write_fasta_file($seq,$name,$outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } # RUN GENEWISE ON $seq_fasta, ONE HMM AT A TIME: ($errorcode,$errormsg) = &run_genewise($outputdir,$seq_fasta,$input_hmms,$output,$spliceflat,$parameterfile,$name); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } # DELETE TEMPORARY FILE: system "rm -f $seq_fasta"; } @temp = split(/\s+/,$line); $name = $temp[0]; $name = substr($name,1,length($name)-1); } else { $seq = $seq.$line; } } close(INPUTFASTA); # WRITE THE SEQUENCE TO A TEMPORARY FASTA FILE: if ($seq ne '') { ($seq_fasta,$errorcode,$errormsg) = &write_fasta_file($seq,$name,$outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } # RUN GENEWISE ON $seq_fasta: ($errorcode,$errormsg) = &run_genewise($outputdir,$seq_fasta,$input_hmms,$output,$spliceflat,$parameterfile,$name); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } # DELETE TEMPORARY FILE: system "rm -f $seq_fasta"; } return($errorcode,$errormsg); } #------------------------------------------------------------------# # TEST &write_fasta_file sub test_write_fasta_file { my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES TO my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR my $fasta; # FASTA FILE my $expected_fasta; # EXPECTED CONTENTS OF FASTA FILE my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES my $differences; # DIFFERENCES BETWEEN $fasta AND $expected_fasta my $length_differences; # LENGTH OF $differences my $line; # ($fasta,$errorcode,$errormsg) = &write_fasta_file("AGCTAGCT","myseq",$outputdir); if ($errorcode != 0) { print STDERR "ERROR: test_write_fasta_file: failed test1\n"; exit;} $random_number = rand(); $expected_fasta = $outputdir."/tmp".$random_number; open(EXPECTED,">$expected_fasta") || die "ERROR: write_fasta_file: cannot open $expected_fasta\n"; print EXPECTED ">myseq\n"; print EXPECTED "AGCTAGCT\n"; close(EXPECTED); $differences = ""; open(TEMP,"diff $fasta $expected_fasta |"); while(<TEMP>) { $line = $_; $differences = $differences.$line; } close(TEMP); $length_differences = length($differences); if ($length_differences != 0) { print STDERR "ERROR: test_write_fasta_file: failed test1 (files $fasta $expected_fasta)\n"; exit;} system "rm -f $fasta"; system "rm -f $expected_fasta"; } #------------------------------------------------------------------# # WRITE THE SEQUENCE TO A TEMPORARY FASTA FILE: sub write_fasta_file { my $seq = $_[0]; # SEQUENCE my $name = $_[1]; # NAME OF THE SEQUENCE my $outputdir = $_[2]; # DIRECTORY FOR WRITING OUTPUT FILES TO my $fasta; # FASTA FILE my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR $random_number = rand(); $fasta = $outputdir."/tmp".$random_number; open(FASTA,">$fasta") || die "ERROR: write_fasta_file: cannot open $fasta\n"; print FASTA ">$name\n"; print FASTA "$seq\n"; close(FASTA); return($fasta,$errorcode,$errormsg); } #------------------------------------------------------------------# # CHECK THAT THE WISECONFIGDIR IS SET CORRECTLY: sub check_wiseconfigdir { my $wiseconfigdir = "none";# THE WISECONFIGDIR my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR open(TMP,"echo \$WISECONFIGDIR |"); while(<TMP>) { $wiseconfigdir = $_; chomp($wiseconfigdir); } close(TMP); if ($wiseconfigdir ne '/nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/') { $errormsg = "ERROR: check_wiseconfigdir: wiseconfigdir $wiseconfigdir\nBefore running this perl script, you must run:\nsetenv WISECONFIGDIR /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/\n"; $errorcode = 6; # ERRORCODE=6 return($errorcode,$errormsg); } return($errorcode,$errormsg); } #------------------------------------------------------------------# # RUN GENEWISE ON ONE HMM AT A TIME: sub run_genewise { my $outputdir = $_[0]; # DIRECTORY FOR WRITING OUTPUT FILES TO my $input_fasta = $_[1]; # THE INPUT FASTA FILE my $input_hmms = $_[2]; # THE INPUT FILE OF HMMs my $output = $_[3]; # THE OUTPUT FILE NAME my $spliceflat = $_[4]; # SAYS WHETHER WE WANT TO USE THE -splice flat OPTION IN GENEWISE my $parameterfile = $_[5]; # THE NAME OF THE INTRON PARAMETER FILE my $scaffoldname = $_[6]; # THE NAME OF THE SCAFFOLD my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR. my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR. my $line; # my $hmm = ""; # A HMM my $hmmname = "none";# HMM NAME my @temp; # # READ IN ONE HMM AT A TIME FROM THE INPUT FILE OF HMMs: open(HMMS,"$input_hmms") || die "ERROR: run_genewise: cannot open $input_hmms\n"; while(<HMMS>) { $line = $_; $hmm = $hmm.$line; if (substr($line,0,2) eq '//') { # CHECK WE HAVE A NAME FOR THE HMM: if ($hmmname eq 'none') { $errormsg = "ERROR: run_genewise: do not have name for hmm $hmm\n"; $errorcode = 3; # ERRORCODE=3 return($errorcode,$errormsg); } # RUN GENEWISE FOR THIS HMM: ($errorcode,$errormsg) = &run_genewise_on_hmm($hmm,$outputdir,$input_fasta,$output,$hmmname,$spliceflat,$parameterfile,$scaffoldname); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } $hmm = ""; $hmmname = "none"; } if (substr($line,0,4) eq 'NAME') { @temp = split(/\s+/,$line); $hmmname = $temp[1]; chomp($hmmname); } } close(HMMS); if ($hmm ne '') { $errormsg = "ERROR: run_genewise: at end of file hmm should be empty\n"; $errorcode = 1; # ERRORCODE=1 return($errorcode,$errormsg); } return($errorcode,$errormsg); } #------------------------------------------------------------------# # READ IN ONE HMM AT A TIME FROM THE INPUT FILE OF HMMs: sub run_genewise_on_hmm { my $hmm = $_[0]; # THE HMM my $outputdir = $_[1]; # DIRECTORY TO WRITE OUTPUT FILES TO my $input_fasta = $_[2]; # INPUT FASTA FILE my $output = $_[3]; # FILE FOR GENEWISE OUTPUT my $hmmname = $_[4]; # NAME OF THE HMM my $spliceflat = $_[5]; # SAYS WHETHER WE WANT TO USE THE -splice flat OPTION IN GENEWISE my $parameterfile = $_[6]; # THE NAME OF THE INTRON PARAMETER FILE my $scaffoldname = $_[7]; # NAME OF THE SCAFFOLD my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR. my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR. my $random_number; # RANDOM NUMBER FOR TEMPORARY FILE NAME my $hmmfile; # FILE CONTAINING THE HMM my $genewise; # TEMPORARY FILE FOR THE GENEWISE OUTPUT FOR THIS HMM my $cmd; # THE GENEWISE COMMAND TO RUN my $genewise_exe = "/nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/src/bin/genewise"; # wise 2.4.1, can use -genestats # CHECK THE HMM LOOKS OK: if (substr($hmm,0,8) ne 'HMMER2.0' && substr($hmm,length($hmm)-2,2) ne '//') { $errormsg = "ERROR: run_genewise_on_hmm: hmm $hmm\n"; $errorcode = 2; # ERRORCODE=2 return($errorcode,$errormsg); } # CHECK THE SPLICEFLAT OPTION IS 'yes' OR 'no': if ($spliceflat ne 'yes' && $spliceflat ne 'no') { $errormsg = "ERROR: run_genewise_on_hmm: spliceflat is $spliceflat (should be yes or no)\n"; $errorcode = 4; # ERRORCODE=4 return($errorcode,$errormsg); } # WRITE THE HMM TO A FILE: $random_number = rand(); $hmmfile = $outputdir."/tmp".$random_number; open(HMMFILE,">$hmmfile") || die "ERROR: run_genewise_on_hmm: cannot open $hmmfile\n"; print HMMFILE "$hmm"; close(HMMFILE); # WRITE THE FAMILY NAME IN THE OUTPUT FILE: system "echo $hmmname >> $output"; # RUN GENEWISE: $random_number = rand(); $genewise = $outputdir."/tmp".$random_number; print STDERR "Scaffold $scaffoldname: running genewise for HMM $hmmname ...\n"; if ($parameterfile ne 'none' && $spliceflat eq 'no') { $cmd = "$genewise_exe $hmmfile $input_fasta -gff -hmmer -genestats $parameterfile -nosplice_gtag > $genewise"; } elsif ($parameterfile eq 'none' && $spliceflat eq 'yes') { $cmd = "$genewise_exe $hmmfile $input_fasta -gff -hmmer -splice_gtag > $genewise"; } elsif ($parameterfile eq 'none' && $spliceflat eq 'no') { $cmd = "$genewise_exe $hmmfile $input_fasta -gff -hmmer -nosplice_gtag > $genewise"; # USES THE gene.stats FILE THAT COMES WITH GENEWISE, /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/gene.stat } elsif ($parameterfile ne 'none' && $spliceflat eq 'yes') { $errormsg = "ERROR: run_genewise_on_hmm: cannot have spliceflat $spliceflat and parameterfile $parameterfile\n"; $errorcode = 5; # ERRORCODE=5 return($errorcode,$errormsg); } system "$cmd"; # CONCATENATE THIS GENEWISE OUTPUT ONTO THE MAIN OUTPUT FILE OF GENEWISE OUTPUT: sleep(1); system "cat $genewise"; system "cat $genewise >> $output"; # DELETE TEMPORARY FILES: system "rm -f $hmmfile"; system "rm -f $genewise"; return($errorcode,$errormsg); } #------------------------------------------------------------------# # TEST &print_error sub test_print_error { my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR ($errormsg,$errorcode) = &print_error(45,45,1); if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;} ($errormsg,$errorcode) = &print_error('My error message','My error message',1); if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;} ($errormsg,$errorcode) = &print_error('none',45,1); if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;} ($errormsg,$errorcode) = &print_error('My error message', 0, 1); if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;} } #------------------------------------------------------------------# # PRINT OUT AN ERROR MESSAGE AND EXIT. sub print_error { my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED. my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED. my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/) { if ($called_from_test == 1) { $errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11 return($errormsg,$errorcode); } else { print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; exit; } } if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/)) { if ($called_from_test == 1) { $errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12 return($errormsg,$errorcode); } else { print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; exit; } } if ($errormsg eq 'none' || $errorcode == 0) { if ($called_from_test == 1) { $errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13 return($errormsg,$errorcode); } else { print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; exit; } } else { print STDERR "$errormsg"; exit; } return($errormsg,$errorcode); } #------------------------------------------------------------------#