Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Last active November 22, 2017 02:11

Revisions

  1. avrilcoghlan revised this gist Feb 28, 2013. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions run_genewisedb.pl
    Original file line number Diff line number Diff line change
    @@ -37,6 +37,7 @@ =head1 CONTACT
    # Written by Avril Coghlan (alc@sanger.ac.uk)
    # 31-Oct-12.
    # Last edited 31-Oct-2012.
    # SCRIPT SYNOPSIS: run_genewisedb.pl: runs GeneWise using TreeFam HMMs, on an input fasta file of scaffolds.
    #
    #------------------------------------------------------------------#

  2. avrilcoghlan created this gist Nov 5, 2012.
    519 changes: 519 additions & 0 deletions run_genewisedb.pl
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,519 @@
    #!/usr/local/bin/perl

    =head1 NAME
    run_genewisedb.pl
    =head1 SYNOPSIS
    run_genewisedb.pl input_fasta input_hmms output outputdir spliceflat parameterfile
    where input_fasta is the input fasta file of scaffolds,
    input_hmms is the input file of HMMs,
    output is the genewise output file,
    outputdir is the output directory for writing output files,
    spliceflat says whether to use the -splice flat option (yes/no),
    parameterfile is the name of the intron parameter file (none if there is none).
    =head1 DESCRIPTION
    This script takes an input fasta file of scaffolds (<input_fasta>) and an
    input file of HMMs (<input_hmms>) and runs genewise on the scaffolds, using
    one HMM at a time. The output is then written in the output file <output>.
    Before running this perl script you need to run:
    setenv WISECONFIGDIR /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/
    =head1 VERSION
    Perl script last edited 31-Oct-2012.
    =head1 CONTACT
    alc@sanger.ac.uk (Avril Coghlan)
    =cut

    #
    # Perl script run_genewisedb.pl
    # Written by Avril Coghlan (alc@sanger.ac.uk)
    # 31-Oct-12.
    # Last edited 31-Oct-2012.
    #
    #------------------------------------------------------------------#

    # CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

    use strict;
    use warnings;

    my $num_args = $#ARGV + 1;
    if ($num_args != 6)
    {
    print "Usage of run_genewisedb.pl\n\n";
    print "perl run_genewisedb.pl <input_fasta> <input_hmms> <output> <outputdir> <spliceflat> <parameterfile>\n";
    print "where <input_fasta> is the input fasta file,\n";
    print " <input_hmms> is the input file of HMMs,\n";
    print " <output> is the genewise output file,\n";
    print " <outputdir> is the output directory for writing output files,\n";
    print " <spliceflat> says whether to use the -splice flat option (yes/no),\n";
    print " <parameterfile> is the name of the intron parameter file (none if there is none)\n";
    print "For example, >perl run_genewisedb.pl /lustre/scratch108/parasites/es9/50HGgenebuild/brpah/genome_small.fa\n";
    print "/lustre/scratch108/parasites/es9/GeneWise/TreeFam8.hmm genewise_output\n";
    print "/nfs/users/nfs_a/alc/Documents/GeneWise50Helminths yes mygenestat.stat\n";
    print "NOTE: Before running this perl script, you need to run:\n";
    print "setenv WISECONFIGDIR /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/\n";
    exit;
    }

    # FIND THE PATH TO THE INPUT FASTA FILE:

    my $input_fasta = $ARGV[0];

    # FIND THE INPUT FILE OF HMMs:

    my $input_hmms = $ARGV[1];

    # FIND THE GENEWISE OUTPUT FILE:

    my $output = $ARGV[2];

    # FIND THE DIRECTORY TO USE FOR OUTPUT FILES:

    my $outputdir = $ARGV[3];

    # FIND OUT WHETHER WE WANT TO USE THE -splice flat OPTION IN GENEWISE:

    my $spliceflat = $ARGV[4];

    # FIND THE NAME OF THE INTRON PARAMETER FILE:

    my $parameterfile = $ARGV[5];

    #------------------------------------------------------------------#

    # TEST SUBROUTINES:

    my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING.
    &test_print_error;
    &test_write_fasta_file($outputdir);

    #------------------------------------------------------------------#

    # RUN THE MAIN PART OF THE CODE:

    &run_main_program($outputdir,$input_fasta,$input_hmms,$output,$spliceflat,$parameterfile);

    print STDERR "FINISHED.\n";

    #------------------------------------------------------------------#

    # RUN THE MAIN PART OF THE CODE:

    sub run_main_program
    {
    my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN.
    my $input_fasta = $_[1]; # THE INPUT FASTA FILE
    my $input_hmms = $_[2]; # THE INPUT FILE OF HMMs
    my $output = $_[3]; # THE OUTPUT FILE NAME
    my $spliceflat = $_[4]; # SAYS WHETHER WE WANT TO USE THE -splice flat OPTION IN GENEWISE
    my $parameterfile = $_[5]; # THE NAME OF THE INTRON PARAMETER FILE
    my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR.
    my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR.

    # CHECK THAT THE WISECONFIGDIR IS SET CORRECTLY:
    ($errorcode,$errormsg) = &check_wiseconfigdir;
    if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }

    # RUN GENEWISE WITH ONE SCAFFOLD/CONTIG AT A TIME, ON ONE HMM AT A TIME:
    ($errorcode,$errormsg) = &run_genewise_on_scaffolds($outputdir,$input_fasta,$input_hmms,$output,$spliceflat,$parameterfile);
    if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
    }

    #------------------------------------------------------------------#

    # RUN GENEWISE WITH ONE SCAFFOLD/CONTIG AT A TIME, ON ONE HMM AT A TIME:

    sub run_genewise_on_scaffolds
    {
    my $outputdir = $_[0]; # THE DIRECTORY FOR WRITING OUTPUT FILES
    my $input_fasta = $_[1]; # THE INPUT FASTA FILE OF SCAFFOLDS
    my $input_hmms = $_[2]; # THE INPUT FILE OF HMMs
    my $output = $_[3]; # THE OUTPUT FILE
    my $spliceflat = $_[4]; # SAYS WHETHER TO USE THE SPLICE FLAT OPTION
    my $parameterfile = $_[5]; # THE INTRON PARAMETER FILE NAME
    my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
    my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
    my $line; #
    my @temp; #
    my $name; # NAME OF A SCAFFOLD
    my $seq = ""; # SEQUENCE OF A SCAFFOLD
    my $seq_fasta; # FASTA FILE FOR SEQUENCE OF A SCAFFOLD

    # OPEN THE OUTPUT FILE:
    $output = $outputdir."/".$output;
    open(OUTPUT,">$output") || die "ERROR: run_genewise: cannot open $output\n";
    close(OUTPUT);

    # READ IN THE INPUT FASTA FILE AND TAKE ONE SCAFFOLD AT A TIME:
    open(INPUTFASTA,"$input_fasta") || die "ERROR: run_genewise_on_scaffolds: cannot open $input_fasta\n";
    while(<INPUTFASTA>)
    {
    $line = $_;
    chomp $line;
    if (substr($line,0,1) eq '>')
    {
    # WRITE THE SEQUENCE TO A TEMPORARY FASTA FILE:
    if ($seq ne '')
    {
    ($seq_fasta,$errorcode,$errormsg) = &write_fasta_file($seq,$name,$outputdir);
    if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
    # RUN GENEWISE ON $seq_fasta, ONE HMM AT A TIME:
    ($errorcode,$errormsg) = &run_genewise($outputdir,$seq_fasta,$input_hmms,$output,$spliceflat,$parameterfile,$name);
    if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
    # DELETE TEMPORARY FILE:
    system "rm -f $seq_fasta";
    }
    @temp = split(/\s+/,$line);
    $name = $temp[0];
    $name = substr($name,1,length($name)-1);
    }
    else
    {
    $seq = $seq.$line;
    }

    }
    close(INPUTFASTA);
    # WRITE THE SEQUENCE TO A TEMPORARY FASTA FILE:
    if ($seq ne '')
    {
    ($seq_fasta,$errorcode,$errormsg) = &write_fasta_file($seq,$name,$outputdir);
    if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
    # RUN GENEWISE ON $seq_fasta:
    ($errorcode,$errormsg) = &run_genewise($outputdir,$seq_fasta,$input_hmms,$output,$spliceflat,$parameterfile,$name);
    if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
    # DELETE TEMPORARY FILE:
    system "rm -f $seq_fasta";
    }

    return($errorcode,$errormsg);

    }

    #------------------------------------------------------------------#

    # TEST &write_fasta_file

    sub test_write_fasta_file
    {
    my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES TO
    my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR
    my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR
    my $fasta; # FASTA FILE
    my $expected_fasta; # EXPECTED CONTENTS OF FASTA FILE
    my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES
    my $differences; # DIFFERENCES BETWEEN $fasta AND $expected_fasta
    my $length_differences; # LENGTH OF $differences
    my $line; #

    ($fasta,$errorcode,$errormsg) = &write_fasta_file("AGCTAGCT","myseq",$outputdir);
    if ($errorcode != 0) { print STDERR "ERROR: test_write_fasta_file: failed test1\n"; exit;}
    $random_number = rand();
    $expected_fasta = $outputdir."/tmp".$random_number;
    open(EXPECTED,">$expected_fasta") || die "ERROR: write_fasta_file: cannot open $expected_fasta\n";
    print EXPECTED ">myseq\n";
    print EXPECTED "AGCTAGCT\n";
    close(EXPECTED);
    $differences = "";
    open(TEMP,"diff $fasta $expected_fasta |");
    while(<TEMP>)
    {
    $line = $_;
    $differences = $differences.$line;
    }
    close(TEMP);
    $length_differences = length($differences);
    if ($length_differences != 0) { print STDERR "ERROR: test_write_fasta_file: failed test1 (files $fasta $expected_fasta)\n"; exit;}
    system "rm -f $fasta";
    system "rm -f $expected_fasta";
    }

    #------------------------------------------------------------------#

    # WRITE THE SEQUENCE TO A TEMPORARY FASTA FILE:

    sub write_fasta_file
    {
    my $seq = $_[0]; # SEQUENCE
    my $name = $_[1]; # NAME OF THE SEQUENCE
    my $outputdir = $_[2]; # DIRECTORY FOR WRITING OUTPUT FILES TO
    my $fasta; # FASTA FILE
    my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES
    my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
    my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR

    $random_number = rand();
    $fasta = $outputdir."/tmp".$random_number;
    open(FASTA,">$fasta") || die "ERROR: write_fasta_file: cannot open $fasta\n";
    print FASTA ">$name\n";
    print FASTA "$seq\n";
    close(FASTA);

    return($fasta,$errorcode,$errormsg);

    }

    #------------------------------------------------------------------#

    # CHECK THAT THE WISECONFIGDIR IS SET CORRECTLY:

    sub check_wiseconfigdir
    {
    my $wiseconfigdir = "none";# THE WISECONFIGDIR
    my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
    my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR

    open(TMP,"echo \$WISECONFIGDIR |");
    while(<TMP>)
    {
    $wiseconfigdir = $_;
    chomp($wiseconfigdir);
    }
    close(TMP);
    if ($wiseconfigdir ne '/nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/')
    {
    $errormsg = "ERROR: check_wiseconfigdir: wiseconfigdir $wiseconfigdir\nBefore running this perl script, you must run:\nsetenv WISECONFIGDIR /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/\n";
    $errorcode = 6; # ERRORCODE=6
    return($errorcode,$errormsg);
    }

    return($errorcode,$errormsg);
    }

    #------------------------------------------------------------------#

    # RUN GENEWISE ON ONE HMM AT A TIME:

    sub run_genewise
    {
    my $outputdir = $_[0]; # DIRECTORY FOR WRITING OUTPUT FILES TO
    my $input_fasta = $_[1]; # THE INPUT FASTA FILE
    my $input_hmms = $_[2]; # THE INPUT FILE OF HMMs
    my $output = $_[3]; # THE OUTPUT FILE NAME
    my $spliceflat = $_[4]; # SAYS WHETHER WE WANT TO USE THE -splice flat OPTION IN GENEWISE
    my $parameterfile = $_[5]; # THE NAME OF THE INTRON PARAMETER FILE
    my $scaffoldname = $_[6]; # THE NAME OF THE SCAFFOLD
    my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR.
    my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR.
    my $line; #
    my $hmm = ""; # A HMM
    my $hmmname = "none";# HMM NAME
    my @temp; #

    # READ IN ONE HMM AT A TIME FROM THE INPUT FILE OF HMMs:
    open(HMMS,"$input_hmms") || die "ERROR: run_genewise: cannot open $input_hmms\n";
    while(<HMMS>)
    {
    $line = $_;
    $hmm = $hmm.$line;
    if (substr($line,0,2) eq '//')
    {
    # CHECK WE HAVE A NAME FOR THE HMM:
    if ($hmmname eq 'none')
    {
    $errormsg = "ERROR: run_genewise: do not have name for hmm $hmm\n";
    $errorcode = 3; # ERRORCODE=3
    return($errorcode,$errormsg);
    }
    # RUN GENEWISE FOR THIS HMM:
    ($errorcode,$errormsg) = &run_genewise_on_hmm($hmm,$outputdir,$input_fasta,$output,$hmmname,$spliceflat,$parameterfile,$scaffoldname);
    if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
    $hmm = "";
    $hmmname = "none";
    }
    if (substr($line,0,4) eq 'NAME')
    {
    @temp = split(/\s+/,$line);
    $hmmname = $temp[1];
    chomp($hmmname);
    }
    }
    close(HMMS);
    if ($hmm ne '')
    {
    $errormsg = "ERROR: run_genewise: at end of file hmm should be empty\n";
    $errorcode = 1; # ERRORCODE=1
    return($errorcode,$errormsg);
    }

    return($errorcode,$errormsg);
    }

    #------------------------------------------------------------------#

    # READ IN ONE HMM AT A TIME FROM THE INPUT FILE OF HMMs:

    sub run_genewise_on_hmm
    {
    my $hmm = $_[0]; # THE HMM
    my $outputdir = $_[1]; # DIRECTORY TO WRITE OUTPUT FILES TO
    my $input_fasta = $_[2]; # INPUT FASTA FILE
    my $output = $_[3]; # FILE FOR GENEWISE OUTPUT
    my $hmmname = $_[4]; # NAME OF THE HMM
    my $spliceflat = $_[5]; # SAYS WHETHER WE WANT TO USE THE -splice flat OPTION IN GENEWISE
    my $parameterfile = $_[6]; # THE NAME OF THE INTRON PARAMETER FILE
    my $scaffoldname = $_[7]; # NAME OF THE SCAFFOLD
    my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR.
    my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR.
    my $random_number; # RANDOM NUMBER FOR TEMPORARY FILE NAME
    my $hmmfile; # FILE CONTAINING THE HMM
    my $genewise; # TEMPORARY FILE FOR THE GENEWISE OUTPUT FOR THIS HMM
    my $cmd; # THE GENEWISE COMMAND TO RUN
    my $genewise_exe = "/nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/src/bin/genewise"; # wise 2.4.1, can use -genestats

    # CHECK THE HMM LOOKS OK:
    if (substr($hmm,0,8) ne 'HMMER2.0' && substr($hmm,length($hmm)-2,2) ne '//')
    {
    $errormsg = "ERROR: run_genewise_on_hmm: hmm $hmm\n";
    $errorcode = 2; # ERRORCODE=2
    return($errorcode,$errormsg);
    }

    # CHECK THE SPLICEFLAT OPTION IS 'yes' OR 'no':
    if ($spliceflat ne 'yes' && $spliceflat ne 'no')
    {
    $errormsg = "ERROR: run_genewise_on_hmm: spliceflat is $spliceflat (should be yes or no)\n";
    $errorcode = 4; # ERRORCODE=4
    return($errorcode,$errormsg);
    }

    # WRITE THE HMM TO A FILE:
    $random_number = rand();
    $hmmfile = $outputdir."/tmp".$random_number;
    open(HMMFILE,">$hmmfile") || die "ERROR: run_genewise_on_hmm: cannot open $hmmfile\n";
    print HMMFILE "$hmm";
    close(HMMFILE);

    # WRITE THE FAMILY NAME IN THE OUTPUT FILE:
    system "echo $hmmname >> $output";

    # RUN GENEWISE:
    $random_number = rand();
    $genewise = $outputdir."/tmp".$random_number;
    print STDERR "Scaffold $scaffoldname: running genewise for HMM $hmmname ...\n";
    if ($parameterfile ne 'none' && $spliceflat eq 'no')
    {
    $cmd = "$genewise_exe $hmmfile $input_fasta -gff -hmmer -genestats $parameterfile -nosplice_gtag > $genewise";
    }
    elsif ($parameterfile eq 'none' && $spliceflat eq 'yes')
    {
    $cmd = "$genewise_exe $hmmfile $input_fasta -gff -hmmer -splice_gtag > $genewise";
    }
    elsif ($parameterfile eq 'none' && $spliceflat eq 'no')
    {
    $cmd = "$genewise_exe $hmmfile $input_fasta -gff -hmmer -nosplice_gtag > $genewise"; # USES THE gene.stats FILE THAT COMES WITH GENEWISE, /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/gene.stat
    }
    elsif ($parameterfile ne 'none' && $spliceflat eq 'yes')
    {
    $errormsg = "ERROR: run_genewise_on_hmm: cannot have spliceflat $spliceflat and parameterfile $parameterfile\n";
    $errorcode = 5; # ERRORCODE=5
    return($errorcode,$errormsg);
    }
    system "$cmd";

    # CONCATENATE THIS GENEWISE OUTPUT ONTO THE MAIN OUTPUT FILE OF GENEWISE OUTPUT:
    sleep(1);
    system "cat $genewise";
    system "cat $genewise >> $output";

    # DELETE TEMPORARY FILES:
    system "rm -f $hmmfile";
    system "rm -f $genewise";

    return($errorcode,$errormsg);
    }

    #------------------------------------------------------------------#

    # TEST &print_error

    sub test_print_error
    {
    my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR
    my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR

    ($errormsg,$errorcode) = &print_error(45,45,1);
    if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;}

    ($errormsg,$errorcode) = &print_error('My error message','My error message',1);
    if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;}

    ($errormsg,$errorcode) = &print_error('none',45,1);
    if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;}

    ($errormsg,$errorcode) = &print_error('My error message', 0, 1);
    if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;}
    }

    #------------------------------------------------------------------#

    # PRINT OUT AN ERROR MESSAGE AND EXIT.

    sub print_error
    {
    my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED.
    my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED.
    my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT

    if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/)
    {
    if ($called_from_test == 1)
    {
    $errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11
    return($errormsg,$errorcode);
    }
    else
    {
    print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n";
    exit;
    }
    }

    if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/))
    {
    if ($called_from_test == 1)
    {
    $errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12
    return($errormsg,$errorcode);
    }
    else
    {
    print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n";
    exit;
    }
    }

    if ($errormsg eq 'none' || $errorcode == 0)
    {
    if ($called_from_test == 1)
    {
    $errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13
    return($errormsg,$errorcode);
    }
    else
    {
    print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n";
    exit;
    }
    }
    else
    {
    print STDERR "$errormsg";
    exit;
    }

    return($errormsg,$errorcode);
    }

    #------------------------------------------------------------------#