Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Last active October 13, 2017 15:54
Show Gist options
  • Save avrilcoghlan/4016828 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/4016828 to your computer and use it in GitHub Desktop.
Perl script to make an intron parameter file for GeneWise
#!/usr/local/bin/perl
=head1 NAME
make_genewise_paramfile.pl
=head1 SYNOPSIS
make_genewise_paramfile.pl training_exon_gff fasta outputdir exons_names_sameas_genes output
where training_exon_gff is the input gff file of exons in training set genes,
fasta is the fasta file of the genome sequence,
outputdir is the output directory for writing output files,
exons_names_sameas_genes says whether the exons have the same names as genes in the input gff (yes/no),
output is the output file name.
=head1 DESCRIPTION
This script takes an input gff file of exons in training set genes (<training_exon_gff>),
and the genome fasta file (<fasta>), and uses them to make an intron parameter file that
can be used with GeneWise (Wise 2.4.1), using the GeneWise -genestats option.
Note that this perl script just uses training data for the intron splice sites.
It takes the rest of the parameters from the standard GeneWise parameter file gene.stat
that comes with GeneWise.
=head1 VERSION
Perl script last edited 2-Nov-2012.
=head1 CONTACT
alc@sanger.ac.uk (Avril Coghlan)
=cut
#
# Perl script make_genewise_paramfile.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 2-Nov-12.
# Last edited 2-Nov-2012.
# SCRIPT SYNOPSIS: make_genewise_paramfile: given a gff file of training genes for a species, makes an intron parameter file for use in GeneWise for that species.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
use strict;
use warnings;
my $num_args = $#ARGV + 1;
if ($num_args != 5)
{
print "Usage of make_genewise_paramfile.pl\n\n";
print "perl make_genewise_paramfile.pl <training_exon_gff> <fasta> <outputdir> <exons_names_sameas_genes> <output>\n";
print "where <training_exon_gff> is the input gff file of exons in the training set genes,\n";
print " <fasta> is the fasta file of the genome sequence,\n";
print " <outputdir> is the output directory for writing output files,\n";
print " <exons_names_sameas_genes> says whether the exons have the same names as genes in the input gff file,\n";
print " <output> is the output file name\n";
print "For example, >perl make_genewise_paramfile.pl output.cegma.gff S07.Im.SS.Gf.Rr.scaffolds.fa\n";
print "/nfs/users/nfs_a/alc/Documents/GeneWise50Helminths yes mygenestat.stat\n";
exit;
}
# FIND THE INPUT GFF FILE OF EXONS IN THE TRAINING SET GENES:
my $training_exon_gff = $ARGV[0];
# FIND THE FASTA FILE OF THE GENOME SEQUENCE:
my $fasta = $ARGV[1];
# FIND THE DIRECTORY TO USE FOR OUTPUT FILES:
my $outputdir = $ARGV[2];
# FIND OUT IF THE EXON NAMES ARE THE SAME AS GENE NAMES IN THE INPUT GFF FILE:
my $exons_names_sameas_genes = $ARGV[3];
# FIND THE OUTPUT FILE NAME:
my $output = $ARGV[4];
#------------------------------------------------------------------#
# TEST SUBROUTINES:
my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING.
&test_sort_exon_gff($outputdir);
&test_make_intron_gff($outputdir);
&test_remove_duplicate_introns($outputdir);
&test_make_splicesite_gff($outputdir);
&test_reverse_complement;
&test_make_splicesite_fasta($outputdir);
&test_new_intron_name;
&test_make_genewise_paramfile($outputdir);
&test_check_gtag_introns($outputdir);
&test_print_error;
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
&run_main_program($outputdir,$training_exon_gff,$fasta,$exons_names_sameas_genes,$output);
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
sub run_main_program
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN.
my $exon_gff = $_[1]; # THE GFF FILE OF EXONS IN THE TRAINING SET
my $fasta = $_[2]; # THE FASTA FILE OF THE GENOME SEQUENCE.
my $exons_names_sameas_genes = $_[3]; # SAYS WHETHER THE EXONS ARE NAMED THE SAME AS THE GENES IN THE INPUT GFF FILE
my $output = $_[4]; # THE GENEWISE PARAMETER FILE
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR.
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR.
my $sorted_exon_gff; # SORTED VERSION OF $exon_gff
my $intron_gff; # THE INTRON GFF FILE
my $splice_gff_3prime; # GFF FILE FOR 3' SPLICE SITES
my $splice_gff_5prime; # GFF FILE FOR 5' SPLICE SITES
my $splice_fasta_3prime; # FASTA FILE FOR 3' SPLICE SITES
my $splice_fasta_5prime; # FASTA FILE FOR 5' SPLICE SITES
# SORT THE GFF FILE BY CHROMOSOME, THEN BY GENE, THEN BY START IN THE GENE:
($sorted_exon_gff,$errorcode,$errormsg) = &sort_exon_gff($exon_gff,$outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# MAKE A GFF FILE OF THE INTRONS:
($intron_gff,$errorcode,$errormsg) = &make_intron_gff($sorted_exon_gff,$outputdir,$exons_names_sameas_genes);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# REMOVE ANY DUPLICATE INTRONS FROM THE GFF FILE OF INTRONS:
($intron_gff,$errorcode,$errormsg) = &remove_duplicate_introns($intron_gff,$outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# MAKE A GFF FILE WITH COORDINATES OF THE 3' SPLICE SITE OF THE INTRONS (9 bp 5' OF THE A
# OF THE <--AG AND 4 bp 3' OF THE A):
($splice_gff_3prime,$errorcode,$errormsg) = &make_splicesite_gff($intron_gff,$outputdir,'threeprime',9,4);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# MAKE A GFF FILE WITH COORDINATES OF THE 5' SPLICE SITE OF THE INTRONS (4 bp 5' OF THE G OF THE GT-->
# AND 9 bp 3' OF THE G):
($splice_gff_5prime,$errorcode,$errormsg) = &make_splicesite_gff($intron_gff,$outputdir,'fiveprime',4,9);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# MAKE A FASTA FILE WITH THE 3' SPLICE SITE SEQUENCES:
($splice_fasta_3prime,$errorcode,$errormsg) = &make_splicesite_fasta($splice_gff_3prime,$outputdir,$fasta);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# MAKE A FASTA FILE WITH THE 5' SPLICE SITE SEQUENCES:
($splice_fasta_5prime,$errorcode,$errormsg) = &make_splicesite_fasta($splice_gff_5prime,$outputdir,$fasta);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# MAKE A GENEWISE PARAMETER FILE:
($errorcode,$errormsg) = &make_genewise_paramfile($splice_fasta_5prime,$splice_fasta_3prime,$outputdir,$output);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# CHECK THAT THE SPLICE SITES IN THE GENEWISE PARAMETER FILE ARE MOSTLY GT...AG:
($errorcode,$errormsg) = &check_gtag_introns($output);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
}
#------------------------------------------------------------------#
# TEST &check_gtag_introns
sub test_check_gtag_introns
{
my $outputdir = $_[0]; # DIRECTORY FOR WRITING OUTPUT FILES INTO
my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR
my $output; # GENEWISE PARAMETER FILE
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES
$random_number = rand();
$output = $outputdir."/tmp".$random_number;
open(OUTPUT,">$output") || die "ERROR: test_check_gtag_introns: cannot open $output\n";
print OUTPUT "#\n";
print OUTPUT "# new genestats file\n";
print OUTPUT "#\n";
print OUTPUT "splice3 10\n";
# AG
print OUTPUT "HELMINTH0000001 TTCATTCCAGGTAG\n";
print OUTPUT "HELMINTH0000002 TTTAATTCAGGTCA\n";
print OUTPUT "HELMINTH0000003 TCCTTTCCAGTAAC\n";
print OUTPUT "HELMINTH0000004 TAATTTTTAGGTGT\n";
print OUTPUT "HELMINTH0000005 TTTCATGCAGATTG\n";
print OUTPUT "HELMINTH0000006 TTGATTTCAGGAAG\n";
print OUTPUT "//\n";
print OUTPUT "splice5 5\n";
# GT
print OUTPUT "HELMINTH0000001 AAAAGTGCATTTTT\n";
print OUTPUT "HELMINTH0000002 GAAGGTAGGAAAAT\n";
print OUTPUT "HELMINTH0000003 ACATGTAAGTTATT\n";
print OUTPUT "HELMINTH0000004 CATGGTATGTTCAC\n";
print OUTPUT "HELMINTH0000005 TCAGGTACCAAATT\n";
print OUTPUT "HELMINTH0000006 GCGTGTAAGTTCCT\n";
print OUTPUT "//\n";
print OUTPUT "# A G C T N\n";
print OUTPUT "intron_emission\n";
print OUTPUT "0.25 0.25 0.25 0.25 0.25\n";
print OUTPUT "//\n";
print OUTPUT "polyp_emission\n";
print OUTPUT "0.2 0.2 0.3 0.3 0.25\n";
print OUTPUT "//\n";
print OUTPUT "rnd_emission\n";
print OUTPUT "0.25 0.25 0.25 0.25 0.25 1.0\n";
print OUTPUT "//\n";
print OUTPUT "rndcodon\n";
print OUTPUT "AAA 5290.000000\n";
print OUTPUT "AAC 4795.000000\n";
print OUTPUT "AAG 8178.000000\n";
print OUTPUT "AAT 3305.000000\n";
print OUTPUT "ACA 6240.000000\n";
print OUTPUT "ACC 7728.000000\n";
print OUTPUT "ACG 3347.000000\n";
print OUTPUT "ACT 4930.000000\n";
print OUTPUT "AGA 8491.000000\n";
print OUTPUT "AGC 8639.000000\n";
print OUTPUT "AGG 8997.000000\n";
print OUTPUT "AGT 4417.000000\n";
print OUTPUT "ATA 1975.000000\n";
print OUTPUT "ATC 4973.000000\n";
print OUTPUT "ATG 6474.000000\n";
print OUTPUT "ATT 3083.000000\n";
print OUTPUT "CAA 7057.000000\n";
print OUTPUT "CAC 6815.000000\n";
print OUTPUT "CAG 11041.000000\n";
print OUTPUT "CAT 5779.000000\n";
print OUTPUT "CCA 10537.000000\n";
print OUTPUT "CCC 10307.000000\n";
print OUTPUT "CCG 5621.000000\n";
print OUTPUT "CCT 10134.000000\n";
print OUTPUT "CGA 3377.000000\n";
print OUTPUT "CGC 5146.000000\n";
print OUTPUT "CGG 5375.000000\n";
print OUTPUT "CGT 2765.000000\n";
print OUTPUT "CTA 3502.000000\n";
print OUTPUT "CTC 7465.000000\n";
print OUTPUT "CTG 13780.000000\n";
print OUTPUT "CTT 5453.000000\n";
print OUTPUT "GAA 7461.000000\n";
print OUTPUT "GAC 6937.000000\n";
print OUTPUT "GAG 9975.000000\n";
print OUTPUT "GAT 4949.000000\n";
print OUTPUT "GCA 7747.000000\n";
print OUTPUT "GCC 10890.000000\n";
print OUTPUT "GCG 4828.000000\n";
print OUTPUT "GCT 9371.000000\n";
print OUTPUT "GGA 10143.000000\n";
print OUTPUT "GGC 10400.000000\n";
print OUTPUT "GGG 8869.000000\n";
print OUTPUT "GGT 5567.000000\n";
print OUTPUT "GTA 2143.000000\n";
print OUTPUT "GTC 4593.000000\n";
print OUTPUT "GTG 8189.000000\n";
print OUTPUT "GTT 3021.000000\n";
print OUTPUT "TAA 1775.000000\n";
print OUTPUT "TAC 3687.000000\n";
print OUTPUT "TAG 1333.000000\n";
print OUTPUT "TAT 2477.000000\n";
print OUTPUT "TCA 6180.000000\n";
print OUTPUT "TCC 7668.000000\n";
print OUTPUT "TCG 2875.000000\n";
print OUTPUT "TCT 5767.000000\n";
print OUTPUT "TGA 7315.000000\n";
print OUTPUT "TGC 8625.000000\n";
print OUTPUT "TGG 11718.000000\n";
print OUTPUT "TGT 5197.000000\n";
print OUTPUT "TTA 1664.000000\n";
print OUTPUT "TTC 5462.000000\n";
print OUTPUT "TTG 4420.000000\n";
print OUTPUT "TTT 3453.000000\n";
print OUTPUT "//\n\n";
close(OUTPUT);
($errorcode,$errormsg) = &check_gtag_introns($output);
if ($errorcode != 0) { print STDERR "ERROR: test_check_gtag_introns: failed test1\n"; exit;}
system "rm -f $output";
$random_number = rand();
$output = $outputdir."/tmp".$random_number;
open(OUTPUT,">$output") || die "ERROR: test_check_gtag_introns: cannot open $output\n";
print OUTPUT "#\n";
print OUTPUT "# new genestats file\n";
print OUTPUT "#\n";
print OUTPUT "splice3 10\n";
# AG
print OUTPUT "HELMINTH0000001 TTCATTCCACGTAG\n";
print OUTPUT "HELMINTH0000002 TTTAATTCCGGTCA\n";
print OUTPUT "HELMINTH0000003 TCCTTTCCATTAAC\n";
print OUTPUT "HELMINTH0000004 TAATTTTTTGGTGT\n";
print OUTPUT "HELMINTH0000005 TTTCATGCAAATTG\n";
print OUTPUT "HELMINTH0000006 TTGATTTCGGGAAG\n";
print OUTPUT "//\n";
print OUTPUT "splice5 5\n";
# GT
print OUTPUT "HELMINTH0000001 AAAAGTGCATTTTT\n";
print OUTPUT "HELMINTH0000002 GAAGGTAGGAAAAT\n";
print OUTPUT "HELMINTH0000003 ACATGTAAGTTATT\n";
print OUTPUT "HELMINTH0000004 CATGGTATGTTCAC\n";
print OUTPUT "HELMINTH0000005 TCAGGTACCAAATT\n";
print OUTPUT "HELMINTH0000006 GCGTGTAAGTTCCT\n";
print OUTPUT "//\n";
print OUTPUT "# A G C T N\n";
print OUTPUT "intron_emission\n";
print OUTPUT "0.25 0.25 0.25 0.25 0.25\n";
print OUTPUT "//\n";
print OUTPUT "polyp_emission\n";
print OUTPUT "0.2 0.2 0.3 0.3 0.25\n";
print OUTPUT "//\n";
print OUTPUT "rnd_emission\n";
print OUTPUT "0.25 0.25 0.25 0.25 0.25 1.0\n";
print OUTPUT "//\n";
print OUTPUT "rndcodon\n";
print OUTPUT "AAA 5290.000000\n";
print OUTPUT "AAC 4795.000000\n";
print OUTPUT "AAG 8178.000000\n";
print OUTPUT "AAT 3305.000000\n";
print OUTPUT "ACA 6240.000000\n";
print OUTPUT "ACC 7728.000000\n";
print OUTPUT "ACG 3347.000000\n";
print OUTPUT "ACT 4930.000000\n";
print OUTPUT "AGA 8491.000000\n";
print OUTPUT "AGC 8639.000000\n";
print OUTPUT "AGG 8997.000000\n";
print OUTPUT "AGT 4417.000000\n";
print OUTPUT "ATA 1975.000000\n";
print OUTPUT "ATC 4973.000000\n";
print OUTPUT "ATG 6474.000000\n";
print OUTPUT "ATT 3083.000000\n";
print OUTPUT "CAA 7057.000000\n";
print OUTPUT "CAC 6815.000000\n";
print OUTPUT "CAG 11041.000000\n";
print OUTPUT "CAT 5779.000000\n";
print OUTPUT "CCA 10537.000000\n";
print OUTPUT "CCC 10307.000000\n";
print OUTPUT "CCG 5621.000000\n";
print OUTPUT "CCT 10134.000000\n";
print OUTPUT "CGA 3377.000000\n";
print OUTPUT "CGC 5146.000000\n";
print OUTPUT "CGG 5375.000000\n";
print OUTPUT "CGT 2765.000000\n";
print OUTPUT "CTA 3502.000000\n";
print OUTPUT "CTC 7465.000000\n";
print OUTPUT "CTG 13780.000000\n";
print OUTPUT "CTT 5453.000000\n";
print OUTPUT "GAA 7461.000000\n";
print OUTPUT "GAC 6937.000000\n";
print OUTPUT "GAG 9975.000000\n";
print OUTPUT "GAT 4949.000000\n";
print OUTPUT "GCA 7747.000000\n";
print OUTPUT "GCC 10890.000000\n";
print OUTPUT "GCG 4828.000000\n";
print OUTPUT "GCT 9371.000000\n";
print OUTPUT "GGA 10143.000000\n";
print OUTPUT "GGC 10400.000000\n";
print OUTPUT "GGG 8869.000000\n";
print OUTPUT "GGT 5567.000000\n";
print OUTPUT "GTA 2143.000000\n";
print OUTPUT "GTC 4593.000000\n";
print OUTPUT "GTG 8189.000000\n";
print OUTPUT "GTT 3021.000000\n";
print OUTPUT "TAA 1775.000000\n";
print OUTPUT "TAC 3687.000000\n";
print OUTPUT "TAG 1333.000000\n";
print OUTPUT "TAT 2477.000000\n";
print OUTPUT "TCA 6180.000000\n";
print OUTPUT "TCC 7668.000000\n";
print OUTPUT "TCG 2875.000000\n";
print OUTPUT "TCT 5767.000000\n";
print OUTPUT "TGA 7315.000000\n";
print OUTPUT "TGC 8625.000000\n";
print OUTPUT "TGG 11718.000000\n";
print OUTPUT "TGT 5197.000000\n";
print OUTPUT "TTA 1664.000000\n";
print OUTPUT "TTC 5462.000000\n";
print OUTPUT "TTG 4420.000000\n";
print OUTPUT "TTT 3453.000000\n";
print OUTPUT "//\n\n";
close(OUTPUT);
($errorcode,$errormsg) = &check_gtag_introns($output);
if ($errorcode != 18) { print STDERR "ERROR: test_check_gtag_introns: failed test2\n"; exit;}
system "rm -f $output";
$random_number = rand();
$output = $outputdir."/tmp".$random_number;
open(OUTPUT,">$output") || die "ERROR: test_check_gtag_introns: cannot open $output\n";
print OUTPUT "#\n";
print OUTPUT "# new genestats file\n";
print OUTPUT "#\n";
print OUTPUT "splice3 10\n";
# AG
print OUTPUT "HELMINTH0000001 TTCATTCCAGGTAG\n";
print OUTPUT "HELMINTH0000002 TTTAATTCAGGTCA\n";
print OUTPUT "HELMINTH0000003 TCCTTTCCAGTAAC\n";
print OUTPUT "HELMINTH0000004 TAATTTTTAGGTGT\n";
print OUTPUT "HELMINTH0000005 TTTCATGCAGATTG\n";
print OUTPUT "HELMINTH0000006 TTGATTTCAGGAAG\n";
print OUTPUT "//\n";
print OUTPUT "splice5 5\n";
# GT
print OUTPUT "HELMINTH0000001 AAAAATGCATTTTT\n";
print OUTPUT "HELMINTH0000002 GAAGTTAGGAAAAT\n";
print OUTPUT "HELMINTH0000003 ACATGGAAGTTATT\n";
print OUTPUT "HELMINTH0000004 CATGGCATGTTCAC\n";
print OUTPUT "HELMINTH0000005 TCAGCTACCAAATT\n";
print OUTPUT "HELMINTH0000006 GCGTCCAAGTTCCT\n";
print OUTPUT "//\n";
print OUTPUT "# A G C T N\n";
print OUTPUT "intron_emission\n";
print OUTPUT "0.25 0.25 0.25 0.25 0.25\n";
print OUTPUT "//\n";
print OUTPUT "polyp_emission\n";
print OUTPUT "0.2 0.2 0.3 0.3 0.25\n";
print OUTPUT "//\n";
print OUTPUT "rnd_emission\n";
print OUTPUT "0.25 0.25 0.25 0.25 0.25 1.0\n";
print OUTPUT "//\n";
print OUTPUT "rndcodon\n";
print OUTPUT "AAA 5290.000000\n";
print OUTPUT "AAC 4795.000000\n";
print OUTPUT "AAG 8178.000000\n";
print OUTPUT "AAT 3305.000000\n";
print OUTPUT "ACA 6240.000000\n";
print OUTPUT "ACC 7728.000000\n";
print OUTPUT "ACG 3347.000000\n";
print OUTPUT "ACT 4930.000000\n";
print OUTPUT "AGA 8491.000000\n";
print OUTPUT "AGC 8639.000000\n";
print OUTPUT "AGG 8997.000000\n";
print OUTPUT "AGT 4417.000000\n";
print OUTPUT "ATA 1975.000000\n";
print OUTPUT "ATC 4973.000000\n";
print OUTPUT "ATG 6474.000000\n";
print OUTPUT "ATT 3083.000000\n";
print OUTPUT "CAA 7057.000000\n";
print OUTPUT "CAC 6815.000000\n";
print OUTPUT "CAG 11041.000000\n";
print OUTPUT "CAT 5779.000000\n";
print OUTPUT "CCA 10537.000000\n";
print OUTPUT "CCC 10307.000000\n";
print OUTPUT "CCG 5621.000000\n";
print OUTPUT "CCT 10134.000000\n";
print OUTPUT "CGA 3377.000000\n";
print OUTPUT "CGC 5146.000000\n";
print OUTPUT "CGG 5375.000000\n";
print OUTPUT "CGT 2765.000000\n";
print OUTPUT "CTA 3502.000000\n";
print OUTPUT "CTC 7465.000000\n";
print OUTPUT "CTG 13780.000000\n";
print OUTPUT "CTT 5453.000000\n";
print OUTPUT "GAA 7461.000000\n";
print OUTPUT "GAC 6937.000000\n";
print OUTPUT "GAG 9975.000000\n";
print OUTPUT "GAT 4949.000000\n";
print OUTPUT "GCA 7747.000000\n";
print OUTPUT "GCC 10890.000000\n";
print OUTPUT "GCG 4828.000000\n";
print OUTPUT "GCT 9371.000000\n";
print OUTPUT "GGA 10143.000000\n";
print OUTPUT "GGC 10400.000000\n";
print OUTPUT "GGG 8869.000000\n";
print OUTPUT "GGT 5567.000000\n";
print OUTPUT "GTA 2143.000000\n";
print OUTPUT "GTC 4593.000000\n";
print OUTPUT "GTG 8189.000000\n";
print OUTPUT "GTT 3021.000000\n";
print OUTPUT "TAA 1775.000000\n";
print OUTPUT "TAC 3687.000000\n";
print OUTPUT "TAG 1333.000000\n";
print OUTPUT "TAT 2477.000000\n";
print OUTPUT "TCA 6180.000000\n";
print OUTPUT "TCC 7668.000000\n";
print OUTPUT "TCG 2875.000000\n";
print OUTPUT "TCT 5767.000000\n";
print OUTPUT "TGA 7315.000000\n";
print OUTPUT "TGC 8625.000000\n";
print OUTPUT "TGG 11718.000000\n";
print OUTPUT "TGT 5197.000000\n";
print OUTPUT "TTA 1664.000000\n";
print OUTPUT "TTC 5462.000000\n";
print OUTPUT "TTG 4420.000000\n";
print OUTPUT "TTT 3453.000000\n";
print OUTPUT "//\n\n";
close(OUTPUT);
($errorcode,$errormsg) = &check_gtag_introns($output);
if ($errorcode != 18) { print STDERR "ERROR: test_check_gtag_introns: failed test3\n"; exit;}
system "rm -f $output";
return($errorcode,$errormsg);
}
#------------------------------------------------------------------#
# CHECK THAT THE SPLICE SITES IN THE GENEWISE PARAMETER FILE ARE MOSTLY GT...AG:
sub check_gtag_introns
{
my $output = $_[0]; # THE GENEWISE PARAMETER FILE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $line; #
my @temp; #
my $found_threesplice = 0; # SAYS WHETHER WE HAVE REACHED THE 3' SPLICE SITES
my $found_fivesplice = 0; # SAYS WHETHER WE HAVE REACHED THE 5' SPLICE SITES
my $seq; #
my $num_threesplice = 0; # NUMBER OF 3' SPLICE SITE SEQUENCES SEEN
my $num_threesplice_ag = 0; # NUMBER OF 3' SPLICE SITES WHERE THE INTRON ENDS IN 'AG' SEEN
my $num_fivesplice = 0; # NUMBER OF 5' SPLICE SITE SEQUENCES SEEN
my $num_fivesplice_gt = 0; # NUMBER OF 5' SPLICE SITES WHERE THE INTRON STARTS WITH 'GT' SEEN
my $pc_fivesplice_gt; # PERCENTAGE OF 5' SPLICE SITES WHERE THE INTRON STARTS WITH 'GT'
my $pc_threesplice_ag; # PERCENTAGE OF 3' SPLICE SITES WHERE THE INTRON ENDS WITH 'AG'
open(OUTPUT,"$output") || die "ERROR: check_gtag_introns: cannot open $output\n";
while(<OUTPUT>)
{
$line = $_;
chomp $line;
@temp = split(/\s+/,$line);
if ($line =~ /splice3/) { $found_threesplice = 1; }
elsif ($line =~ /splice5/) { $found_threesplice = 0; $found_fivesplice = 1;}
elsif ($line =~ /HELMINTH/)
{
if ($found_threesplice == 1)
{
$seq = $temp[1];
$num_threesplice++;
if (substr($seq,8,2) eq 'AG') { $num_threesplice_ag++;}
}
elsif ($found_fivesplice == 1)
{
$seq = $temp[1];
$num_fivesplice++;
if (substr($seq,4,2) eq 'GT') { $num_fivesplice_gt++;}
}
else
{
$errormsg = "ERROR: check_gtag_introns: found_threesplice $found_threesplice found_fivesplice $found_fivesplice\n";
$errorcode = 17; # ERRORCODE=17
return($errorcode,$errormsg);
}
}
}
close(OUTPUT);
# CHECK THAT >=90% OF THE INTRONS ARE GT...AG INTRONS:
$pc_fivesplice_gt = $num_fivesplice_gt*100/$num_fivesplice;
$pc_threesplice_ag = $num_threesplice_ag*100/$num_threesplice;
if ($pc_threesplice_ag < 90.0 || $pc_fivesplice_gt < 90.0)
{
$errormsg = "ERROR: check_gtag_introns: pc_threesplice_ag $pc_threesplice_ag pc_fivesplice_gt $pc_fivesplice_gt\n";
$errorcode = 18; # ERRORCODE=18
return($errorcode,$errormsg);
}
if ($PRINT_TEST_DATA == 1) { print STDERR "$output: $pc_fivesplice_gt % have GT at 5', $pc_threesplice_ag % have AG at 3' of intron\n";}
return($errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &make_genewise_paramfile
sub test_make_genewise_paramfile
{
my $outputdir = $_[0]; # DIRECTORY FOR WRITING OUTPUT FILES TO
my $fivesplice; # FASTA FILE OF 5' SPLICE SITES
my $threesplice; # FASTA FILE OF 3' SPLICE SITES
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAME
my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR
my $output; # THE OUTPUT FILE NAME
my $expected_output; # FILE CONTAINING THE EXPECTED OUTPUT
my $differences; # DIFFERENES BETWEEN $output AND $expected_output
my $length_differences; # LENGTH OF $differences
my $line; #
$random_number = rand();
$fivesplice = $outputdir."/tmp".$random_number;
open(FIVESPLICE,">$fivesplice") || die "ERROR: test_make_genewise_paramfile: cannot open $fivesplice\n";
print FIVESPLICE ">KOG0142.1.R.1_5splice\n";
print FIVESPLICE "AAAAGTGCATTTTT\n";
print FIVESPLICE ">KOG0142.1.R.2_5splice\n";
print FIVESPLICE "GAAGGTAGGAAAAT\n";
print FIVESPLICE ">KOG0142.1.R.3_5splice\n";
print FIVESPLICE "ACATGTAAGTTATT\n";
print FIVESPLICE ">KOG1626.1.R.1_5splice\n";
print FIVESPLICE "CATGGTATGTTCAC\n";
print FIVESPLICE ">KOG3372.1.R.1_5splice\n";
print FIVESPLICE "TCAGGTACCAAATT\n";
print FIVESPLICE ">KOG3372.1.R.2_5splice\n";
print FIVESPLICE "GCGTGTAAGTTCCT\n";
close(FIVESPLICE);
$random_number = rand();
$threesplice = $outputdir."/tmp".$random_number;
open(THREESPLICE,">$threesplice") || die "ERROR: test_make_genewise_paramfile: cannot open $threesplice\n";
print THREESPLICE ">KOG0142.1.R.1_3splice\n";
print THREESPLICE "TTCATTCCAGGTAG\n";
print THREESPLICE ">KOG0142.1.R.2_3splice\n";
print THREESPLICE "TTTAATTCAGGTCA\n";
print THREESPLICE ">KOG0142.1.R.3_3splice\n";
print THREESPLICE "TCCTTTCCAGTAAC\n";
print THREESPLICE ">KOG1626.1.R.1_3splice\n";
print THREESPLICE "TAATTTTTAGGTGT\n";
print THREESPLICE ">KOG3372.1.R.1_3splice\n";
print THREESPLICE "TTTCATGCAGATTG\n";
print THREESPLICE ">KOG3372.1.R.2_3splice\n";
print THREESPLICE "TTGATTTCAGGAAG\n";
close(THREESPLICE);
$random_number = rand();
$output = "tmp".$random_number;
$random_number = rand();
$expected_output = $outputdir."/tmp".$random_number;
open(EXPECTED,">$expected_output") || die "ERROR: test_make_genewise_paramfile: cannot open $expected_output\n";
print EXPECTED "#\n";
print EXPECTED "# new genestats file\n";
print EXPECTED "#\n";
print EXPECTED "splice3 10\n";
print EXPECTED "HELMINTH0000001 TTCATTCCAGGTAG\n";
print EXPECTED "HELMINTH0000002 TTTAATTCAGGTCA\n";
print EXPECTED "HELMINTH0000003 TCCTTTCCAGTAAC\n";
print EXPECTED "HELMINTH0000004 TAATTTTTAGGTGT\n";
print EXPECTED "HELMINTH0000005 TTTCATGCAGATTG\n";
print EXPECTED "HELMINTH0000006 TTGATTTCAGGAAG\n";
print EXPECTED "//\n";
print EXPECTED "splice5 5\n";
print EXPECTED "HELMINTH0000001 AAAAGTGCATTTTT\n";
print EXPECTED "HELMINTH0000002 GAAGGTAGGAAAAT\n";
print EXPECTED "HELMINTH0000003 ACATGTAAGTTATT\n";
print EXPECTED "HELMINTH0000004 CATGGTATGTTCAC\n";
print EXPECTED "HELMINTH0000005 TCAGGTACCAAATT\n";
print EXPECTED "HELMINTH0000006 GCGTGTAAGTTCCT\n";
print EXPECTED "//\n";
print EXPECTED "# A G C T N\n";
print EXPECTED "intron_emission\n";
print EXPECTED "0.25 0.25 0.25 0.25 0.25\n";
print EXPECTED "//\n";
print EXPECTED "polyp_emission\n";
print EXPECTED "0.2 0.2 0.3 0.3 0.25\n";
print EXPECTED "//\n";
print EXPECTED "rnd_emission\n";
print EXPECTED "0.25 0.25 0.25 0.25 0.25 1.0\n";
print EXPECTED "//\n";
print EXPECTED "rndcodon\n";
print EXPECTED "AAA 5290.000000\n";
print EXPECTED "AAC 4795.000000\n";
print EXPECTED "AAG 8178.000000\n";
print EXPECTED "AAT 3305.000000\n";
print EXPECTED "ACA 6240.000000\n";
print EXPECTED "ACC 7728.000000\n";
print EXPECTED "ACG 3347.000000\n";
print EXPECTED "ACT 4930.000000\n";
print EXPECTED "AGA 8491.000000\n";
print EXPECTED "AGC 8639.000000\n";
print EXPECTED "AGG 8997.000000\n";
print EXPECTED "AGT 4417.000000\n";
print EXPECTED "ATA 1975.000000\n";
print EXPECTED "ATC 4973.000000\n";
print EXPECTED "ATG 6474.000000\n";
print EXPECTED "ATT 3083.000000\n";
print EXPECTED "CAA 7057.000000\n";
print EXPECTED "CAC 6815.000000\n";
print EXPECTED "CAG 11041.000000\n";
print EXPECTED "CAT 5779.000000\n";
print EXPECTED "CCA 10537.000000\n";
print EXPECTED "CCC 10307.000000\n";
print EXPECTED "CCG 5621.000000\n";
print EXPECTED "CCT 10134.000000\n";
print EXPECTED "CGA 3377.000000\n";
print EXPECTED "CGC 5146.000000\n";
print EXPECTED "CGG 5375.000000\n";
print EXPECTED "CGT 2765.000000\n";
print EXPECTED "CTA 3502.000000\n";
print EXPECTED "CTC 7465.000000\n";
print EXPECTED "CTG 13780.000000\n";
print EXPECTED "CTT 5453.000000\n";
print EXPECTED "GAA 7461.000000\n";
print EXPECTED "GAC 6937.000000\n";
print EXPECTED "GAG 9975.000000\n";
print EXPECTED "GAT 4949.000000\n";
print EXPECTED "GCA 7747.000000\n";
print EXPECTED "GCC 10890.000000\n";
print EXPECTED "GCG 4828.000000\n";
print EXPECTED "GCT 9371.000000\n";
print EXPECTED "GGA 10143.000000\n";
print EXPECTED "GGC 10400.000000\n";
print EXPECTED "GGG 8869.000000\n";
print EXPECTED "GGT 5567.000000\n";
print EXPECTED "GTA 2143.000000\n";
print EXPECTED "GTC 4593.000000\n";
print EXPECTED "GTG 8189.000000\n";
print EXPECTED "GTT 3021.000000\n";
print EXPECTED "TAA 1775.000000\n";
print EXPECTED "TAC 3687.000000\n";
print EXPECTED "TAG 1333.000000\n";
print EXPECTED "TAT 2477.000000\n";
print EXPECTED "TCA 6180.000000\n";
print EXPECTED "TCC 7668.000000\n";
print EXPECTED "TCG 2875.000000\n";
print EXPECTED "TCT 5767.000000\n";
print EXPECTED "TGA 7315.000000\n";
print EXPECTED "TGC 8625.000000\n";
print EXPECTED "TGG 11718.000000\n";
print EXPECTED "TGT 5197.000000\n";
print EXPECTED "TTA 1664.000000\n";
print EXPECTED "TTC 5462.000000\n";
print EXPECTED "TTG 4420.000000\n";
print EXPECTED "TTT 3453.000000\n";
print EXPECTED "//\n\n";
close(EXPECTED);
($errorcode,$errormsg) = &make_genewise_paramfile($fivesplice,$threesplice,$outputdir,$output);
if ($errorcode != 0) { print STDERR "ERROR: test_make_genewise_paramfile: failed test1\n"; exit;}
$differences = "";
$output = $outputdir."/".$output;
open(TEMP,"diff $output $expected_output |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_make_genewise_paramfile: failed test1 (files $output $expected_output)\n"; exit;}
system "rm -f $fivesplice";
system "rm -f $threesplice";
system "rm -f $output";
system "rm -f $expected_output";
$random_number = rand();
$fivesplice = $outputdir."/tmp".$random_number;
open(FIVESPLICE,">$fivesplice") || die "ERROR: test_make_genewise_paramfile: cannot open $fivesplice\n";
print FIVESPLICE ">KOG0142.1.R.1_5splice\n";
print FIVESPLICE "AAAAGTGCATTTTT\n";
print FIVESPLICE ">KOG0142.1.R.2_5splice\n";
print FIVESPLICE "none\n";
print FIVESPLICE ">KOG0142.1.R.3_5splice\n";
print FIVESPLICE "ACATGTAAGTTATT\n";
print FIVESPLICE ">KOG1626.1.R.1_5splice\n";
print FIVESPLICE "CATGGTATGTTCAC\n";
print FIVESPLICE ">KOG3372.1.R.1_5splice\n";
print FIVESPLICE "TCAGGTACCAAATT\n";
print FIVESPLICE ">KOG3372.1.R.2_5splice\n";
print FIVESPLICE "GCGTGTAAGTTCCT\n";
close(FIVESPLICE);
$random_number = rand();
$threesplice = $outputdir."/tmp".$random_number;
open(THREESPLICE,">$threesplice") || die "ERROR: test_make_genewise_paramfile: cannot open $threesplice\n";
print THREESPLICE ">KOG0142.1.R.1_3splice\n";
print THREESPLICE "TTCATTCCAGGTAG\n";
print THREESPLICE ">KOG0142.1.R.2_3splice\n";
print THREESPLICE "TTTAATTCAGGTCA\n";
print THREESPLICE ">KOG0142.1.R.3_3splice\n";
print THREESPLICE "TCCTTTCCAGTAAC\n";
print THREESPLICE ">KOG1626.1.R.1_3splice\n";
print THREESPLICE "TAATTTTTAGGTGT\n";
print THREESPLICE ">KOG3372.1.R.1_3splice\n";
print THREESPLICE "TTTCATGCAGATTG\n";
print THREESPLICE ">KOG3372.1.R.2_3splice\n";
print THREESPLICE "TTGATTTCAGGAAG\n";
close(THREESPLICE);
$random_number = rand();
$output = "tmp".$random_number;
($errorcode,$errormsg) = &make_genewise_paramfile($fivesplice,$threesplice,$outputdir,$output);
if ($errorcode != 14) { print STDERR "ERROR: test_make_genewise_paramfile: failed test2\n"; exit;}
$output = $outputdir."/".$output;
system "rm -f $fivesplice";
system "rm -f $threesplice";
system "rm -f $output";
}
#------------------------------------------------------------------#
# MAKE A GENEWISE PARAMETER FILE:
sub make_genewise_paramfile
{
my $fivesplice = $_[0]; # FASTA FILE OF 5' SPLICE SITES
my $threesplice = $_[1]; # FASTA FILE OF 3' SPLICE SITES
my $outputdir = $_[2]; # DIRECTORY FOR WRITING OUTPUT FILES TO
my $output = $_[3]; # THE GENEWISE PARAMETER FILE
my $no_introns; # NUMBER OF SPLICE SITES FOR INTRONS PRINTED OUT
my %NEW_NAME = (); # HASH TABLE WITH NAME FOR A SPLICE SITE
my $name; # NAME FOR A SPLICE SITE
my $splice; # SPLICE SITE SEQUENCE
my $line; #
my @temp; #
my $new_name; # NEW NAME TO USE FOR A SPLICE SITE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
$output = $outputdir."/".$output;
open(OUTPUT,">$output") || die "ERROR: make_genewise_paramfile: cannot open $output\n";
# PRINT OUT THE HEADER FOR THE PARAMETER FILE (COPIED FROM THE
# GENEWISE PARAMETER FILE /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/gene.stat):
print OUTPUT "#\n";
print OUTPUT "# new genestats file\n";
print OUTPUT "#\n";
print OUTPUT "splice3 10\n";
# READ IN THE FILE OF 3' SPLICE SITES:
$no_introns = 0;
$name = "none";
$splice = "none";
open(THREESPLICE,"$threesplice") || die "ERROR: make_genewise_paramfile: cannot open $threesplice.\n";
while(<THREESPLICE>)
{
$line = $_;
chomp $line;
@temp = split(/\s+/,$line);
if (substr($line,0,1) eq '>')
{
if ($name ne 'none' && $splice ne 'none')
{
print OUTPUT "$name $splice\n";
}
$name = $temp[0];
@temp = split(/_/,$name); # eg. KOG0142.1.R.2_5splice
$name = $temp[0];
$name = substr($name,1,length($name)-1);
# FIND A NEW NAME FOR THIS SEQUENCE:
$no_introns++;
($new_name,$errorcode,$errormsg) = &new_intron_name($name,$no_introns);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
$NEW_NAME{$name} = $new_name;
$name = $new_name;
}
else
{
if ($temp[0] eq 'none')
{
$errormsg = "ERROR: make_genewise_paramfile: sequence $temp[0] for name $name\n";
$errorcode = 14; # ERRORCODE=14
return($errorcode,$errormsg);
}
$splice = $temp[0];
}
}
close(THREESPLICE);
if ($name ne 'none' && $splice ne 'none')
{
print OUTPUT "$name $splice\n";
}
# READ IN THE FILE OF 5' SPLICE SITES:
print OUTPUT "//\n";
print OUTPUT "splice5 5\n";
$name = "none";
$splice = "none";
open(FIVESPLICE,"$fivesplice") || die "ERROR: make_genewise_paramfile: cannot open $fivesplice.\n";
while(<FIVESPLICE>)
{
$line = $_;
chomp $line;
@temp = split(/\s+/,$line);
if (substr($line,0,1) eq '>')
{
if ($name ne 'none' && $splice ne 'none')
{
print OUTPUT "$name $splice\n";
}
$name = $temp[0];
@temp = split(/_/,$name);
$name = $temp[0];
$name = substr($name,1,length($name)-1);
if (!($NEW_NAME{$name}))
{
$errormsg = "ERROR: make_genewise_paramfile: do not know new name for $name\n";
$errorcode = 7; # ERRORCODE=7
return($errorcode,$errormsg);
}
$new_name = $NEW_NAME{$name};
$name = $new_name;
}
else
{
if ($temp[0] eq 'none')
{
$errormsg = "ERROR: make_genewise_paramfile: sequence $temp[0] for name $name\n";
$errorcode = 14; # ERRORCODE=14
return($errorcode,$errormsg);
}
$splice = $temp[0];
}
}
close(FIVESPLICE);
if ($name ne 'none' && $splice ne 'none')
{
print OUTPUT "$name $splice\n";
}
# WRITE OUT THE REST OF THE PARAMETERS IN THE PARAMETER FILE, COPIED FROM
# THE GENEWISE PARAMETER FILE /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/gene.stat)
print OUTPUT "//\n";
print OUTPUT "# A G C T N\n";
print OUTPUT "intron_emission\n";
print OUTPUT "0.25 0.25 0.25 0.25 0.25\n";
print OUTPUT "//\n";
print OUTPUT "polyp_emission\n";
print OUTPUT "0.2 0.2 0.3 0.3 0.25\n";
print OUTPUT "//\n";
print OUTPUT "rnd_emission\n";
print OUTPUT "0.25 0.25 0.25 0.25 0.25 1.0\n";
print OUTPUT "//\n";
print OUTPUT "rndcodon\n";
print OUTPUT "AAA 5290.000000\n";
print OUTPUT "AAC 4795.000000\n";
print OUTPUT "AAG 8178.000000\n";
print OUTPUT "AAT 3305.000000\n";
print OUTPUT "ACA 6240.000000\n";
print OUTPUT "ACC 7728.000000\n";
print OUTPUT "ACG 3347.000000\n";
print OUTPUT "ACT 4930.000000\n";
print OUTPUT "AGA 8491.000000\n";
print OUTPUT "AGC 8639.000000\n";
print OUTPUT "AGG 8997.000000\n";
print OUTPUT "AGT 4417.000000\n";
print OUTPUT "ATA 1975.000000\n";
print OUTPUT "ATC 4973.000000\n";
print OUTPUT "ATG 6474.000000\n";
print OUTPUT "ATT 3083.000000\n";
print OUTPUT "CAA 7057.000000\n";
print OUTPUT "CAC 6815.000000\n";
print OUTPUT "CAG 11041.000000\n";
print OUTPUT "CAT 5779.000000\n";
print OUTPUT "CCA 10537.000000\n";
print OUTPUT "CCC 10307.000000\n";
print OUTPUT "CCG 5621.000000\n";
print OUTPUT "CCT 10134.000000\n";
print OUTPUT "CGA 3377.000000\n";
print OUTPUT "CGC 5146.000000\n";
print OUTPUT "CGG 5375.000000\n";
print OUTPUT "CGT 2765.000000\n";
print OUTPUT "CTA 3502.000000\n";
print OUTPUT "CTC 7465.000000\n";
print OUTPUT "CTG 13780.000000\n";
print OUTPUT "CTT 5453.000000\n";
print OUTPUT "GAA 7461.000000\n";
print OUTPUT "GAC 6937.000000\n";
print OUTPUT "GAG 9975.000000\n";
print OUTPUT "GAT 4949.000000\n";
print OUTPUT "GCA 7747.000000\n";
print OUTPUT "GCC 10890.000000\n";
print OUTPUT "GCG 4828.000000\n";
print OUTPUT "GCT 9371.000000\n";
print OUTPUT "GGA 10143.000000\n";
print OUTPUT "GGC 10400.000000\n";
print OUTPUT "GGG 8869.000000\n";
print OUTPUT "GGT 5567.000000\n";
print OUTPUT "GTA 2143.000000\n";
print OUTPUT "GTC 4593.000000\n";
print OUTPUT "GTG 8189.000000\n";
print OUTPUT "GTT 3021.000000\n";
print OUTPUT "TAA 1775.000000\n";
print OUTPUT "TAC 3687.000000\n";
print OUTPUT "TAG 1333.000000\n";
print OUTPUT "TAT 2477.000000\n";
print OUTPUT "TCA 6180.000000\n";
print OUTPUT "TCC 7668.000000\n";
print OUTPUT "TCG 2875.000000\n";
print OUTPUT "TCT 5767.000000\n";
print OUTPUT "TGA 7315.000000\n";
print OUTPUT "TGC 8625.000000\n";
print OUTPUT "TGG 11718.000000\n";
print OUTPUT "TGT 5197.000000\n";
print OUTPUT "TTA 1664.000000\n";
print OUTPUT "TTC 5462.000000\n";
print OUTPUT "TTG 4420.000000\n";
print OUTPUT "TTT 3453.000000\n";
print OUTPUT "//\n\n";
close(OUTPUT);
return($errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &new_intron_name
sub test_new_intron_name
{
my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR
my $new_name; # NEW NAME FOR AN INTRON
($new_name,$errorcode,$errormsg) = &new_intron_name("KOG0363.1.R.6_3splice",3);
if ($errorcode != 0 || $new_name ne 'HELMINTH0000003') { print STDERR "ERROR: test_new_intron_name: failed test1\n"; exit;}
($new_name,$errorcode,$errormsg) = &new_intron_name("KOG0363.1.R.6_3splice",33);
if ($errorcode != 0 || $new_name ne 'HELMINTH0000033') { print STDERR "ERROR: test_new_intron_name: failed test2\n"; exit;}
}
#------------------------------------------------------------------#
# WORK OUT A NEW NAME FOR AN INTRON:
sub new_intron_name
{
my $name = $_[0]; # ORIGINAL NAME OF AN INTRON
my $no_introns = $_[1]; # NUMBER OF INTRONS WE HAVE SEEN ALREADY
my $new_name; # NEW NAME FOR AN INTRON
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none"; # RETURNED AS 'none' IF THERE IS NO ERROR
my $length = length($no_introns); # LENGTH OF THE NUMBER OF INTRONS
my $no_zeroes = 7 - $length; # NUMBER OF ZEROES TO PUT IN $new_name
my $i; #
$new_name = "HELMINTH";
for ($i = 1; $i <= $no_zeroes; $i++) { $new_name = $new_name."0";}
$new_name = $new_name.$no_introns;
return($new_name,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &make_splicesite_fasta
sub test_make_splicesite_fasta
{
my $outputdir = $_[0]; # DIRECTORY FOR WRITING OUTPUT FILES IN
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES
my $expected_splicefasta; # EXPECTED SPLICE SITE FASTA FILE
my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR
my $differences; # DIFFERENCES BETWEEN $new_intron_gff AND $expected_new_intron_gff
my $length_differences; # LENGTH OF $differences
my $line; #
my $splicefasta; # THE SPLICE SITE FASTA FILE
my $splicegff; # THE SPLICE SITE GFF FILE
my $fasta; # THE GENOME FASTA FILE
$random_number = rand();
$splicegff = $outputdir."/tmp".$random_number;
open(SPLICEGFF,">$splicegff") || die "ERROR: test_make_splicesite_fasta: cannot open $splicegff\n";
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t11\t15\t.\t+\t.\tGENE1.F.1_3splice\n";
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t31\t35\t.\t+\t.\tGENE1.F.2_3splice\n";
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t41\t45\t.\t+\t.\tGENE1.F.3_3splice\n";
print SPLICEGFF "scaffold2\tcurated\t3_splice_site\t11\t15\t.\t+\t.\tGENE2.F.1_3splice\n";
close(SPLICEGFF);
$random_number = rand();
$fasta = $outputdir."/tmp".$random_number;
open(FASTA,">$fasta") || die "ERROR: test_make_splicesite_fasta: cannot open $fasta\n";
print FASTA ">scaffold1\n";
print FASTA "ABCDEFGHIJ\n";
print FASTA "KLMNOPQRST\n";
print FASTA "UVWXYZabcd\n";
print FASTA "efghijklmn\n";
print FASTA "opqrstuvwx\n";
print FASTA ">scaffold2\n";
print FASTA "ABCDEFGHIJ\n";
print FASTA "KLMNOPQRST\n";
print FASTA "UVWXYZabcd\n";
print FASTA "efghijklmn\n";
print FASTA "opqrstuvwx\n";
close(FASTA);
$random_number = rand();
$expected_splicefasta = $outputdir."/tmp".$random_number;
open(EXPECTED,">$expected_splicefasta") || die "ERROR: test_make_splicesite_fasta: cannot open $expected_splicefasta\n";
print EXPECTED ">GENE1.F.1_3splice\n";
print EXPECTED "KLMNO\n";
print EXPECTED ">GENE1.F.2_3splice\n";
print EXPECTED "EFGHI\n";
print EXPECTED ">GENE1.F.3_3splice\n";
print EXPECTED "OPQRS\n";
print EXPECTED ">GENE2.F.1_3splice\n";
print EXPECTED "KLMNO\n";
close(EXPECTED);
($splicefasta,$errorcode,$errormsg) = &make_splicesite_fasta($splicegff,$outputdir,$fasta);
if ($errorcode != 0) { print STDERR "ERROR: test_make_splicesite_fasta: failed test1 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
$differences = "";
open(TEMP,"diff $splicefasta $expected_splicefasta |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_make_splicesite_fasta: failed test1 (files $splicefasta $expected_splicefasta)\n"; exit;}
system "rm -f $splicefasta";
system "rm -f $expected_splicefasta";
system "rm -f $splicegff";
system "rm -f $fasta";
$random_number = rand();
$splicegff = $outputdir."/tmp".$random_number;
open(SPLICEGFF,">$splicegff") || die "ERROR: test_make_splicesite_fasta: cannot open $splicegff\n";
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t11\t15\t.\t+\t.\tGENE1.F.1_3splice\n";
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t31\t35\t.\t+\t.\tGENE1.F.2_3splice\n";
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t41\t45\t.\t+\t.\tGENE1.F.3_3splice\n";
print SPLICEGFF "scaffold2\tcurated\t3_splice_site\t11\t25\t.\t+\t.\tGENE2.F.1_3splice\n";
close(SPLICEGFF);
$random_number = rand();
$fasta = $outputdir."/tmp".$random_number;
open(FASTA,">$fasta") || die "ERROR: test_make_splicesite_fasta: cannot open $fasta\n";
print FASTA ">scaffold1\n";
print FASTA "ABCDEFGHIJ\n";
print FASTA "KLMNOPQRST\n";
print FASTA "UVWXYZabcd\n";
print FASTA "efghijklmn\n";
print FASTA "opqrstuvwx\n";
print FASTA ">scaffold2\n";
print FASTA "ABCDEFGHIJ\n";
print FASTA "KLMNOPQRST\n";
print FASTA "UVWXYZabcd\n";
print FASTA "efghijklmn\n";
print FASTA "opqrstuvwx\n";
print FASTA ">scaffold1\n";
print FASTA "ABCDEFGHIJ\n";
print FASTA "KLMNOPQRST\n";
print FASTA "UVWXYZabcd\n";
print FASTA "efghijklmn\n";
close(FASTA);
$random_number = rand();
($splicefasta,$errorcode,$errormsg) = &make_splicesite_fasta($splicegff,$outputdir,$fasta);
if ($errorcode != 7) { print STDERR "ERROR: test_make_splicesite_fasta: failed test2 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
system "rm -f $splicefasta";
system "rm -f $splicegff";
system "rm -f $fasta";
$random_number = rand();
$splicegff = $outputdir."/tmp".$random_number;
open(SPLICEGFF,">$splicegff") || die "ERROR: test_make_splicesite_fasta: cannot open $splicegff\n";
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t11\t15\t.\t+\t.\tGENE1.F.1_3splice\n";
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t31\t35\t.\t+\t.\tGENE1.F.2_3splice\n";
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t41\t45\t.\t+\t.\tGENE1.F.3_3splice\n";
print SPLICEGFF "scaffold2\tcurated\t3_splice_site\t25\t11\t.\t+\t.\tGENE2.F.1_3splice\n";
close(SPLICEGFF);
$random_number = rand();
$fasta = $outputdir."/tmp".$random_number;
open(FASTA,">$fasta") || die "ERROR: test_make_splicesite_fasta: cannot open $fasta\n";
print FASTA ">scaffold1\n";
print FASTA "ABCDEFGHIJ\n";
print FASTA "KLMNOPQRST\n";
print FASTA "UVWXYZabcd\n";
print FASTA "efghijklmn\n";
print FASTA "opqrstuvwx\n";
print FASTA ">scaffold2\n";
print FASTA "ABCDEFGHIJ\n";
print FASTA "KLMNOPQRST\n";
print FASTA "UVWXYZabcd\n";
print FASTA "efghijklmn\n";
print FASTA "opqrstuvwx\n";
close(FASTA);
$random_number = rand();
($splicefasta,$errorcode,$errormsg) = &make_splicesite_fasta($splicegff,$outputdir,$fasta);
if ($errorcode != 8) { print STDERR "ERROR: test_make_splicesite_fasta: failed test3 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
system "rm -f $splicefasta";
system "rm -f $splicegff";
system "rm -f $fasta";
$random_number = rand();
$splicegff = $outputdir."/tmp".$random_number;
open(SPLICEGFF,">$splicegff") || die "ERROR: test_make_splicesite_fasta: cannot open $splicegff\n";
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t11\t15\t.\t+\t.\tGENE1.F.1_3splice\n";
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t31\t35\t.\t+\t.\tGENE1.F.2_3splice\n";
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t41\t45\t.\t+\t.\tGENE1.F.3_3splice\n";
print SPLICEGFF "scaffold3\tcurated\t3_splice_site\t11\t25\t.\t+\t.\tGENE2.F.1_3splice\n";
close(SPLICEGFF);
$random_number = rand();
$fasta = $outputdir."/tmp".$random_number;
open(FASTA,">$fasta") || die "ERROR: test_make_splicesite_fasta: cannot open $fasta\n";
print FASTA ">scaffold1\n";
print FASTA "ABCDEFGHIJ\n";
print FASTA "KLMNOPQRST\n";
print FASTA "UVWXYZabcd\n";
print FASTA "efghijklmn\n";
print FASTA "opqrstuvwx\n";
print FASTA ">scaffold2\n";
print FASTA "ABCDEFGHIJ\n";
print FASTA "KLMNOPQRST\n";
print FASTA "UVWXYZabcd\n";
print FASTA "efghijklmn\n";
print FASTA "opqrstuvwx\n";
close(FASTA);
$random_number = rand();
($splicefasta,$errorcode,$errormsg) = &make_splicesite_fasta($splicegff,$outputdir,$fasta);
if ($errorcode != 9) { print STDERR "ERROR: test_make_splicesite_fasta: failed test4 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
system "rm -f $splicefasta";
system "rm -f $splicegff";
system "rm -f $fasta";
}
#------------------------------------------------------------------#
# MAKE A FASTA FILE WITH THE SPLICE SITE SEQUENCES:
sub make_splicesite_fasta
{
my $splicegff = $_[0]; # THE SPLICE SITE GFF FILE
my $outputdir = $_[1]; # THE DIRECTORY FOR WRITING OUTPUT FILES
my $fasta = $_[2]; # THE GENOME FASTA FILE
my $splicefasta = "none";# THE SPLICE SITE FASTA FILE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $line; #
my @temp; #
my $seq = ""; # SEQUENCE OF A CONTIG/CHROMOSOME
my %SEQ = (); # HASH TABLE FOR STORING SEQUENCES
my $name = "none";# NAME OF A CONTIG/CHROMOSOME
my $random_number; # RANDOM NUMBER FOR TEMPORARY FILE NAME
my $chunk_start; # START OF THE CHUNK OF DNA WE WANT
my $chunk_end; # END OF THE CHUNK OF DNA WE WANT
my $chunk_length; # LENGTH OF THE CHUNK OF DNA WE WANT
my $seq_length; # LENGTH OF THE CONTIG/CHROMOSOME SEQUENCE
my $chunk_seq; # SEQUENCE OF THE CHUNK OF DNA NEAR THE SPLICE SITE
my $chunk_name; # NAME OF THE CHUNK OF DNA NEAR THE SPLICE SITE
my $offset; # A COUNTER USED IN PRINTING OUT THE SEQUENCE $chunk_seq
my $seqline; # A LINE OF THE $chunk_seq
my $strand; # STRAND OF THE INTRON
# READ IN THE GENOME SEQUENCE FILE:
open(FASTA,"$fasta") || die "ERROR: make_splicesite_fasta: cannot open fasta file $fasta.\n";
while(<FASTA>)
{
$line = $_;
chomp $line;
if (substr($line,0,1) eq '>')
{
if ($name ne "none")
{
$seq =~ tr/[a-z]/[A-Z]/;
$seq =~ s/\s+//g;
if ($SEQ{$name})
{
$errormsg = "ERROR: make_splicesite_fasta: already have sequence for $name.\n";
$errorcode = 7;
return($splicefasta,$errorcode,$errormsg);
}
$SEQ{$name} = $seq;
$seq = "";
}
@temp = split(/\s+/,$line);
$name = $temp[0];
$name = substr($name,1,length($name)-1);
$name =~ tr/[a-z]/[A-Z]/;
}
else
{
$seq = $seq.$line;
}
}
close(FASTA);
# STORE THE LAST SEQUENCE:
if ($name ne "none")
{
$seq =~ tr/[a-z]/[A-Z]/;
if ($SEQ{$name})
{
$errormsg = "ERROR: make_splicesite_fasta: already have sequence for $name.\n";
$errorcode = 7;
return($splicefasta,$errorcode,$errormsg);
}
$SEQ{$name} = $seq;
$seq = "";
}
# READ IN THE GFF FILE, AND PRINT THE OUTPUT FILE:
$random_number = rand();
$splicefasta = $outputdir."/tmp".$random_number;
open(SPLICEFASTA,">$splicefasta") || die "ERROR: make_splicesite_fasta: cannot open $splicefasta\n";
# READ IN THE GFF FILE:
open(GFF,$splicegff) || die "ERROR: make_splicesite_fasta: cannot open $splicegff.\n";
while(<GFF>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
$name = $temp[0];
$name =~ tr/[a-z]/[A-Z]/;
$chunk_start = $temp[3];
$chunk_end = $temp[4];
$strand = $temp[6];
$chunk_name = $temp[8];
$chunk_length = $chunk_end - $chunk_start + 1;
if ($chunk_length < 0)
{
$errormsg = "ERROR: make_splicesite_fasta: chunk_length $chunk_length line $line\n";
$errorcode = 8; # ERRORCODE=8
return($splicefasta,$errorcode,$errormsg);
}
if (!($SEQ{$name}))
{
$errormsg = "ERROR: make_splicesite_fasta: do not know sequence for $name.\n";
$errorcode = 9; # ERRORCODE=9
return($splicefasta,$errorcode,$errormsg);
}
$seq = $SEQ{$name};
$seq_length = length($seq);
if ($seq_length == 0)
{
$errormsg = "ERROR: make_splicesite_fasta: seq_length $seq_length for name $name.\n";
$errorcode = 10; # ERRORCODE=10
return($splicefasta,$errorcode,$errormsg);
}
$chunk_seq = substr($seq,$chunk_start-1,$chunk_length);
# FIND THE REVERSE-COMPLEMENT OF THE BASES IF $strand IS "-":
if ($strand eq '-')
{
($chunk_seq,$errorcode,$errormsg) = &reverse_complement($chunk_seq);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
}
# PRINT THE CHUNK $chunk_name IN THE OUTPUT FILE:
print SPLICEFASTA ">$chunk_name\n";
# PRINT OUT THE $chunk_seq IN LINES THAT ARE 60 bp LONG:
$offset = 0;
while ($offset < $chunk_length)
{
$seqline = substr($chunk_seq,$offset,60);
print SPLICEFASTA "$seqline\n";
$offset = $offset + 60;
}
}
close(GFF);
close(SPLICEFASTA);
return($splicefasta,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
sub test_reverse_complement
{
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR
my $seq; # SEQUENCE TO FIND THE REVERSE COMPLEMENT OF
my $revcomp; # THE REVERSE COMPLEMENT SEQUENCE
($revcomp,$errorcode,$errormsg) = &reverse_complement("AACCTTGG");
if ($errorcode != 0 && $revcomp ne 'CCAAGGTT') { print STDERR "ERROR: test_reverse_complement: failed test1\n"; exit;}
($revcomp,$errorcode,$errormsg) = &reverse_complement('');
if ($errorcode != 15) { print STDERR "ERROR: test_reverse_complement: failed test2\n"; exit;}
}
#------------------------------------------------------------------#
# GET THE REVERSE COMPLEMENT OF A SEQUENCE:
# SUBROUTINE SYNOPSIS: reverse_complement(): finds the reverse complement of a sequence
sub reverse_complement
{
my $seq = $_[0]; ## SEQUENCE THAT WE WANT TO FIND THE COMPLEMENT OF
my $complement = ""; ## COMPLEMENT OF SEQUENCE $seq
my $errorcode = 0; ## RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";## RETURNED AS 'none' IF THERE IS NO ERROR
if ($seq eq '')
{
$errormsg = "ERROR: find_complement: seq is $seq\n";
$errorcode = 15; # ERRORCODE=15
return($complement,$errorcode,$errormsg);
}
$seq =~ tr/[a-z]/[A-Z]/; # CHANGE TO UPPERCASE
$complement = $seq;
# SWAP As WITH Ts:
$complement =~ s/A/1/g; # SUBSTITUTE '1' FOR 'A'
$complement =~ s/T/A/g; # SUBSTITUTE 'A' FOR 'T'
$complement =~ s/1/T/g; # SUBSTITUTE 'T' FOR '1'
# SWAP Gs WITH Cs:
$complement =~ s/G/1/g; # SUBSTITUTE '1' FOR 'G'
$complement =~ s/C/G/g; # SUBSITTUTE 'G' FOR 'C'
$complement =~ s/1/C/g; # SUBSTITUTE 'C' FOR '1'
# FIND THE REVERSE COMPLEMENT:
$complement = reverse $complement;
if ($complement eq '')
{
$errormsg = "ERROR: reverse_complement: complement $complement (seq $seq)\n";
$errorcode = 16; # ERRORCODE=16
return($complement,$errorcode,$errormsg);
}
return($complement,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &make_splicesite_gff
sub test_make_splicesite_gff
{
my $outputdir = $_[0]; ## DIRECTORY FOR WRITING OUTPUT FILES IN
my $random_number; ## RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES
my $expected_splicegff; ## EXPECTED SPLICE SITE GFF
my $errorcode; ## RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR
my $errormsg; ## RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR
my $differences; ## DIFFERENCES BETWEEN $new_intron_gff AND $expected_new_intron_gff
my $length_differences; ## LENGTH OF $differences
my $line; ##
my $intron_gff; ## THE INTRON GFF FILE
my $splicegff; ## THE SPLICE SITE GFF FILE
$random_number = rand();
$intron_gff = $outputdir."/tmp".$random_number;
open(INTRONGFF,">$intron_gff") || die "ERROR: test_remove_duplicate_introns: cannot open $intron_gff\n";
print INTRONGFF "scaffold1\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.f.1\n";
print INTRONGFF "scaffold1\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.f.2\n";
print INTRONGFF "scaffold1\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.f.3\n";
print INTRONGFF "scaffold2\tcurated\tintron\t116\t119\t.\t.\t.\tgene1.f.1\n";
print INTRONGFF "scaffold2\tcurated\tintron\t126\t129\t.\t.\t.\tgene1.f.2\n";
print INTRONGFF "scaffold2\tcurated\tintron\t136\t139\t.\t.\t.\tgene1.f.3\n";
print INTRONGFF "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene2.f.1\n";
print INTRONGFF "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene2.f.2\n";
print INTRONGFF "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene2.f.3\n";
close(INTRONGFF);
($splicegff,$errorcode,$errormsg) = &make_splicesite_gff($intron_gff,$outputdir,'threeprime',9,4);
if ($errorcode != 0) { print STDERR "ERROR: test_make_splicesite_gff: failed test1 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
$random_number = rand();
$expected_splicegff = $outputdir."/tmp".$random_number;
open(EXPECTED,">$expected_splicegff") || die "ERROR: test_make_splicesite_gff: cannot open $expected_splicegff\n";
print EXPECTED "scaffold1\tcurated\t3_splice_site\t10\t23\t.\t+\t.\tGENE1.F.1_3splice\n";
print EXPECTED "scaffold1\tcurated\t3_splice_site\t20\t33\t.\t+\t.\tGENE1.F.2_3splice\n";
print EXPECTED "scaffold1\tcurated\t3_splice_site\t30\t43\t.\t+\t.\tGENE1.F.3_3splice\n";
print EXPECTED "scaffold2\tcurated\t3_splice_site\t110\t123\t.\t+\t.\tGENE1.F.1_3splice\n";
print EXPECTED "scaffold2\tcurated\t3_splice_site\t120\t133\t.\t+\t.\tGENE1.F.2_3splice\n";
print EXPECTED "scaffold2\tcurated\t3_splice_site\t130\t143\t.\t+\t.\tGENE1.F.3_3splice\n";
print EXPECTED "scaffold2\tcurated\t3_splice_site\t10\t23\t.\t+\t.\tGENE2.F.1_3splice\n";
print EXPECTED "scaffold2\tcurated\t3_splice_site\t20\t33\t.\t+\t.\tGENE2.F.2_3splice\n";
print EXPECTED "scaffold2\tcurated\t3_splice_site\t30\t43\t.\t+\t.\tGENE2.F.3_3splice\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $splicegff $expected_splicegff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_make_splicesite_gff: failed test1 (files $splicegff $expected_splicegff)\n"; exit;}
system "rm -f $splicegff";
system "rm -f $expected_splicegff";
system "rm -f $intron_gff";
$random_number = rand();
$intron_gff = $outputdir."/tmp".$random_number;
open(INTRONGFF,">$intron_gff") || die "ERROR: test_remove_duplicate_introns: cannot open $intron_gff\n";
print INTRONGFF "scaffold1\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.f.1\n";
print INTRONGFF "scaffold1\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.f.2\n";
print INTRONGFF "scaffold1\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.f.3\n";
print INTRONGFF "scaffold2\tcurated\tintron\t116\t119\t.\t.\t.\tgene1.f.1\n";
print INTRONGFF "scaffold2\tcurated\tintron\t126\t129\t.\t.\t.\tgene1.f.2\n";
print INTRONGFF "scaffold2\tcurated\tintron\t136\t139\t.\t.\t.\tgene1.f.3\n";
print INTRONGFF "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene2.f.1\n";
print INTRONGFF "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene2.f.2\n";
print INTRONGFF "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene2.f.3\n";
close(INTRONGFF);
($splicegff,$errorcode,$errormsg) = &make_splicesite_gff($intron_gff,$outputdir,'prime',9,4);
if ($errorcode != 6) { print STDERR "ERROR: test_make_splicesite_gff: failed test2 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
system "rm -f $intron_gff";
$random_number = rand();
$intron_gff = $outputdir."/tmp".$random_number;
open(INTRONGFF,">$intron_gff") || die "ERROR: test_remove_duplicate_introns: cannot open $intron_gff\n";
print INTRONGFF "scaffold1\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.1\n";
print INTRONGFF "scaffold1\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.2\n";
print INTRONGFF "scaffold1\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.3\n";
print INTRONGFF "scaffold2\tcurated\tintron\t116\t119\t.\t.\t.\tgene1.1\n";
print INTRONGFF "scaffold2\tcurated\tintron\t126\t129\t.\t.\t.\tgene1.2\n";
print INTRONGFF "scaffold2\tcurated\tintron\t136\t139\t.\t.\t.\tgene1.3\n";
print INTRONGFF "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene2.1\n";
print INTRONGFF "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene2.2\n";
print INTRONGFF "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene2.3\n";
close(INTRONGFF);
($splicegff,$errorcode,$errormsg) = &make_splicesite_gff($intron_gff,$outputdir,'threeprime',9,4);
if ($errorcode != 5) { print STDERR "ERROR: test_make_splicesite_gff: failed test3 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
system "rm -f $intron_gff";
$random_number = rand();
$intron_gff = $outputdir."/tmp".$random_number;
open(INTRONGFF,">$intron_gff") || die "ERROR: test_remove_duplicate_introns: cannot open $intron_gff\n";
print INTRONGFF "scaffold1\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.f.1\n";
print INTRONGFF "scaffold1\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.f.2\n";
print INTRONGFF "scaffold1\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.f.3\n";
print INTRONGFF "scaffold2\tcurated\tintron\t116\t119\t.\t.\t.\tgene1.f.1\n";
print INTRONGFF "scaffold2\tcurated\tintron\t126\t129\t.\t.\t.\tgene1.f.2\n";
print INTRONGFF "scaffold2\tcurated\tintron\t136\t139\t.\t.\t.\tgene1.f.3\n";
print INTRONGFF "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene2.f.1\n";
print INTRONGFF "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene2.f.2\n";
print INTRONGFF "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene2.f.3\n";
close(INTRONGFF);
($splicegff,$errorcode,$errormsg) = &make_splicesite_gff($intron_gff,$outputdir,'fiveprime',4,9);
if ($errorcode != 0) { print STDERR "ERROR: test_make_splicesite_gff: failed test4 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
$random_number = rand();
$expected_splicegff = $outputdir."/tmp".$random_number;
open(EXPECTED,">$expected_splicegff") || die "ERROR: test_make_splicesite_gff: cannot open $expected_splicegff\n";
print EXPECTED "scaffold1\tcurated\t5_splice_site\t12\t25\t.\t+\t.\tGENE1.F.1_5splice\n";
print EXPECTED "scaffold1\tcurated\t5_splice_site\t22\t35\t.\t+\t.\tGENE1.F.2_5splice\n";
print EXPECTED "scaffold1\tcurated\t5_splice_site\t32\t45\t.\t+\t.\tGENE1.F.3_5splice\n";
print EXPECTED "scaffold2\tcurated\t5_splice_site\t112\t125\t.\t+\t.\tGENE1.F.1_5splice\n";
print EXPECTED "scaffold2\tcurated\t5_splice_site\t122\t135\t.\t+\t.\tGENE1.F.2_5splice\n";
print EXPECTED "scaffold2\tcurated\t5_splice_site\t132\t145\t.\t+\t.\tGENE1.F.3_5splice\n";
print EXPECTED "scaffold2\tcurated\t5_splice_site\t12\t25\t.\t+\t.\tGENE2.F.1_5splice\n";
print EXPECTED "scaffold2\tcurated\t5_splice_site\t22\t35\t.\t+\t.\tGENE2.F.2_5splice\n";
print EXPECTED "scaffold2\tcurated\t5_splice_site\t32\t45\t.\t+\t.\tGENE2.F.3_5splice\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $splicegff $expected_splicegff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_make_splicesite_gff: failed test4 (files $splicegff $expected_splicegff)\n"; exit;}
system "rm -f $splicegff";
system "rm -f $expected_splicegff";
system "rm -f $intron_gff";
}
#------------------------------------------------------------------#
# MAKE A GFF FILE WITH COORDINATES OF REGIONS AROUND INTRON SPLICE SITES:
sub make_splicesite_gff
{
my $intron_gff = $_[0]; # THE INTRON GFF FILE
my $outputdir = $_[1]; # DIRECTORY FOR WRITING OUTPUT FILES TO
my $spliceend = $_[2]; # WHICH SPLICE SITE TO USE ('threeprime' OR 'fiveprime')
my $fiveprimebp = $_[3]; # BASES TO TAKE 5' OF THE SPLICE SITE (DEFINED BY 3'-MOST BASE OF INTRON FOR 3' SPLICE SITE, OR 5'-MOST FOR 5' SPLICE)
my $threeprimebp = $_[4]; # BASES TO TAKE 3' OF THE SPLICE SITE (DEFINED BY 3'-MOST BASE OF INTRON FOR 3' SPLICE SITE, OR 5'-MOST FOR 5' SPLICE)
my $line; #
my @temp; #
my $chromosome; # CHROMOSOME OF THE INTRON
my $start; # START OF THE INTRON
my $end; # END OF THE INTRON
my $intron_strand; # STRAND OF THE INTRON
my $intron; # NAME OF THE INTRON
my $intron_no; # NUMBER OF THE INTRON IN THE GENE
my $strand; # STRAND OF THE GENE, INFERRED FROM THE GENE NAME
my $gene; # NAME OF THE GENE
my $splicegff = "none";# SPLICE SITE GFF FILE
my $splice_start; # START OF THE SPLICE SITE REGION
my $splice_end; # END OF THE SPLICE SITE REGION
my $splice; # NAME OF THE SPLICE SITE REGION
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAME
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $i; #
# CHECK THAT $spliceend IS 'threeprime' OR 'fiveprime'
if ($spliceend ne 'threeprime' && $spliceend ne 'fiveprime')
{
$errormsg = "ERROR: make_splicesite_gff: spliceend $spliceend (should be threeprime/fiveprime)\n";
$errorcode = 6; # ERRORCODE=6
return($splicegff,$errorcode,$errormsg);
}
# READ THROUGH THE INTRON GFF FILE:
$random_number = rand();
$splicegff = $outputdir."/tmp".$random_number;
open(SPLICEGFF,">$splicegff") || die "ERROR: make_splicesite_gff: cannot open $splicegff\n";
open(INTRON_GFF,"$intron_gff") || die "ERROR: make_splicesite_gff: cannot open $intron_gff.\n";
while(<INTRON_GFF>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
$chromosome = $temp[0];
$start = $temp[3];
$end = $temp[4];
$intron = $temp[8];
@temp = split(/\./,$intron);
$intron_no = $temp[$#temp];
# FIND THE INTRON STRAND FROM THE INTRON NAME:
if ($intron =~ /\.r/ || $intron =~ /\.f/)
{
$strand = $temp[$#temp-1];
$gene = "";
for ($i = 0; $i < $#temp-1; $i++) { $gene = $gene.".".$temp[$i];}
$gene = substr($gene,1,length($gene)-1);
$gene =~ tr/[a-z]/[A-Z]/;
$intron = $gene.".".$strand.".".$intron_no;
$intron =~ tr/[a-z]/[A-Z]/;
# DEFINE THE SPLICE SITE REGION
if ($strand eq 'f')
{
# THE SPLICE SITE IS DEFINED BY 3'-MOST BASE OF INTRON, FOR 3' SPLICE SITE:
if ($spliceend eq 'threeprime')
{
$splice_start= $end - $fiveprimebp;
$splice_end = $end + $threeprimebp;
}
# THE SPLICE SITE IS DEFINED BY 5'-MOST BASE OF INTRON, FOR 5' SPLICE SITE:
elsif ($spliceend eq 'fiveprime')
{
$splice_start = $start - $fiveprimebp;
$splice_end = $start + $threeprimebp;
}
$intron_strand = "+";
}
elsif ($strand eq 'r' || $strand eq 'R')
{
# THE SPLICE SITE IS DEFINED BY 3'-MOST BASE OF INTRON, FOR 3' SPLICE SITE:
if ($spliceend eq 'threeprime')
{
$splice_start = $start - $threeprimebp;
$splice_end = $start + $fiveprimebp;
}
# THE SPLICE SITE IS DEFINED BY 5'-MOST BASE OF INTRON, FOR 5' SPLICE SITE:
elsif ($spliceend eq 'fiveprime')
{
$splice_start = $end - $threeprimebp;
$splice_end = $end + $fiveprimebp;
}
$intron_strand = "-";
}
if ($spliceend eq 'threeprime') { $splice = $intron."_3splice"; }
elsif ($spliceend eq 'fiveprime') { $splice = $intron."_5splice"; }
if ($spliceend eq 'threeprime')
{
print SPLICEGFF "$chromosome\tcurated\t3_splice_site\t$splice_start\t$splice_end\t.\t$intron_strand\t.\t$splice\n";
}
elsif ($spliceend eq 'fiveprime')
{
print SPLICEGFF "$chromosome\tcurated\t5_splice_site\t$splice_start\t$splice_end\t.\t$intron_strand\t.\t$splice\n";
}
}
else
{
$errormsg = "ERROR: make_splice_site_gff: intron name $intron\n";
$errorcode = 5; # ERRORCODE=5
return($splicegff,$errorcode,$errormsg);
}
}
close(INTRON_GFF);
close(SPLICEGFF);
return($splicegff,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &remove_duplicate_introns
sub test_remove_duplicate_introns
{
my $outputdir = $_[0]; ## DIRECTORY FOR WRITING OUTPUT FILES IN
my $random_number; ## RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES
my $expected_new_intron_gff; ## EXPECTED NEW INTRON GFF
my $errorcode; ## RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR
my $errormsg; ## RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR
my $differences; ## DIFFERENCES BETWEEN $new_intron_gff AND $expected_new_intron_gff
my $length_differences; ## LENGTH OF $differences
my $line; ##
my $intron_gff; ## THE INTRON GFF FILE
my $new_intron_gff; ## THE NEW INTRON GFF FILE
$random_number = rand();
$intron_gff = $outputdir."/tmp".$random_number;
open(INTRONGFF,">$intron_gff") || die "ERROR: test_remove_duplicate_introns: cannot open $intron_gff\n";
print INTRONGFF "scaffold1\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.f.1\n";
print INTRONGFF "scaffold1\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.f.2\n";
print INTRONGFF "scaffold1\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.f.3\n";
print INTRONGFF "scaffold2\tcurated\tintron\t116\t119\t.\t.\t.\tgene1.f.1\n";
print INTRONGFF "scaffold2\tcurated\tintron\t126\t129\t.\t.\t.\tgene1.f.2\n";
print INTRONGFF "scaffold2\tcurated\tintron\t136\t139\t.\t.\t.\tgene1.f.3\n";
print INTRONGFF "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene2.f.1\n";
print INTRONGFF "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene2.f.2\n";
print INTRONGFF "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene2.f.3\n";
print INTRONGFF "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene3.f.1\n";
print INTRONGFF "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene3.f.2\n";
print INTRONGFF "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene3.f.3\n";
close(INTRONGFF);
($new_intron_gff,$errorcode,$errormsg) = &remove_duplicate_introns($intron_gff,$outputdir);
if ($errorcode != 0) { print STDERR "ERROR: test_remove_duplicate_introns: failed test1 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
$random_number = rand();
$expected_new_intron_gff = $outputdir."/tmp".$random_number;
open(EXPECTED,">$expected_new_intron_gff") || die "ERROR: test_remove_duplicate_introns: cannot open $expected_new_intron_gff\n";
print EXPECTED "scaffold1\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.f.1\n";
print EXPECTED "scaffold1\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.f.2\n";
print EXPECTED "scaffold1\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.f.3\n";
print EXPECTED "scaffold2\tcurated\tintron\t116\t119\t.\t.\t.\tgene1.f.1\n";
print EXPECTED "scaffold2\tcurated\tintron\t126\t129\t.\t.\t.\tgene1.f.2\n";
print EXPECTED "scaffold2\tcurated\tintron\t136\t139\t.\t.\t.\tgene1.f.3\n";
print EXPECTED "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene2.f.1\n";
print EXPECTED "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene2.f.2\n";
print EXPECTED "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene2.f.3\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $new_intron_gff $expected_new_intron_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_remove_duplicate_introns: failed test1 (files $new_intron_gff $expected_new_intron_gff)\n"; exit;}
system "rm -f $new_intron_gff";
system "rm -f $expected_new_intron_gff";
system "rm -f $intron_gff";
$random_number = rand();
$intron_gff = $outputdir."/tmp".$random_number;
open(INTRONGFF,">$intron_gff") || die "ERROR: test_remove_duplicate_introns: cannot open $intron_gff\n";
print INTRONGFF "scaffold1\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.f.1\n";
print INTRONGFF "scaffold1\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.f.2\n";
print INTRONGFF "\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.f.3\n";
print INTRONGFF "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.f.1\n";
print INTRONGFF "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.f.2\n";
print INTRONGFF "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.f.3\n";
print INTRONGFF "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene2.f.1\n";
print INTRONGFF "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene2.f.2\n";
print INTRONGFF "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene2.f.3\n";
print INTRONGFF "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene3.f.1\n";
print INTRONGFF "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene3.f.2\n";
print INTRONGFF "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene3.f.3\n";
close(INTRONGFF);
($new_intron_gff,$errorcode,$errormsg) = &remove_duplicate_introns($intron_gff,$outputdir);
if ($errorcode != 4) { print STDERR "ERROR: test_remove_duplicate_introns: failed test2 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
system "rm -f $new_intron_gff";
system "rm -f $intron_gff";
}
#------------------------------------------------------------------#
# REMOVE ANY DUPLICATE INTRONS (WITH THE SAME COORDINATES) FROM THE GFF FILE OF INTRONS:
# DUPLICATE INTRONS CAN ARISE IF THE TRAINING DATA INCLUDES SEVERAL SPLICE-FORMS OF THE
# SAME GENE, THAT SHARE INTRONS.
sub remove_duplicate_introns
{
my $intron_gff = $_[0]; # GFF FILE OF INTRONS
my $outputdir = $_[1]; # DIRECTORY FOR WRITING OUTPUT FILES TO
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $new_intron_gff = "none";# A NEW INTRON GFF FILE, WITH DUPLICATE INTRONS REMOVED
my %SEEN = (); # A HASH TABLE TO RECORD WHETHER WE HAVE SEEN AN INTRON BEFORE
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES
my $line; #
my @temp; #
my $chrom; # CHROMOSOME OF AN INTRON
my $start; # START OF AN INTRON
my $end; # END OF AN INTRON
my $strand; # STRAND OF AN INTRON
my $coords; # COORDINATES OF AN INTRON
$random_number = rand();
$new_intron_gff = $outputdir."/tmp".$random_number;
open(NEWINTRONGFF,">$new_intron_gff") || die "ERROR: remove_duplicate_introns: cannot open $new_intron_gff\n";
open(INTRONGFF,"$intron_gff") || die "ERROR: remove_duplicate_introns: cannot open $intron_gff.\n";
while(<INTRONGFF>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
$chrom = $temp[0];
$start = $temp[3];
$end = $temp[4];
$strand = $temp[6];
if ($chrom eq '' || $start eq '' || $end eq '' || $strand eq '')
{
$errormsg = "ERROR: remove_duplicate_introns: chrom $chrom start $start end $end strand $strand line $line\n";
$errorcode = 4; # ERRORCODE=4
return($new_intron_gff,$errorcode,$errormsg);
}
$coords = $chrom."=".$start."=".$end."=".$strand;
if (!($SEEN{$coords}))
{
print NEWINTRONGFF "$line\n";
$SEEN{$coords} = 1;
}
}
close(INTRONGFF);
close(NEWINTRONGFF);
system "rm -f $intron_gff";
return($new_intron_gff,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &make_intron_gff
sub test_make_intron_gff
{
my $outputdir = $_[0]; ## DIRECTORY FOR WRITING OUTPUT FILES IN
my $random_number; ## RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES
my $exon_gff; ## SORTED GFF FILE OF EXONS IN THE TRAINING SET
my $expected_intron_gff; ## EXPECTED INTRON GFF
my $errorcode; ## RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR
my $errormsg; ## RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR
my $differences; ## DIFFERENCES BETWEEN $intron_gff AND $expected_intron_gff
my $length_differences; ## LENGTH OF $differences
my $line; ##
my $intron_gff; ## THE INTRON GFF FILE
# TEST CASE WHEN exons_names_sameas_genes = 'yes':
$random_number = rand();
$exon_gff = $outputdir."/tmp".$random_number;
open(EXONGFF,">$exon_gff") || die "ERROR: test_sort_exon_gff: cannot open $exon_gff\n";
print EXONGFF "scaffold1\tcegma\tFirst\t10\t15\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold1\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold1\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold1\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold2\tcegma\tFirst\t10\t15\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold2\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold2\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold2\tcegma\tFirst\t10\t15\t5.5\t+\t0\tgene2\n";
print EXONGFF "scaffold2\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene2\n";
print EXONGFF "scaffold2\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene2\n";
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene2\n";
close(EXONGFF);
($intron_gff,$errorcode,$errormsg) = &make_intron_gff($exon_gff,$outputdir,"yes");
if ($errorcode != 0) { print STDERR "ERROR: test_make_intron_gff: failed test1 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
$random_number = rand();
$expected_intron_gff = $outputdir."/tmp".$random_number;
open(EXPECTED,">$expected_intron_gff") || die "ERROR: test_make_intron_gff: cannot open $expected_intron_gff\n";
print EXPECTED "scaffold1\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.f.1\n";
print EXPECTED "scaffold1\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.f.2\n";
print EXPECTED "scaffold1\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.f.3\n";
print EXPECTED "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.f.1\n";
print EXPECTED "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.f.2\n";
print EXPECTED "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.f.3\n";
print EXPECTED "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene2.f.1\n";
print EXPECTED "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene2.f.2\n";
print EXPECTED "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene2.f.3\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $intron_gff $expected_intron_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_make_intron_gff: failed test1 (files $intron_gff $expected_intron_gff)\n"; exit;}
system "rm -f $intron_gff";
system "rm -f $expected_intron_gff";
system "rm -f $exon_gff";
# TEST CASE WHEN exons_names_sameas_genes = 'no':
$random_number = rand();
$exon_gff = $outputdir."/tmp".$random_number;
open(EXONGFF,">$exon_gff") || die "ERROR: test_sort_exon_gff: cannot open $exon_gff\n";
print EXONGFF "scaffold1\tcegma\tFirst\t10\t15\t5.5\t+\t0\tgene1.1\n";
print EXONGFF "scaffold1\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene1.2\n";
print EXONGFF "scaffold1\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene1.3\n";
print EXONGFF "scaffold1\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1.4\n";
print EXONGFF "scaffold2\tcegma\tFirst\t10\t15\t5.5\t+\t0\tgene1.1\n";
print EXONGFF "scaffold2\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene1.2\n";
print EXONGFF "scaffold2\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene1.3\n";
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1.4\n";
print EXONGFF "scaffold2\tcegma\tFirst\t10\t15\t5.5\t+\t0\tgene2.1\n";
print EXONGFF "scaffold2\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene2.2\n";
print EXONGFF "scaffold2\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene2.3\n";
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene2.4\n";
close(EXONGFF);
($intron_gff,$errorcode,$errormsg) = &make_intron_gff($exon_gff,$outputdir,"no");
if ($errorcode != 0) { print STDERR "ERROR: test_make_intron_gff: failed test2\n"; exit;}
$random_number = rand();
$expected_intron_gff = $outputdir."/tmp".$random_number;
open(EXPECTED,">$expected_intron_gff") || die "ERROR: test_make_intron_gff: cannot open $expected_intron_gff\n";
print EXPECTED "scaffold1\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.f.1\n";
print EXPECTED "scaffold1\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.f.2\n";
print EXPECTED "scaffold1\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.f.3\n";
print EXPECTED "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.f.1\n";
print EXPECTED "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.f.2\n";
print EXPECTED "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.f.3\n";
print EXPECTED "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene2.f.1\n";
print EXPECTED "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene2.f.2\n";
print EXPECTED "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene2.f.3\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $intron_gff $expected_intron_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_make_intron_gff: failed test2 (files $intron_gff $expected_intron_gff)\n"; exit;}
system "rm -f $intron_gff";
system "rm -f $expected_intron_gff";
system "rm -f $exon_gff";
# TEST CASE WHEN exons_names_sameas_genes IS NOT 'yes' OR 'no':
$random_number = rand();
$exon_gff = $outputdir."/tmp".$random_number;
open(EXONGFF,">$exon_gff") || die "ERROR: test_sort_exon_gff: cannot open $exon_gff\n";
print EXONGFF "scaffold1\tcegma\tFirst\t10\t15\t5.5\t+\t0\tgene1.1\n";
print EXONGFF "scaffold1\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene1.2\n";
print EXONGFF "scaffold1\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene1.3\n";
print EXONGFF "scaffold1\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1.4\n";
print EXONGFF "scaffold2\tcegma\tFirst\t10\t15\t5.5\t+\t0\tgene1.1\n";
print EXONGFF "scaffold2\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene1.2\n";
print EXONGFF "scaffold2\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene1.3\n";
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1.4\n";
print EXONGFF "scaffold2\tcegma\tFirst\t10\t15\t5.5\t+\t0\tgene2.1\n";
print EXONGFF "scaffold2\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene2.2\n";
print EXONGFF "scaffold2\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene2.3\n";
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene2.4\n";
close(EXONGFF);
($intron_gff,$errorcode,$errormsg) = &make_intron_gff($exon_gff,$outputdir,"hello");
if ($errorcode != 2) { print STDERR "ERROR: test_make_intron_gff: failed test3\n"; exit;}
system "rm -f $exon_gff";
# TEST CASE WHEN INTRON LENGTHS ARE 0:
$random_number = rand();
$exon_gff = $outputdir."/tmp".$random_number;
open(EXONGFF,">$exon_gff") || die "ERROR: test_sort_exon_gff: cannot open $exon_gff\n";
print EXONGFF "scaffold1\tcegma\tFirst\t10\t20\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold1\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold1\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold1\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold2\tcegma\tFirst\t10\t15\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold2\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold2\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold2\tcegma\tFirst\t10\t15\t5.5\t+\t0\tgene2\n";
print EXONGFF "scaffold2\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene2\n";
print EXONGFF "scaffold2\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene2\n";
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene2\n";
close(EXONGFF);
($intron_gff,$errorcode,$errormsg) = &make_intron_gff($exon_gff,$outputdir,"yes");
if ($errorcode != 3) { print STDERR "ERROR: test_make_intron_gff: failed test4\n"; exit;}
system "rm -f $intron_gff";
system "rm -f $exon_gff";
}
#------------------------------------------------------------------#
# MAKE A GFF FILE OF THE INTRONS:
sub make_intron_gff
{
my $sorted_exon_gff = $_[0]; # SORTED GFF FILE OF EXONS
my $outputdir = $_[1]; # DIRECTORY FOR WRITING OUTPUT FILES IN
my $exons_names_sameas_genes = $_[2]; # SAYS WHETHER THE EXONS ARE NAMED THE SAME AS THE GENES IN THE INPUT GFF FILE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none"; # RETURNED AS 'none' IF THERE IS NO ERROR
my $intron_gff = "none"; # THE INTRON GFF FILE
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES
my $line; #
my @temp; #
my $exon; # EXON NAME
my $gene; # GENE NAME
my @temp2; #
my $i; #
my $prev_gene = "none"; # PREVIOUS GENE NAME
my $chrom; # NAME OF THE CHROMOSOME
my $prev_chrom = "none"; # NAME OF THE PREVIOUS CHROMOSOME
my $intron_no = 0; # NUMBER OF AN INTRON ALONG A GENE
my $exon_start; # START COORDINATE OF EXON
my $exon_end; # END COORDINATE OF EXON
my $strand; # STRAND OF EXON
my $prev_exon_end; # END COORDINATE OF THE LAST EXON
my $intron_start; # START COORDINATE OF THE INTRON
my $intron_end; # END COORDINATE OF THE INTRON
my $intron; # NAME FOR AN INTRON
# CHECK $exons_names_sameas_genes IS 'yes' OR 'no':
if ($exons_names_sameas_genes ne 'yes' && $exons_names_sameas_genes ne 'no')
{
$errormsg = "ERROR: make_intron_gff: exons_names_sameas_genes is $exons_names_sameas_genes (should be yes/no)\n";
$errorcode = 2; # ERRORCODE=2
return($intron_gff,$errorcode,$errormsg);
}
# OPEN AN OUTPUT INTRON GFF FILE:
$random_number = rand();
$intron_gff = $outputdir."/tmp".$random_number;
open(INTRONGFF,">$intron_gff") || die "ERROR: make_intron_gff: cannot open $intron_gff\n";
# READ THROUGH THE SORTED EXON GFF:
open(EXONGFF,"$sorted_exon_gff") || die "ERROR: make_intron_gff: cannot open $sorted_exon_gff\n";
while(<EXONGFF>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
$chrom = $temp[0];
$exon = $temp[8]; # eg. KOG0018.10
$exon_start = $temp[3];
$exon_end = $temp[4];
$strand = $temp[6];
if ($strand eq '+') { $strand = "f";} elsif ($strand eq '-') { $strand = "r";}
# GET THE NAME OF THE GENE:
if ($exon =~ /\"/) # SOME GFF FILES HAVE THE EXON NAME IN "" IN THE LAST COLUMN
{
@temp2 = split(/\"/,$exon);
$exon = $temp2[1];
}
if ($exon =~ /Parent=CDS:/) # WORMBASES USES THIS FORMAT IN SOME GFF FILES
{
@temp2 = split(/Parent=CDS:/,$exon);
$exon = $temp2[1];
}
# IF THE EXONS ARE JUST NAMED THE SAME AS THE GENES:
if ($exons_names_sameas_genes eq 'yes')
{
$gene = $exon;
}
else # WE ASSUME THAT THE EXON NAMES ARE MADE BY ADDING .1,.2,.3, etc. TO THE GENE NAME, AS IN C. ELEGANS:
{
$gene = "";
@temp2 = split(/\./,$exon);
for ($i = 0; $i <= $#temp2-1; $i++) { $gene = $gene.".".$temp2[$i];}
if ($gene eq '') { $gene = $exon; }
else { $gene = substr($gene,1,length($gene)-1);}
}
# IF THE GENE NAME IS THE SAME AS THE NAME OF THE PREVIOUS GENE ON THIS CHROMOSOME, THE REGION BETWEEN IS ASSUMED TO BE AN INTRON:
if ($gene eq $prev_gene && $chrom eq $prev_chrom)
{
$intron_no++;
$intron_start = $prev_exon_end + 1;
$intron_end = $exon_start - 1;
if ($intron_start > $intron_end)
{
# THIS COULD HAPPEN BECAUSE OF AN ERROR IN THE GFF FILE WHERE THERE
# IS AN INTRON OF LENGTH 0:
$errormsg = "ERROR: make_intron_gff: intron_start $intron_start before end $intron_end for gene $gene on chrom $chrom.\n";
$errorcode = 3; # ERRORCODE=3
return($intron_gff,$errorcode,$errormsg);
}
else
{
$intron = $gene.".".$strand.".".$intron_no;
print INTRONGFF "$chrom curated intron $intron_start $intron_end . . . $intron\n";
}
}
else
{
$intron_no = 0;
}
$prev_gene = $gene;
$prev_chrom = $chrom;
$prev_exon_end = $exon_end;
}
close(EXONGFF);
# CLOSE THE INTRON GFF FILE:
close(INTRONGFF);
# DELETE THE SORTED EXON GFF, IT IS NO LONGER NEEDED:
system "rm -f $sorted_exon_gff";
return($intron_gff,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &sort_exon_gff
sub test_sort_exon_gff
{
my $outputdir = $_[0]; # DIRECTORY FOR WRITING OUTPUT FILES IN
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES
my $exon_gff; # GFF FILE OF EXONS IN THE TRAINING SET
my $expected_sorted_gff; # EXPECTED SORTED EXON GFF
my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR
my $sorted_exon_gff; # SORTED EXON GFF FILE
my $differences; # DIFFERENCES BETWEEN $sorted_exon_gff AND $expected_sorted_gff
my $length_differences; # LENGTH OF $differences
my $line; #
$random_number = rand();
$exon_gff = $outputdir."/tmp".$random_number;
open(EXONGFF,">$exon_gff") || die "ERROR: test_sort_exon_gff: cannot open $exon_gff\n";
print EXONGFF "scaffold2\tcegma\tInternal\t30\t40\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold1\tcegma\tFirst\t10\t20\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold2\tcegma\tInternal\t20\t30\t5.5\t+\t0\tgene2\n";
print EXONGFF "scaffold1\tcegma\tInternal\t20\t30\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold2\tcegma\tFirst\t10\t20\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold1\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold2\tcegma\tInternal\t20\t30\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold2\tcegma\tInternal\t30\t40\t5.5\t+\t0\tgene2\n";
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold1\tcegma\tInternal\t30\t40\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold2\tcegma\tFirst\t10\t20\t5.5\t+\t0\tgene2\n";
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene2\n";
close(EXONGFF);
($sorted_exon_gff,$errorcode,$errormsg) = &sort_exon_gff($exon_gff,$outputdir);
if ($errorcode != 0) { print STDERR "ERROR: test_sort_exon_gff: failed test1 (errorcode $errorcode)\n"; exit;}
$random_number = rand();
$expected_sorted_gff = $outputdir."/tmp".$random_number;
open(EXPECTED,">$expected_sorted_gff") || die "ERROR: test_sort_exon_gff: cannot open $expected_sorted_gff\n";
print EXPECTED "scaffold1\tcegma\tFirst\t10\t20\t5.5\t+\t0\tgene1\n";
print EXPECTED "scaffold1\tcegma\tInternal\t20\t30\t5.5\t+\t0\tgene1\n";
print EXPECTED "scaffold1\tcegma\tInternal\t30\t40\t5.5\t+\t0\tgene1\n";
print EXPECTED "scaffold1\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1\n";
print EXPECTED "scaffold2\tcegma\tFirst\t10\t20\t5.5\t+\t0\tgene1\n";
print EXPECTED "scaffold2\tcegma\tInternal\t20\t30\t5.5\t+\t0\tgene1\n";
print EXPECTED "scaffold2\tcegma\tInternal\t30\t40\t5.5\t+\t0\tgene1\n";
print EXPECTED "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1\n";
print EXPECTED "scaffold2\tcegma\tFirst\t10\t20\t5.5\t+\t0\tgene2\n";
print EXPECTED "scaffold2\tcegma\tInternal\t20\t30\t5.5\t+\t0\tgene2\n";
print EXPECTED "scaffold2\tcegma\tInternal\t30\t40\t5.5\t+\t0\tgene2\n";
print EXPECTED "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene2\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $sorted_exon_gff $expected_sorted_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_sort_exon_gff: failed test1 (files $sorted_exon_gff $expected_sorted_gff)\n"; exit;}
system "rm -f $sorted_exon_gff";
system "rm -f $expected_sorted_gff";
system "rm -f $exon_gff";
$random_number = rand();
$exon_gff = $outputdir."/tmp".$random_number;
open(EXONGFF,">$exon_gff") || die "ERROR: test_sort_exon_gff: cannot open $exon_gff\n";
print EXONGFF "scaffold2\tcegma\tInternal\t30\t40\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold1\tcegma\tFirst\t10\t20\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold2\tcegma\tInternal\t20\t30\t5.5\t+\t0\tgene2\n";
print EXONGFF "scaffold1\tcegma\tInternal\t20\t30\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold2\tcegma\tCDS\t10\t20\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold1\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold2\tcegma\tInternal\t20\t30\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold2\tcegma\tInternal\t30\t40\t5.5\t+\t0\tgene2\n";
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold1\tcegma\tInternal\t30\t40\t5.5\t+\t0\tgene1\n";
print EXONGFF "scaffold2\tcegma\tFirst\t10\t20\t5.5\t+\t0\tgene2\n";
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene2\n";
close(EXONGFF);
($sorted_exon_gff,$errorcode,$errormsg) = &sort_exon_gff($exon_gff,$outputdir);
if ($errorcode != 1) { print STDERR "ERROR: test_sort_exon_gff: failed test2\n"; exit;}
system "rm -f $exon_gff";
}
#------------------------------------------------------------------#
# SORT THE GFF FILE BY CHROMOSOME, THEN BY GENE, THEN BY START IN THE GENE:
sub sort_exon_gff
{
my $exon_gff = $_[0]; # THE GFF FILE OF EXONS IN THE TRAINING SET
my $outputdir = $_[1]; # DIRECTORY FOR WRITING OUTPUT FILES TO
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $line; #
my @temp; #
my $feature_type; # FEATURE TYPE IN THE GFF FILE
my $sorted_exon_gff = "none";# SORTED EXON GFF FILE
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAME
my $cmd; # COMMAND TO RUN
# CHECK THAT THE $exon_gff FILE ONLY HAS FEATURES OF ONE TYPE 'Single', 'First', 'Internal', 'Terminal', 'Initial', or 'coding_exon' (DIFFERENT
# NAMES USED FOR CODING EXONS IN DIFFERENT GFF FILES):
open(EXONGFF,"$exon_gff") || die "ERROR: sort_exon_gff: cannot open $exon_gff\n";
while(<EXONGFF>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
$feature_type = $temp[2];
$feature_type =~ tr/[A-Z]/[a-z]/;
if ($feature_type ne 'first' && $feature_type ne 'internal' && $feature_type ne 'terminal' && $feature_type ne 'single' &&
$feature_type ne 'coding_exon' && $feature_type ne 'initial')
{
$errormsg = "ERROR: sort_exon_gff: $exon_gff has features of type $feature_type (should be first/initial/internal/terminal/single/coding_exon)\n";
$errorcode = 1; # ERRORCODE=1
return($sorted_exon_gff,$errorcode,$errormsg);
}
}
close(EXONGFF);
# SORT THE EXON GFF FILE:
$random_number = rand();
$sorted_exon_gff = $outputdir."/tmp".$random_number;
$cmd = "sort $exon_gff -k1,1 -k9,9 -k4,4n > $sorted_exon_gff";
system "$cmd";
sleep(1);
return($sorted_exon_gff,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &print_error
sub test_print_error
{
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR
($errormsg,$errorcode) = &print_error(45,45,1);
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message','My error message',1);
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;}
($errormsg,$errorcode) = &print_error('none',45,1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message', 0, 1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;}
}
#------------------------------------------------------------------#
# PRINT OUT AN ERROR MESSAGE AND EXIT.
sub print_error
{
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED.
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED.
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/)
{
if ($called_from_test == 1)
{
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n";
exit;
}
}
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/))
{
if ($called_from_test == 1)
{
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n";
exit;
}
}
if ($errormsg eq 'none' || $errorcode == 0)
{
if ($called_from_test == 1)
{
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n";
exit;
}
}
else
{
print STDERR "$errormsg";
exit;
}
return($errormsg,$errorcode);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment