Last active
October 13, 2017 15:54
Perl script to make an intron parameter file for GeneWise
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
=head1 NAME | |
make_genewise_paramfile.pl | |
=head1 SYNOPSIS | |
make_genewise_paramfile.pl training_exon_gff fasta outputdir exons_names_sameas_genes output | |
where training_exon_gff is the input gff file of exons in training set genes, | |
fasta is the fasta file of the genome sequence, | |
outputdir is the output directory for writing output files, | |
exons_names_sameas_genes says whether the exons have the same names as genes in the input gff (yes/no), | |
output is the output file name. | |
=head1 DESCRIPTION | |
This script takes an input gff file of exons in training set genes (<training_exon_gff>), | |
and the genome fasta file (<fasta>), and uses them to make an intron parameter file that | |
can be used with GeneWise (Wise 2.4.1), using the GeneWise -genestats option. | |
Note that this perl script just uses training data for the intron splice sites. | |
It takes the rest of the parameters from the standard GeneWise parameter file gene.stat | |
that comes with GeneWise. | |
=head1 VERSION | |
Perl script last edited 2-Nov-2012. | |
=head1 CONTACT | |
alc@sanger.ac.uk (Avril Coghlan) | |
=cut | |
# | |
# Perl script make_genewise_paramfile.pl | |
# Written by Avril Coghlan (alc@sanger.ac.uk) | |
# 2-Nov-12. | |
# Last edited 2-Nov-2012. | |
# SCRIPT SYNOPSIS: make_genewise_paramfile: given a gff file of training genes for a species, makes an intron parameter file for use in GeneWise for that species. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
use strict; | |
use warnings; | |
my $num_args = $#ARGV + 1; | |
if ($num_args != 5) | |
{ | |
print "Usage of make_genewise_paramfile.pl\n\n"; | |
print "perl make_genewise_paramfile.pl <training_exon_gff> <fasta> <outputdir> <exons_names_sameas_genes> <output>\n"; | |
print "where <training_exon_gff> is the input gff file of exons in the training set genes,\n"; | |
print " <fasta> is the fasta file of the genome sequence,\n"; | |
print " <outputdir> is the output directory for writing output files,\n"; | |
print " <exons_names_sameas_genes> says whether the exons have the same names as genes in the input gff file,\n"; | |
print " <output> is the output file name\n"; | |
print "For example, >perl make_genewise_paramfile.pl output.cegma.gff S07.Im.SS.Gf.Rr.scaffolds.fa\n"; | |
print "/nfs/users/nfs_a/alc/Documents/GeneWise50Helminths yes mygenestat.stat\n"; | |
exit; | |
} | |
# FIND THE INPUT GFF FILE OF EXONS IN THE TRAINING SET GENES: | |
my $training_exon_gff = $ARGV[0]; | |
# FIND THE FASTA FILE OF THE GENOME SEQUENCE: | |
my $fasta = $ARGV[1]; | |
# FIND THE DIRECTORY TO USE FOR OUTPUT FILES: | |
my $outputdir = $ARGV[2]; | |
# FIND OUT IF THE EXON NAMES ARE THE SAME AS GENE NAMES IN THE INPUT GFF FILE: | |
my $exons_names_sameas_genes = $ARGV[3]; | |
# FIND THE OUTPUT FILE NAME: | |
my $output = $ARGV[4]; | |
#------------------------------------------------------------------# | |
# TEST SUBROUTINES: | |
my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING. | |
&test_sort_exon_gff($outputdir); | |
&test_make_intron_gff($outputdir); | |
&test_remove_duplicate_introns($outputdir); | |
&test_make_splicesite_gff($outputdir); | |
&test_reverse_complement; | |
&test_make_splicesite_fasta($outputdir); | |
&test_new_intron_name; | |
&test_make_genewise_paramfile($outputdir); | |
&test_check_gtag_introns($outputdir); | |
&test_print_error; | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
&run_main_program($outputdir,$training_exon_gff,$fasta,$exons_names_sameas_genes,$output); | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
sub run_main_program | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN. | |
my $exon_gff = $_[1]; # THE GFF FILE OF EXONS IN THE TRAINING SET | |
my $fasta = $_[2]; # THE FASTA FILE OF THE GENOME SEQUENCE. | |
my $exons_names_sameas_genes = $_[3]; # SAYS WHETHER THE EXONS ARE NAMED THE SAME AS THE GENES IN THE INPUT GFF FILE | |
my $output = $_[4]; # THE GENEWISE PARAMETER FILE | |
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR. | |
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR. | |
my $sorted_exon_gff; # SORTED VERSION OF $exon_gff | |
my $intron_gff; # THE INTRON GFF FILE | |
my $splice_gff_3prime; # GFF FILE FOR 3' SPLICE SITES | |
my $splice_gff_5prime; # GFF FILE FOR 5' SPLICE SITES | |
my $splice_fasta_3prime; # FASTA FILE FOR 3' SPLICE SITES | |
my $splice_fasta_5prime; # FASTA FILE FOR 5' SPLICE SITES | |
# SORT THE GFF FILE BY CHROMOSOME, THEN BY GENE, THEN BY START IN THE GENE: | |
($sorted_exon_gff,$errorcode,$errormsg) = &sort_exon_gff($exon_gff,$outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# MAKE A GFF FILE OF THE INTRONS: | |
($intron_gff,$errorcode,$errormsg) = &make_intron_gff($sorted_exon_gff,$outputdir,$exons_names_sameas_genes); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# REMOVE ANY DUPLICATE INTRONS FROM THE GFF FILE OF INTRONS: | |
($intron_gff,$errorcode,$errormsg) = &remove_duplicate_introns($intron_gff,$outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# MAKE A GFF FILE WITH COORDINATES OF THE 3' SPLICE SITE OF THE INTRONS (9 bp 5' OF THE A | |
# OF THE <--AG AND 4 bp 3' OF THE A): | |
($splice_gff_3prime,$errorcode,$errormsg) = &make_splicesite_gff($intron_gff,$outputdir,'threeprime',9,4); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# MAKE A GFF FILE WITH COORDINATES OF THE 5' SPLICE SITE OF THE INTRONS (4 bp 5' OF THE G OF THE GT--> | |
# AND 9 bp 3' OF THE G): | |
($splice_gff_5prime,$errorcode,$errormsg) = &make_splicesite_gff($intron_gff,$outputdir,'fiveprime',4,9); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# MAKE A FASTA FILE WITH THE 3' SPLICE SITE SEQUENCES: | |
($splice_fasta_3prime,$errorcode,$errormsg) = &make_splicesite_fasta($splice_gff_3prime,$outputdir,$fasta); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# MAKE A FASTA FILE WITH THE 5' SPLICE SITE SEQUENCES: | |
($splice_fasta_5prime,$errorcode,$errormsg) = &make_splicesite_fasta($splice_gff_5prime,$outputdir,$fasta); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# MAKE A GENEWISE PARAMETER FILE: | |
($errorcode,$errormsg) = &make_genewise_paramfile($splice_fasta_5prime,$splice_fasta_3prime,$outputdir,$output); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# CHECK THAT THE SPLICE SITES IN THE GENEWISE PARAMETER FILE ARE MOSTLY GT...AG: | |
($errorcode,$errormsg) = &check_gtag_introns($output); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
} | |
#------------------------------------------------------------------# | |
# TEST &check_gtag_introns | |
sub test_check_gtag_introns | |
{ | |
my $outputdir = $_[0]; # DIRECTORY FOR WRITING OUTPUT FILES INTO | |
my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR | |
my $output; # GENEWISE PARAMETER FILE | |
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES | |
$random_number = rand(); | |
$output = $outputdir."/tmp".$random_number; | |
open(OUTPUT,">$output") || die "ERROR: test_check_gtag_introns: cannot open $output\n"; | |
print OUTPUT "#\n"; | |
print OUTPUT "# new genestats file\n"; | |
print OUTPUT "#\n"; | |
print OUTPUT "splice3 10\n"; | |
# AG | |
print OUTPUT "HELMINTH0000001 TTCATTCCAGGTAG\n"; | |
print OUTPUT "HELMINTH0000002 TTTAATTCAGGTCA\n"; | |
print OUTPUT "HELMINTH0000003 TCCTTTCCAGTAAC\n"; | |
print OUTPUT "HELMINTH0000004 TAATTTTTAGGTGT\n"; | |
print OUTPUT "HELMINTH0000005 TTTCATGCAGATTG\n"; | |
print OUTPUT "HELMINTH0000006 TTGATTTCAGGAAG\n"; | |
print OUTPUT "//\n"; | |
print OUTPUT "splice5 5\n"; | |
# GT | |
print OUTPUT "HELMINTH0000001 AAAAGTGCATTTTT\n"; | |
print OUTPUT "HELMINTH0000002 GAAGGTAGGAAAAT\n"; | |
print OUTPUT "HELMINTH0000003 ACATGTAAGTTATT\n"; | |
print OUTPUT "HELMINTH0000004 CATGGTATGTTCAC\n"; | |
print OUTPUT "HELMINTH0000005 TCAGGTACCAAATT\n"; | |
print OUTPUT "HELMINTH0000006 GCGTGTAAGTTCCT\n"; | |
print OUTPUT "//\n"; | |
print OUTPUT "# A G C T N\n"; | |
print OUTPUT "intron_emission\n"; | |
print OUTPUT "0.25 0.25 0.25 0.25 0.25\n"; | |
print OUTPUT "//\n"; | |
print OUTPUT "polyp_emission\n"; | |
print OUTPUT "0.2 0.2 0.3 0.3 0.25\n"; | |
print OUTPUT "//\n"; | |
print OUTPUT "rnd_emission\n"; | |
print OUTPUT "0.25 0.25 0.25 0.25 0.25 1.0\n"; | |
print OUTPUT "//\n"; | |
print OUTPUT "rndcodon\n"; | |
print OUTPUT "AAA 5290.000000\n"; | |
print OUTPUT "AAC 4795.000000\n"; | |
print OUTPUT "AAG 8178.000000\n"; | |
print OUTPUT "AAT 3305.000000\n"; | |
print OUTPUT "ACA 6240.000000\n"; | |
print OUTPUT "ACC 7728.000000\n"; | |
print OUTPUT "ACG 3347.000000\n"; | |
print OUTPUT "ACT 4930.000000\n"; | |
print OUTPUT "AGA 8491.000000\n"; | |
print OUTPUT "AGC 8639.000000\n"; | |
print OUTPUT "AGG 8997.000000\n"; | |
print OUTPUT "AGT 4417.000000\n"; | |
print OUTPUT "ATA 1975.000000\n"; | |
print OUTPUT "ATC 4973.000000\n"; | |
print OUTPUT "ATG 6474.000000\n"; | |
print OUTPUT "ATT 3083.000000\n"; | |
print OUTPUT "CAA 7057.000000\n"; | |
print OUTPUT "CAC 6815.000000\n"; | |
print OUTPUT "CAG 11041.000000\n"; | |
print OUTPUT "CAT 5779.000000\n"; | |
print OUTPUT "CCA 10537.000000\n"; | |
print OUTPUT "CCC 10307.000000\n"; | |
print OUTPUT "CCG 5621.000000\n"; | |
print OUTPUT "CCT 10134.000000\n"; | |
print OUTPUT "CGA 3377.000000\n"; | |
print OUTPUT "CGC 5146.000000\n"; | |
print OUTPUT "CGG 5375.000000\n"; | |
print OUTPUT "CGT 2765.000000\n"; | |
print OUTPUT "CTA 3502.000000\n"; | |
print OUTPUT "CTC 7465.000000\n"; | |
print OUTPUT "CTG 13780.000000\n"; | |
print OUTPUT "CTT 5453.000000\n"; | |
print OUTPUT "GAA 7461.000000\n"; | |
print OUTPUT "GAC 6937.000000\n"; | |
print OUTPUT "GAG 9975.000000\n"; | |
print OUTPUT "GAT 4949.000000\n"; | |
print OUTPUT "GCA 7747.000000\n"; | |
print OUTPUT "GCC 10890.000000\n"; | |
print OUTPUT "GCG 4828.000000\n"; | |
print OUTPUT "GCT 9371.000000\n"; | |
print OUTPUT "GGA 10143.000000\n"; | |
print OUTPUT "GGC 10400.000000\n"; | |
print OUTPUT "GGG 8869.000000\n"; | |
print OUTPUT "GGT 5567.000000\n"; | |
print OUTPUT "GTA 2143.000000\n"; | |
print OUTPUT "GTC 4593.000000\n"; | |
print OUTPUT "GTG 8189.000000\n"; | |
print OUTPUT "GTT 3021.000000\n"; | |
print OUTPUT "TAA 1775.000000\n"; | |
print OUTPUT "TAC 3687.000000\n"; | |
print OUTPUT "TAG 1333.000000\n"; | |
print OUTPUT "TAT 2477.000000\n"; | |
print OUTPUT "TCA 6180.000000\n"; | |
print OUTPUT "TCC 7668.000000\n"; | |
print OUTPUT "TCG 2875.000000\n"; | |
print OUTPUT "TCT 5767.000000\n"; | |
print OUTPUT "TGA 7315.000000\n"; | |
print OUTPUT "TGC 8625.000000\n"; | |
print OUTPUT "TGG 11718.000000\n"; | |
print OUTPUT "TGT 5197.000000\n"; | |
print OUTPUT "TTA 1664.000000\n"; | |
print OUTPUT "TTC 5462.000000\n"; | |
print OUTPUT "TTG 4420.000000\n"; | |
print OUTPUT "TTT 3453.000000\n"; | |
print OUTPUT "//\n\n"; | |
close(OUTPUT); | |
($errorcode,$errormsg) = &check_gtag_introns($output); | |
if ($errorcode != 0) { print STDERR "ERROR: test_check_gtag_introns: failed test1\n"; exit;} | |
system "rm -f $output"; | |
$random_number = rand(); | |
$output = $outputdir."/tmp".$random_number; | |
open(OUTPUT,">$output") || die "ERROR: test_check_gtag_introns: cannot open $output\n"; | |
print OUTPUT "#\n"; | |
print OUTPUT "# new genestats file\n"; | |
print OUTPUT "#\n"; | |
print OUTPUT "splice3 10\n"; | |
# AG | |
print OUTPUT "HELMINTH0000001 TTCATTCCACGTAG\n"; | |
print OUTPUT "HELMINTH0000002 TTTAATTCCGGTCA\n"; | |
print OUTPUT "HELMINTH0000003 TCCTTTCCATTAAC\n"; | |
print OUTPUT "HELMINTH0000004 TAATTTTTTGGTGT\n"; | |
print OUTPUT "HELMINTH0000005 TTTCATGCAAATTG\n"; | |
print OUTPUT "HELMINTH0000006 TTGATTTCGGGAAG\n"; | |
print OUTPUT "//\n"; | |
print OUTPUT "splice5 5\n"; | |
# GT | |
print OUTPUT "HELMINTH0000001 AAAAGTGCATTTTT\n"; | |
print OUTPUT "HELMINTH0000002 GAAGGTAGGAAAAT\n"; | |
print OUTPUT "HELMINTH0000003 ACATGTAAGTTATT\n"; | |
print OUTPUT "HELMINTH0000004 CATGGTATGTTCAC\n"; | |
print OUTPUT "HELMINTH0000005 TCAGGTACCAAATT\n"; | |
print OUTPUT "HELMINTH0000006 GCGTGTAAGTTCCT\n"; | |
print OUTPUT "//\n"; | |
print OUTPUT "# A G C T N\n"; | |
print OUTPUT "intron_emission\n"; | |
print OUTPUT "0.25 0.25 0.25 0.25 0.25\n"; | |
print OUTPUT "//\n"; | |
print OUTPUT "polyp_emission\n"; | |
print OUTPUT "0.2 0.2 0.3 0.3 0.25\n"; | |
print OUTPUT "//\n"; | |
print OUTPUT "rnd_emission\n"; | |
print OUTPUT "0.25 0.25 0.25 0.25 0.25 1.0\n"; | |
print OUTPUT "//\n"; | |
print OUTPUT "rndcodon\n"; | |
print OUTPUT "AAA 5290.000000\n"; | |
print OUTPUT "AAC 4795.000000\n"; | |
print OUTPUT "AAG 8178.000000\n"; | |
print OUTPUT "AAT 3305.000000\n"; | |
print OUTPUT "ACA 6240.000000\n"; | |
print OUTPUT "ACC 7728.000000\n"; | |
print OUTPUT "ACG 3347.000000\n"; | |
print OUTPUT "ACT 4930.000000\n"; | |
print OUTPUT "AGA 8491.000000\n"; | |
print OUTPUT "AGC 8639.000000\n"; | |
print OUTPUT "AGG 8997.000000\n"; | |
print OUTPUT "AGT 4417.000000\n"; | |
print OUTPUT "ATA 1975.000000\n"; | |
print OUTPUT "ATC 4973.000000\n"; | |
print OUTPUT "ATG 6474.000000\n"; | |
print OUTPUT "ATT 3083.000000\n"; | |
print OUTPUT "CAA 7057.000000\n"; | |
print OUTPUT "CAC 6815.000000\n"; | |
print OUTPUT "CAG 11041.000000\n"; | |
print OUTPUT "CAT 5779.000000\n"; | |
print OUTPUT "CCA 10537.000000\n"; | |
print OUTPUT "CCC 10307.000000\n"; | |
print OUTPUT "CCG 5621.000000\n"; | |
print OUTPUT "CCT 10134.000000\n"; | |
print OUTPUT "CGA 3377.000000\n"; | |
print OUTPUT "CGC 5146.000000\n"; | |
print OUTPUT "CGG 5375.000000\n"; | |
print OUTPUT "CGT 2765.000000\n"; | |
print OUTPUT "CTA 3502.000000\n"; | |
print OUTPUT "CTC 7465.000000\n"; | |
print OUTPUT "CTG 13780.000000\n"; | |
print OUTPUT "CTT 5453.000000\n"; | |
print OUTPUT "GAA 7461.000000\n"; | |
print OUTPUT "GAC 6937.000000\n"; | |
print OUTPUT "GAG 9975.000000\n"; | |
print OUTPUT "GAT 4949.000000\n"; | |
print OUTPUT "GCA 7747.000000\n"; | |
print OUTPUT "GCC 10890.000000\n"; | |
print OUTPUT "GCG 4828.000000\n"; | |
print OUTPUT "GCT 9371.000000\n"; | |
print OUTPUT "GGA 10143.000000\n"; | |
print OUTPUT "GGC 10400.000000\n"; | |
print OUTPUT "GGG 8869.000000\n"; | |
print OUTPUT "GGT 5567.000000\n"; | |
print OUTPUT "GTA 2143.000000\n"; | |
print OUTPUT "GTC 4593.000000\n"; | |
print OUTPUT "GTG 8189.000000\n"; | |
print OUTPUT "GTT 3021.000000\n"; | |
print OUTPUT "TAA 1775.000000\n"; | |
print OUTPUT "TAC 3687.000000\n"; | |
print OUTPUT "TAG 1333.000000\n"; | |
print OUTPUT "TAT 2477.000000\n"; | |
print OUTPUT "TCA 6180.000000\n"; | |
print OUTPUT "TCC 7668.000000\n"; | |
print OUTPUT "TCG 2875.000000\n"; | |
print OUTPUT "TCT 5767.000000\n"; | |
print OUTPUT "TGA 7315.000000\n"; | |
print OUTPUT "TGC 8625.000000\n"; | |
print OUTPUT "TGG 11718.000000\n"; | |
print OUTPUT "TGT 5197.000000\n"; | |
print OUTPUT "TTA 1664.000000\n"; | |
print OUTPUT "TTC 5462.000000\n"; | |
print OUTPUT "TTG 4420.000000\n"; | |
print OUTPUT "TTT 3453.000000\n"; | |
print OUTPUT "//\n\n"; | |
close(OUTPUT); | |
($errorcode,$errormsg) = &check_gtag_introns($output); | |
if ($errorcode != 18) { print STDERR "ERROR: test_check_gtag_introns: failed test2\n"; exit;} | |
system "rm -f $output"; | |
$random_number = rand(); | |
$output = $outputdir."/tmp".$random_number; | |
open(OUTPUT,">$output") || die "ERROR: test_check_gtag_introns: cannot open $output\n"; | |
print OUTPUT "#\n"; | |
print OUTPUT "# new genestats file\n"; | |
print OUTPUT "#\n"; | |
print OUTPUT "splice3 10\n"; | |
# AG | |
print OUTPUT "HELMINTH0000001 TTCATTCCAGGTAG\n"; | |
print OUTPUT "HELMINTH0000002 TTTAATTCAGGTCA\n"; | |
print OUTPUT "HELMINTH0000003 TCCTTTCCAGTAAC\n"; | |
print OUTPUT "HELMINTH0000004 TAATTTTTAGGTGT\n"; | |
print OUTPUT "HELMINTH0000005 TTTCATGCAGATTG\n"; | |
print OUTPUT "HELMINTH0000006 TTGATTTCAGGAAG\n"; | |
print OUTPUT "//\n"; | |
print OUTPUT "splice5 5\n"; | |
# GT | |
print OUTPUT "HELMINTH0000001 AAAAATGCATTTTT\n"; | |
print OUTPUT "HELMINTH0000002 GAAGTTAGGAAAAT\n"; | |
print OUTPUT "HELMINTH0000003 ACATGGAAGTTATT\n"; | |
print OUTPUT "HELMINTH0000004 CATGGCATGTTCAC\n"; | |
print OUTPUT "HELMINTH0000005 TCAGCTACCAAATT\n"; | |
print OUTPUT "HELMINTH0000006 GCGTCCAAGTTCCT\n"; | |
print OUTPUT "//\n"; | |
print OUTPUT "# A G C T N\n"; | |
print OUTPUT "intron_emission\n"; | |
print OUTPUT "0.25 0.25 0.25 0.25 0.25\n"; | |
print OUTPUT "//\n"; | |
print OUTPUT "polyp_emission\n"; | |
print OUTPUT "0.2 0.2 0.3 0.3 0.25\n"; | |
print OUTPUT "//\n"; | |
print OUTPUT "rnd_emission\n"; | |
print OUTPUT "0.25 0.25 0.25 0.25 0.25 1.0\n"; | |
print OUTPUT "//\n"; | |
print OUTPUT "rndcodon\n"; | |
print OUTPUT "AAA 5290.000000\n"; | |
print OUTPUT "AAC 4795.000000\n"; | |
print OUTPUT "AAG 8178.000000\n"; | |
print OUTPUT "AAT 3305.000000\n"; | |
print OUTPUT "ACA 6240.000000\n"; | |
print OUTPUT "ACC 7728.000000\n"; | |
print OUTPUT "ACG 3347.000000\n"; | |
print OUTPUT "ACT 4930.000000\n"; | |
print OUTPUT "AGA 8491.000000\n"; | |
print OUTPUT "AGC 8639.000000\n"; | |
print OUTPUT "AGG 8997.000000\n"; | |
print OUTPUT "AGT 4417.000000\n"; | |
print OUTPUT "ATA 1975.000000\n"; | |
print OUTPUT "ATC 4973.000000\n"; | |
print OUTPUT "ATG 6474.000000\n"; | |
print OUTPUT "ATT 3083.000000\n"; | |
print OUTPUT "CAA 7057.000000\n"; | |
print OUTPUT "CAC 6815.000000\n"; | |
print OUTPUT "CAG 11041.000000\n"; | |
print OUTPUT "CAT 5779.000000\n"; | |
print OUTPUT "CCA 10537.000000\n"; | |
print OUTPUT "CCC 10307.000000\n"; | |
print OUTPUT "CCG 5621.000000\n"; | |
print OUTPUT "CCT 10134.000000\n"; | |
print OUTPUT "CGA 3377.000000\n"; | |
print OUTPUT "CGC 5146.000000\n"; | |
print OUTPUT "CGG 5375.000000\n"; | |
print OUTPUT "CGT 2765.000000\n"; | |
print OUTPUT "CTA 3502.000000\n"; | |
print OUTPUT "CTC 7465.000000\n"; | |
print OUTPUT "CTG 13780.000000\n"; | |
print OUTPUT "CTT 5453.000000\n"; | |
print OUTPUT "GAA 7461.000000\n"; | |
print OUTPUT "GAC 6937.000000\n"; | |
print OUTPUT "GAG 9975.000000\n"; | |
print OUTPUT "GAT 4949.000000\n"; | |
print OUTPUT "GCA 7747.000000\n"; | |
print OUTPUT "GCC 10890.000000\n"; | |
print OUTPUT "GCG 4828.000000\n"; | |
print OUTPUT "GCT 9371.000000\n"; | |
print OUTPUT "GGA 10143.000000\n"; | |
print OUTPUT "GGC 10400.000000\n"; | |
print OUTPUT "GGG 8869.000000\n"; | |
print OUTPUT "GGT 5567.000000\n"; | |
print OUTPUT "GTA 2143.000000\n"; | |
print OUTPUT "GTC 4593.000000\n"; | |
print OUTPUT "GTG 8189.000000\n"; | |
print OUTPUT "GTT 3021.000000\n"; | |
print OUTPUT "TAA 1775.000000\n"; | |
print OUTPUT "TAC 3687.000000\n"; | |
print OUTPUT "TAG 1333.000000\n"; | |
print OUTPUT "TAT 2477.000000\n"; | |
print OUTPUT "TCA 6180.000000\n"; | |
print OUTPUT "TCC 7668.000000\n"; | |
print OUTPUT "TCG 2875.000000\n"; | |
print OUTPUT "TCT 5767.000000\n"; | |
print OUTPUT "TGA 7315.000000\n"; | |
print OUTPUT "TGC 8625.000000\n"; | |
print OUTPUT "TGG 11718.000000\n"; | |
print OUTPUT "TGT 5197.000000\n"; | |
print OUTPUT "TTA 1664.000000\n"; | |
print OUTPUT "TTC 5462.000000\n"; | |
print OUTPUT "TTG 4420.000000\n"; | |
print OUTPUT "TTT 3453.000000\n"; | |
print OUTPUT "//\n\n"; | |
close(OUTPUT); | |
($errorcode,$errormsg) = &check_gtag_introns($output); | |
if ($errorcode != 18) { print STDERR "ERROR: test_check_gtag_introns: failed test3\n"; exit;} | |
system "rm -f $output"; | |
return($errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# CHECK THAT THE SPLICE SITES IN THE GENEWISE PARAMETER FILE ARE MOSTLY GT...AG: | |
sub check_gtag_introns | |
{ | |
my $output = $_[0]; # THE GENEWISE PARAMETER FILE | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $line; # | |
my @temp; # | |
my $found_threesplice = 0; # SAYS WHETHER WE HAVE REACHED THE 3' SPLICE SITES | |
my $found_fivesplice = 0; # SAYS WHETHER WE HAVE REACHED THE 5' SPLICE SITES | |
my $seq; # | |
my $num_threesplice = 0; # NUMBER OF 3' SPLICE SITE SEQUENCES SEEN | |
my $num_threesplice_ag = 0; # NUMBER OF 3' SPLICE SITES WHERE THE INTRON ENDS IN 'AG' SEEN | |
my $num_fivesplice = 0; # NUMBER OF 5' SPLICE SITE SEQUENCES SEEN | |
my $num_fivesplice_gt = 0; # NUMBER OF 5' SPLICE SITES WHERE THE INTRON STARTS WITH 'GT' SEEN | |
my $pc_fivesplice_gt; # PERCENTAGE OF 5' SPLICE SITES WHERE THE INTRON STARTS WITH 'GT' | |
my $pc_threesplice_ag; # PERCENTAGE OF 3' SPLICE SITES WHERE THE INTRON ENDS WITH 'AG' | |
open(OUTPUT,"$output") || die "ERROR: check_gtag_introns: cannot open $output\n"; | |
while(<OUTPUT>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\s+/,$line); | |
if ($line =~ /splice3/) { $found_threesplice = 1; } | |
elsif ($line =~ /splice5/) { $found_threesplice = 0; $found_fivesplice = 1;} | |
elsif ($line =~ /HELMINTH/) | |
{ | |
if ($found_threesplice == 1) | |
{ | |
$seq = $temp[1]; | |
$num_threesplice++; | |
if (substr($seq,8,2) eq 'AG') { $num_threesplice_ag++;} | |
} | |
elsif ($found_fivesplice == 1) | |
{ | |
$seq = $temp[1]; | |
$num_fivesplice++; | |
if (substr($seq,4,2) eq 'GT') { $num_fivesplice_gt++;} | |
} | |
else | |
{ | |
$errormsg = "ERROR: check_gtag_introns: found_threesplice $found_threesplice found_fivesplice $found_fivesplice\n"; | |
$errorcode = 17; # ERRORCODE=17 | |
return($errorcode,$errormsg); | |
} | |
} | |
} | |
close(OUTPUT); | |
# CHECK THAT >=90% OF THE INTRONS ARE GT...AG INTRONS: | |
$pc_fivesplice_gt = $num_fivesplice_gt*100/$num_fivesplice; | |
$pc_threesplice_ag = $num_threesplice_ag*100/$num_threesplice; | |
if ($pc_threesplice_ag < 90.0 || $pc_fivesplice_gt < 90.0) | |
{ | |
$errormsg = "ERROR: check_gtag_introns: pc_threesplice_ag $pc_threesplice_ag pc_fivesplice_gt $pc_fivesplice_gt\n"; | |
$errorcode = 18; # ERRORCODE=18 | |
return($errorcode,$errormsg); | |
} | |
if ($PRINT_TEST_DATA == 1) { print STDERR "$output: $pc_fivesplice_gt % have GT at 5', $pc_threesplice_ag % have AG at 3' of intron\n";} | |
return($errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &make_genewise_paramfile | |
sub test_make_genewise_paramfile | |
{ | |
my $outputdir = $_[0]; # DIRECTORY FOR WRITING OUTPUT FILES TO | |
my $fivesplice; # FASTA FILE OF 5' SPLICE SITES | |
my $threesplice; # FASTA FILE OF 3' SPLICE SITES | |
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAME | |
my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR | |
my $output; # THE OUTPUT FILE NAME | |
my $expected_output; # FILE CONTAINING THE EXPECTED OUTPUT | |
my $differences; # DIFFERENES BETWEEN $output AND $expected_output | |
my $length_differences; # LENGTH OF $differences | |
my $line; # | |
$random_number = rand(); | |
$fivesplice = $outputdir."/tmp".$random_number; | |
open(FIVESPLICE,">$fivesplice") || die "ERROR: test_make_genewise_paramfile: cannot open $fivesplice\n"; | |
print FIVESPLICE ">KOG0142.1.R.1_5splice\n"; | |
print FIVESPLICE "AAAAGTGCATTTTT\n"; | |
print FIVESPLICE ">KOG0142.1.R.2_5splice\n"; | |
print FIVESPLICE "GAAGGTAGGAAAAT\n"; | |
print FIVESPLICE ">KOG0142.1.R.3_5splice\n"; | |
print FIVESPLICE "ACATGTAAGTTATT\n"; | |
print FIVESPLICE ">KOG1626.1.R.1_5splice\n"; | |
print FIVESPLICE "CATGGTATGTTCAC\n"; | |
print FIVESPLICE ">KOG3372.1.R.1_5splice\n"; | |
print FIVESPLICE "TCAGGTACCAAATT\n"; | |
print FIVESPLICE ">KOG3372.1.R.2_5splice\n"; | |
print FIVESPLICE "GCGTGTAAGTTCCT\n"; | |
close(FIVESPLICE); | |
$random_number = rand(); | |
$threesplice = $outputdir."/tmp".$random_number; | |
open(THREESPLICE,">$threesplice") || die "ERROR: test_make_genewise_paramfile: cannot open $threesplice\n"; | |
print THREESPLICE ">KOG0142.1.R.1_3splice\n"; | |
print THREESPLICE "TTCATTCCAGGTAG\n"; | |
print THREESPLICE ">KOG0142.1.R.2_3splice\n"; | |
print THREESPLICE "TTTAATTCAGGTCA\n"; | |
print THREESPLICE ">KOG0142.1.R.3_3splice\n"; | |
print THREESPLICE "TCCTTTCCAGTAAC\n"; | |
print THREESPLICE ">KOG1626.1.R.1_3splice\n"; | |
print THREESPLICE "TAATTTTTAGGTGT\n"; | |
print THREESPLICE ">KOG3372.1.R.1_3splice\n"; | |
print THREESPLICE "TTTCATGCAGATTG\n"; | |
print THREESPLICE ">KOG3372.1.R.2_3splice\n"; | |
print THREESPLICE "TTGATTTCAGGAAG\n"; | |
close(THREESPLICE); | |
$random_number = rand(); | |
$output = "tmp".$random_number; | |
$random_number = rand(); | |
$expected_output = $outputdir."/tmp".$random_number; | |
open(EXPECTED,">$expected_output") || die "ERROR: test_make_genewise_paramfile: cannot open $expected_output\n"; | |
print EXPECTED "#\n"; | |
print EXPECTED "# new genestats file\n"; | |
print EXPECTED "#\n"; | |
print EXPECTED "splice3 10\n"; | |
print EXPECTED "HELMINTH0000001 TTCATTCCAGGTAG\n"; | |
print EXPECTED "HELMINTH0000002 TTTAATTCAGGTCA\n"; | |
print EXPECTED "HELMINTH0000003 TCCTTTCCAGTAAC\n"; | |
print EXPECTED "HELMINTH0000004 TAATTTTTAGGTGT\n"; | |
print EXPECTED "HELMINTH0000005 TTTCATGCAGATTG\n"; | |
print EXPECTED "HELMINTH0000006 TTGATTTCAGGAAG\n"; | |
print EXPECTED "//\n"; | |
print EXPECTED "splice5 5\n"; | |
print EXPECTED "HELMINTH0000001 AAAAGTGCATTTTT\n"; | |
print EXPECTED "HELMINTH0000002 GAAGGTAGGAAAAT\n"; | |
print EXPECTED "HELMINTH0000003 ACATGTAAGTTATT\n"; | |
print EXPECTED "HELMINTH0000004 CATGGTATGTTCAC\n"; | |
print EXPECTED "HELMINTH0000005 TCAGGTACCAAATT\n"; | |
print EXPECTED "HELMINTH0000006 GCGTGTAAGTTCCT\n"; | |
print EXPECTED "//\n"; | |
print EXPECTED "# A G C T N\n"; | |
print EXPECTED "intron_emission\n"; | |
print EXPECTED "0.25 0.25 0.25 0.25 0.25\n"; | |
print EXPECTED "//\n"; | |
print EXPECTED "polyp_emission\n"; | |
print EXPECTED "0.2 0.2 0.3 0.3 0.25\n"; | |
print EXPECTED "//\n"; | |
print EXPECTED "rnd_emission\n"; | |
print EXPECTED "0.25 0.25 0.25 0.25 0.25 1.0\n"; | |
print EXPECTED "//\n"; | |
print EXPECTED "rndcodon\n"; | |
print EXPECTED "AAA 5290.000000\n"; | |
print EXPECTED "AAC 4795.000000\n"; | |
print EXPECTED "AAG 8178.000000\n"; | |
print EXPECTED "AAT 3305.000000\n"; | |
print EXPECTED "ACA 6240.000000\n"; | |
print EXPECTED "ACC 7728.000000\n"; | |
print EXPECTED "ACG 3347.000000\n"; | |
print EXPECTED "ACT 4930.000000\n"; | |
print EXPECTED "AGA 8491.000000\n"; | |
print EXPECTED "AGC 8639.000000\n"; | |
print EXPECTED "AGG 8997.000000\n"; | |
print EXPECTED "AGT 4417.000000\n"; | |
print EXPECTED "ATA 1975.000000\n"; | |
print EXPECTED "ATC 4973.000000\n"; | |
print EXPECTED "ATG 6474.000000\n"; | |
print EXPECTED "ATT 3083.000000\n"; | |
print EXPECTED "CAA 7057.000000\n"; | |
print EXPECTED "CAC 6815.000000\n"; | |
print EXPECTED "CAG 11041.000000\n"; | |
print EXPECTED "CAT 5779.000000\n"; | |
print EXPECTED "CCA 10537.000000\n"; | |
print EXPECTED "CCC 10307.000000\n"; | |
print EXPECTED "CCG 5621.000000\n"; | |
print EXPECTED "CCT 10134.000000\n"; | |
print EXPECTED "CGA 3377.000000\n"; | |
print EXPECTED "CGC 5146.000000\n"; | |
print EXPECTED "CGG 5375.000000\n"; | |
print EXPECTED "CGT 2765.000000\n"; | |
print EXPECTED "CTA 3502.000000\n"; | |
print EXPECTED "CTC 7465.000000\n"; | |
print EXPECTED "CTG 13780.000000\n"; | |
print EXPECTED "CTT 5453.000000\n"; | |
print EXPECTED "GAA 7461.000000\n"; | |
print EXPECTED "GAC 6937.000000\n"; | |
print EXPECTED "GAG 9975.000000\n"; | |
print EXPECTED "GAT 4949.000000\n"; | |
print EXPECTED "GCA 7747.000000\n"; | |
print EXPECTED "GCC 10890.000000\n"; | |
print EXPECTED "GCG 4828.000000\n"; | |
print EXPECTED "GCT 9371.000000\n"; | |
print EXPECTED "GGA 10143.000000\n"; | |
print EXPECTED "GGC 10400.000000\n"; | |
print EXPECTED "GGG 8869.000000\n"; | |
print EXPECTED "GGT 5567.000000\n"; | |
print EXPECTED "GTA 2143.000000\n"; | |
print EXPECTED "GTC 4593.000000\n"; | |
print EXPECTED "GTG 8189.000000\n"; | |
print EXPECTED "GTT 3021.000000\n"; | |
print EXPECTED "TAA 1775.000000\n"; | |
print EXPECTED "TAC 3687.000000\n"; | |
print EXPECTED "TAG 1333.000000\n"; | |
print EXPECTED "TAT 2477.000000\n"; | |
print EXPECTED "TCA 6180.000000\n"; | |
print EXPECTED "TCC 7668.000000\n"; | |
print EXPECTED "TCG 2875.000000\n"; | |
print EXPECTED "TCT 5767.000000\n"; | |
print EXPECTED "TGA 7315.000000\n"; | |
print EXPECTED "TGC 8625.000000\n"; | |
print EXPECTED "TGG 11718.000000\n"; | |
print EXPECTED "TGT 5197.000000\n"; | |
print EXPECTED "TTA 1664.000000\n"; | |
print EXPECTED "TTC 5462.000000\n"; | |
print EXPECTED "TTG 4420.000000\n"; | |
print EXPECTED "TTT 3453.000000\n"; | |
print EXPECTED "//\n\n"; | |
close(EXPECTED); | |
($errorcode,$errormsg) = &make_genewise_paramfile($fivesplice,$threesplice,$outputdir,$output); | |
if ($errorcode != 0) { print STDERR "ERROR: test_make_genewise_paramfile: failed test1\n"; exit;} | |
$differences = ""; | |
$output = $outputdir."/".$output; | |
open(TEMP,"diff $output $expected_output |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0) { print STDERR "ERROR: test_make_genewise_paramfile: failed test1 (files $output $expected_output)\n"; exit;} | |
system "rm -f $fivesplice"; | |
system "rm -f $threesplice"; | |
system "rm -f $output"; | |
system "rm -f $expected_output"; | |
$random_number = rand(); | |
$fivesplice = $outputdir."/tmp".$random_number; | |
open(FIVESPLICE,">$fivesplice") || die "ERROR: test_make_genewise_paramfile: cannot open $fivesplice\n"; | |
print FIVESPLICE ">KOG0142.1.R.1_5splice\n"; | |
print FIVESPLICE "AAAAGTGCATTTTT\n"; | |
print FIVESPLICE ">KOG0142.1.R.2_5splice\n"; | |
print FIVESPLICE "none\n"; | |
print FIVESPLICE ">KOG0142.1.R.3_5splice\n"; | |
print FIVESPLICE "ACATGTAAGTTATT\n"; | |
print FIVESPLICE ">KOG1626.1.R.1_5splice\n"; | |
print FIVESPLICE "CATGGTATGTTCAC\n"; | |
print FIVESPLICE ">KOG3372.1.R.1_5splice\n"; | |
print FIVESPLICE "TCAGGTACCAAATT\n"; | |
print FIVESPLICE ">KOG3372.1.R.2_5splice\n"; | |
print FIVESPLICE "GCGTGTAAGTTCCT\n"; | |
close(FIVESPLICE); | |
$random_number = rand(); | |
$threesplice = $outputdir."/tmp".$random_number; | |
open(THREESPLICE,">$threesplice") || die "ERROR: test_make_genewise_paramfile: cannot open $threesplice\n"; | |
print THREESPLICE ">KOG0142.1.R.1_3splice\n"; | |
print THREESPLICE "TTCATTCCAGGTAG\n"; | |
print THREESPLICE ">KOG0142.1.R.2_3splice\n"; | |
print THREESPLICE "TTTAATTCAGGTCA\n"; | |
print THREESPLICE ">KOG0142.1.R.3_3splice\n"; | |
print THREESPLICE "TCCTTTCCAGTAAC\n"; | |
print THREESPLICE ">KOG1626.1.R.1_3splice\n"; | |
print THREESPLICE "TAATTTTTAGGTGT\n"; | |
print THREESPLICE ">KOG3372.1.R.1_3splice\n"; | |
print THREESPLICE "TTTCATGCAGATTG\n"; | |
print THREESPLICE ">KOG3372.1.R.2_3splice\n"; | |
print THREESPLICE "TTGATTTCAGGAAG\n"; | |
close(THREESPLICE); | |
$random_number = rand(); | |
$output = "tmp".$random_number; | |
($errorcode,$errormsg) = &make_genewise_paramfile($fivesplice,$threesplice,$outputdir,$output); | |
if ($errorcode != 14) { print STDERR "ERROR: test_make_genewise_paramfile: failed test2\n"; exit;} | |
$output = $outputdir."/".$output; | |
system "rm -f $fivesplice"; | |
system "rm -f $threesplice"; | |
system "rm -f $output"; | |
} | |
#------------------------------------------------------------------# | |
# MAKE A GENEWISE PARAMETER FILE: | |
sub make_genewise_paramfile | |
{ | |
my $fivesplice = $_[0]; # FASTA FILE OF 5' SPLICE SITES | |
my $threesplice = $_[1]; # FASTA FILE OF 3' SPLICE SITES | |
my $outputdir = $_[2]; # DIRECTORY FOR WRITING OUTPUT FILES TO | |
my $output = $_[3]; # THE GENEWISE PARAMETER FILE | |
my $no_introns; # NUMBER OF SPLICE SITES FOR INTRONS PRINTED OUT | |
my %NEW_NAME = (); # HASH TABLE WITH NAME FOR A SPLICE SITE | |
my $name; # NAME FOR A SPLICE SITE | |
my $splice; # SPLICE SITE SEQUENCE | |
my $line; # | |
my @temp; # | |
my $new_name; # NEW NAME TO USE FOR A SPLICE SITE | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
$output = $outputdir."/".$output; | |
open(OUTPUT,">$output") || die "ERROR: make_genewise_paramfile: cannot open $output\n"; | |
# PRINT OUT THE HEADER FOR THE PARAMETER FILE (COPIED FROM THE | |
# GENEWISE PARAMETER FILE /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/gene.stat): | |
print OUTPUT "#\n"; | |
print OUTPUT "# new genestats file\n"; | |
print OUTPUT "#\n"; | |
print OUTPUT "splice3 10\n"; | |
# READ IN THE FILE OF 3' SPLICE SITES: | |
$no_introns = 0; | |
$name = "none"; | |
$splice = "none"; | |
open(THREESPLICE,"$threesplice") || die "ERROR: make_genewise_paramfile: cannot open $threesplice.\n"; | |
while(<THREESPLICE>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\s+/,$line); | |
if (substr($line,0,1) eq '>') | |
{ | |
if ($name ne 'none' && $splice ne 'none') | |
{ | |
print OUTPUT "$name $splice\n"; | |
} | |
$name = $temp[0]; | |
@temp = split(/_/,$name); # eg. KOG0142.1.R.2_5splice | |
$name = $temp[0]; | |
$name = substr($name,1,length($name)-1); | |
# FIND A NEW NAME FOR THIS SEQUENCE: | |
$no_introns++; | |
($new_name,$errorcode,$errormsg) = &new_intron_name($name,$no_introns); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
$NEW_NAME{$name} = $new_name; | |
$name = $new_name; | |
} | |
else | |
{ | |
if ($temp[0] eq 'none') | |
{ | |
$errormsg = "ERROR: make_genewise_paramfile: sequence $temp[0] for name $name\n"; | |
$errorcode = 14; # ERRORCODE=14 | |
return($errorcode,$errormsg); | |
} | |
$splice = $temp[0]; | |
} | |
} | |
close(THREESPLICE); | |
if ($name ne 'none' && $splice ne 'none') | |
{ | |
print OUTPUT "$name $splice\n"; | |
} | |
# READ IN THE FILE OF 5' SPLICE SITES: | |
print OUTPUT "//\n"; | |
print OUTPUT "splice5 5\n"; | |
$name = "none"; | |
$splice = "none"; | |
open(FIVESPLICE,"$fivesplice") || die "ERROR: make_genewise_paramfile: cannot open $fivesplice.\n"; | |
while(<FIVESPLICE>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\s+/,$line); | |
if (substr($line,0,1) eq '>') | |
{ | |
if ($name ne 'none' && $splice ne 'none') | |
{ | |
print OUTPUT "$name $splice\n"; | |
} | |
$name = $temp[0]; | |
@temp = split(/_/,$name); | |
$name = $temp[0]; | |
$name = substr($name,1,length($name)-1); | |
if (!($NEW_NAME{$name})) | |
{ | |
$errormsg = "ERROR: make_genewise_paramfile: do not know new name for $name\n"; | |
$errorcode = 7; # ERRORCODE=7 | |
return($errorcode,$errormsg); | |
} | |
$new_name = $NEW_NAME{$name}; | |
$name = $new_name; | |
} | |
else | |
{ | |
if ($temp[0] eq 'none') | |
{ | |
$errormsg = "ERROR: make_genewise_paramfile: sequence $temp[0] for name $name\n"; | |
$errorcode = 14; # ERRORCODE=14 | |
return($errorcode,$errormsg); | |
} | |
$splice = $temp[0]; | |
} | |
} | |
close(FIVESPLICE); | |
if ($name ne 'none' && $splice ne 'none') | |
{ | |
print OUTPUT "$name $splice\n"; | |
} | |
# WRITE OUT THE REST OF THE PARAMETERS IN THE PARAMETER FILE, COPIED FROM | |
# THE GENEWISE PARAMETER FILE /nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/wisecfg/gene.stat) | |
print OUTPUT "//\n"; | |
print OUTPUT "# A G C T N\n"; | |
print OUTPUT "intron_emission\n"; | |
print OUTPUT "0.25 0.25 0.25 0.25 0.25\n"; | |
print OUTPUT "//\n"; | |
print OUTPUT "polyp_emission\n"; | |
print OUTPUT "0.2 0.2 0.3 0.3 0.25\n"; | |
print OUTPUT "//\n"; | |
print OUTPUT "rnd_emission\n"; | |
print OUTPUT "0.25 0.25 0.25 0.25 0.25 1.0\n"; | |
print OUTPUT "//\n"; | |
print OUTPUT "rndcodon\n"; | |
print OUTPUT "AAA 5290.000000\n"; | |
print OUTPUT "AAC 4795.000000\n"; | |
print OUTPUT "AAG 8178.000000\n"; | |
print OUTPUT "AAT 3305.000000\n"; | |
print OUTPUT "ACA 6240.000000\n"; | |
print OUTPUT "ACC 7728.000000\n"; | |
print OUTPUT "ACG 3347.000000\n"; | |
print OUTPUT "ACT 4930.000000\n"; | |
print OUTPUT "AGA 8491.000000\n"; | |
print OUTPUT "AGC 8639.000000\n"; | |
print OUTPUT "AGG 8997.000000\n"; | |
print OUTPUT "AGT 4417.000000\n"; | |
print OUTPUT "ATA 1975.000000\n"; | |
print OUTPUT "ATC 4973.000000\n"; | |
print OUTPUT "ATG 6474.000000\n"; | |
print OUTPUT "ATT 3083.000000\n"; | |
print OUTPUT "CAA 7057.000000\n"; | |
print OUTPUT "CAC 6815.000000\n"; | |
print OUTPUT "CAG 11041.000000\n"; | |
print OUTPUT "CAT 5779.000000\n"; | |
print OUTPUT "CCA 10537.000000\n"; | |
print OUTPUT "CCC 10307.000000\n"; | |
print OUTPUT "CCG 5621.000000\n"; | |
print OUTPUT "CCT 10134.000000\n"; | |
print OUTPUT "CGA 3377.000000\n"; | |
print OUTPUT "CGC 5146.000000\n"; | |
print OUTPUT "CGG 5375.000000\n"; | |
print OUTPUT "CGT 2765.000000\n"; | |
print OUTPUT "CTA 3502.000000\n"; | |
print OUTPUT "CTC 7465.000000\n"; | |
print OUTPUT "CTG 13780.000000\n"; | |
print OUTPUT "CTT 5453.000000\n"; | |
print OUTPUT "GAA 7461.000000\n"; | |
print OUTPUT "GAC 6937.000000\n"; | |
print OUTPUT "GAG 9975.000000\n"; | |
print OUTPUT "GAT 4949.000000\n"; | |
print OUTPUT "GCA 7747.000000\n"; | |
print OUTPUT "GCC 10890.000000\n"; | |
print OUTPUT "GCG 4828.000000\n"; | |
print OUTPUT "GCT 9371.000000\n"; | |
print OUTPUT "GGA 10143.000000\n"; | |
print OUTPUT "GGC 10400.000000\n"; | |
print OUTPUT "GGG 8869.000000\n"; | |
print OUTPUT "GGT 5567.000000\n"; | |
print OUTPUT "GTA 2143.000000\n"; | |
print OUTPUT "GTC 4593.000000\n"; | |
print OUTPUT "GTG 8189.000000\n"; | |
print OUTPUT "GTT 3021.000000\n"; | |
print OUTPUT "TAA 1775.000000\n"; | |
print OUTPUT "TAC 3687.000000\n"; | |
print OUTPUT "TAG 1333.000000\n"; | |
print OUTPUT "TAT 2477.000000\n"; | |
print OUTPUT "TCA 6180.000000\n"; | |
print OUTPUT "TCC 7668.000000\n"; | |
print OUTPUT "TCG 2875.000000\n"; | |
print OUTPUT "TCT 5767.000000\n"; | |
print OUTPUT "TGA 7315.000000\n"; | |
print OUTPUT "TGC 8625.000000\n"; | |
print OUTPUT "TGG 11718.000000\n"; | |
print OUTPUT "TGT 5197.000000\n"; | |
print OUTPUT "TTA 1664.000000\n"; | |
print OUTPUT "TTC 5462.000000\n"; | |
print OUTPUT "TTG 4420.000000\n"; | |
print OUTPUT "TTT 3453.000000\n"; | |
print OUTPUT "//\n\n"; | |
close(OUTPUT); | |
return($errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &new_intron_name | |
sub test_new_intron_name | |
{ | |
my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR | |
my $new_name; # NEW NAME FOR AN INTRON | |
($new_name,$errorcode,$errormsg) = &new_intron_name("KOG0363.1.R.6_3splice",3); | |
if ($errorcode != 0 || $new_name ne 'HELMINTH0000003') { print STDERR "ERROR: test_new_intron_name: failed test1\n"; exit;} | |
($new_name,$errorcode,$errormsg) = &new_intron_name("KOG0363.1.R.6_3splice",33); | |
if ($errorcode != 0 || $new_name ne 'HELMINTH0000033') { print STDERR "ERROR: test_new_intron_name: failed test2\n"; exit;} | |
} | |
#------------------------------------------------------------------# | |
# WORK OUT A NEW NAME FOR AN INTRON: | |
sub new_intron_name | |
{ | |
my $name = $_[0]; # ORIGINAL NAME OF AN INTRON | |
my $no_introns = $_[1]; # NUMBER OF INTRONS WE HAVE SEEN ALREADY | |
my $new_name; # NEW NAME FOR AN INTRON | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none"; # RETURNED AS 'none' IF THERE IS NO ERROR | |
my $length = length($no_introns); # LENGTH OF THE NUMBER OF INTRONS | |
my $no_zeroes = 7 - $length; # NUMBER OF ZEROES TO PUT IN $new_name | |
my $i; # | |
$new_name = "HELMINTH"; | |
for ($i = 1; $i <= $no_zeroes; $i++) { $new_name = $new_name."0";} | |
$new_name = $new_name.$no_introns; | |
return($new_name,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &make_splicesite_fasta | |
sub test_make_splicesite_fasta | |
{ | |
my $outputdir = $_[0]; # DIRECTORY FOR WRITING OUTPUT FILES IN | |
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES | |
my $expected_splicefasta; # EXPECTED SPLICE SITE FASTA FILE | |
my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR | |
my $differences; # DIFFERENCES BETWEEN $new_intron_gff AND $expected_new_intron_gff | |
my $length_differences; # LENGTH OF $differences | |
my $line; # | |
my $splicefasta; # THE SPLICE SITE FASTA FILE | |
my $splicegff; # THE SPLICE SITE GFF FILE | |
my $fasta; # THE GENOME FASTA FILE | |
$random_number = rand(); | |
$splicegff = $outputdir."/tmp".$random_number; | |
open(SPLICEGFF,">$splicegff") || die "ERROR: test_make_splicesite_fasta: cannot open $splicegff\n"; | |
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t11\t15\t.\t+\t.\tGENE1.F.1_3splice\n"; | |
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t31\t35\t.\t+\t.\tGENE1.F.2_3splice\n"; | |
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t41\t45\t.\t+\t.\tGENE1.F.3_3splice\n"; | |
print SPLICEGFF "scaffold2\tcurated\t3_splice_site\t11\t15\t.\t+\t.\tGENE2.F.1_3splice\n"; | |
close(SPLICEGFF); | |
$random_number = rand(); | |
$fasta = $outputdir."/tmp".$random_number; | |
open(FASTA,">$fasta") || die "ERROR: test_make_splicesite_fasta: cannot open $fasta\n"; | |
print FASTA ">scaffold1\n"; | |
print FASTA "ABCDEFGHIJ\n"; | |
print FASTA "KLMNOPQRST\n"; | |
print FASTA "UVWXYZabcd\n"; | |
print FASTA "efghijklmn\n"; | |
print FASTA "opqrstuvwx\n"; | |
print FASTA ">scaffold2\n"; | |
print FASTA "ABCDEFGHIJ\n"; | |
print FASTA "KLMNOPQRST\n"; | |
print FASTA "UVWXYZabcd\n"; | |
print FASTA "efghijklmn\n"; | |
print FASTA "opqrstuvwx\n"; | |
close(FASTA); | |
$random_number = rand(); | |
$expected_splicefasta = $outputdir."/tmp".$random_number; | |
open(EXPECTED,">$expected_splicefasta") || die "ERROR: test_make_splicesite_fasta: cannot open $expected_splicefasta\n"; | |
print EXPECTED ">GENE1.F.1_3splice\n"; | |
print EXPECTED "KLMNO\n"; | |
print EXPECTED ">GENE1.F.2_3splice\n"; | |
print EXPECTED "EFGHI\n"; | |
print EXPECTED ">GENE1.F.3_3splice\n"; | |
print EXPECTED "OPQRS\n"; | |
print EXPECTED ">GENE2.F.1_3splice\n"; | |
print EXPECTED "KLMNO\n"; | |
close(EXPECTED); | |
($splicefasta,$errorcode,$errormsg) = &make_splicesite_fasta($splicegff,$outputdir,$fasta); | |
if ($errorcode != 0) { print STDERR "ERROR: test_make_splicesite_fasta: failed test1 (errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
$differences = ""; | |
open(TEMP,"diff $splicefasta $expected_splicefasta |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0) { print STDERR "ERROR: test_make_splicesite_fasta: failed test1 (files $splicefasta $expected_splicefasta)\n"; exit;} | |
system "rm -f $splicefasta"; | |
system "rm -f $expected_splicefasta"; | |
system "rm -f $splicegff"; | |
system "rm -f $fasta"; | |
$random_number = rand(); | |
$splicegff = $outputdir."/tmp".$random_number; | |
open(SPLICEGFF,">$splicegff") || die "ERROR: test_make_splicesite_fasta: cannot open $splicegff\n"; | |
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t11\t15\t.\t+\t.\tGENE1.F.1_3splice\n"; | |
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t31\t35\t.\t+\t.\tGENE1.F.2_3splice\n"; | |
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t41\t45\t.\t+\t.\tGENE1.F.3_3splice\n"; | |
print SPLICEGFF "scaffold2\tcurated\t3_splice_site\t11\t25\t.\t+\t.\tGENE2.F.1_3splice\n"; | |
close(SPLICEGFF); | |
$random_number = rand(); | |
$fasta = $outputdir."/tmp".$random_number; | |
open(FASTA,">$fasta") || die "ERROR: test_make_splicesite_fasta: cannot open $fasta\n"; | |
print FASTA ">scaffold1\n"; | |
print FASTA "ABCDEFGHIJ\n"; | |
print FASTA "KLMNOPQRST\n"; | |
print FASTA "UVWXYZabcd\n"; | |
print FASTA "efghijklmn\n"; | |
print FASTA "opqrstuvwx\n"; | |
print FASTA ">scaffold2\n"; | |
print FASTA "ABCDEFGHIJ\n"; | |
print FASTA "KLMNOPQRST\n"; | |
print FASTA "UVWXYZabcd\n"; | |
print FASTA "efghijklmn\n"; | |
print FASTA "opqrstuvwx\n"; | |
print FASTA ">scaffold1\n"; | |
print FASTA "ABCDEFGHIJ\n"; | |
print FASTA "KLMNOPQRST\n"; | |
print FASTA "UVWXYZabcd\n"; | |
print FASTA "efghijklmn\n"; | |
close(FASTA); | |
$random_number = rand(); | |
($splicefasta,$errorcode,$errormsg) = &make_splicesite_fasta($splicegff,$outputdir,$fasta); | |
if ($errorcode != 7) { print STDERR "ERROR: test_make_splicesite_fasta: failed test2 (errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
system "rm -f $splicefasta"; | |
system "rm -f $splicegff"; | |
system "rm -f $fasta"; | |
$random_number = rand(); | |
$splicegff = $outputdir."/tmp".$random_number; | |
open(SPLICEGFF,">$splicegff") || die "ERROR: test_make_splicesite_fasta: cannot open $splicegff\n"; | |
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t11\t15\t.\t+\t.\tGENE1.F.1_3splice\n"; | |
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t31\t35\t.\t+\t.\tGENE1.F.2_3splice\n"; | |
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t41\t45\t.\t+\t.\tGENE1.F.3_3splice\n"; | |
print SPLICEGFF "scaffold2\tcurated\t3_splice_site\t25\t11\t.\t+\t.\tGENE2.F.1_3splice\n"; | |
close(SPLICEGFF); | |
$random_number = rand(); | |
$fasta = $outputdir."/tmp".$random_number; | |
open(FASTA,">$fasta") || die "ERROR: test_make_splicesite_fasta: cannot open $fasta\n"; | |
print FASTA ">scaffold1\n"; | |
print FASTA "ABCDEFGHIJ\n"; | |
print FASTA "KLMNOPQRST\n"; | |
print FASTA "UVWXYZabcd\n"; | |
print FASTA "efghijklmn\n"; | |
print FASTA "opqrstuvwx\n"; | |
print FASTA ">scaffold2\n"; | |
print FASTA "ABCDEFGHIJ\n"; | |
print FASTA "KLMNOPQRST\n"; | |
print FASTA "UVWXYZabcd\n"; | |
print FASTA "efghijklmn\n"; | |
print FASTA "opqrstuvwx\n"; | |
close(FASTA); | |
$random_number = rand(); | |
($splicefasta,$errorcode,$errormsg) = &make_splicesite_fasta($splicegff,$outputdir,$fasta); | |
if ($errorcode != 8) { print STDERR "ERROR: test_make_splicesite_fasta: failed test3 (errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
system "rm -f $splicefasta"; | |
system "rm -f $splicegff"; | |
system "rm -f $fasta"; | |
$random_number = rand(); | |
$splicegff = $outputdir."/tmp".$random_number; | |
open(SPLICEGFF,">$splicegff") || die "ERROR: test_make_splicesite_fasta: cannot open $splicegff\n"; | |
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t11\t15\t.\t+\t.\tGENE1.F.1_3splice\n"; | |
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t31\t35\t.\t+\t.\tGENE1.F.2_3splice\n"; | |
print SPLICEGFF "scaffold1\tcurated\t3_splice_site\t41\t45\t.\t+\t.\tGENE1.F.3_3splice\n"; | |
print SPLICEGFF "scaffold3\tcurated\t3_splice_site\t11\t25\t.\t+\t.\tGENE2.F.1_3splice\n"; | |
close(SPLICEGFF); | |
$random_number = rand(); | |
$fasta = $outputdir."/tmp".$random_number; | |
open(FASTA,">$fasta") || die "ERROR: test_make_splicesite_fasta: cannot open $fasta\n"; | |
print FASTA ">scaffold1\n"; | |
print FASTA "ABCDEFGHIJ\n"; | |
print FASTA "KLMNOPQRST\n"; | |
print FASTA "UVWXYZabcd\n"; | |
print FASTA "efghijklmn\n"; | |
print FASTA "opqrstuvwx\n"; | |
print FASTA ">scaffold2\n"; | |
print FASTA "ABCDEFGHIJ\n"; | |
print FASTA "KLMNOPQRST\n"; | |
print FASTA "UVWXYZabcd\n"; | |
print FASTA "efghijklmn\n"; | |
print FASTA "opqrstuvwx\n"; | |
close(FASTA); | |
$random_number = rand(); | |
($splicefasta,$errorcode,$errormsg) = &make_splicesite_fasta($splicegff,$outputdir,$fasta); | |
if ($errorcode != 9) { print STDERR "ERROR: test_make_splicesite_fasta: failed test4 (errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
system "rm -f $splicefasta"; | |
system "rm -f $splicegff"; | |
system "rm -f $fasta"; | |
} | |
#------------------------------------------------------------------# | |
# MAKE A FASTA FILE WITH THE SPLICE SITE SEQUENCES: | |
sub make_splicesite_fasta | |
{ | |
my $splicegff = $_[0]; # THE SPLICE SITE GFF FILE | |
my $outputdir = $_[1]; # THE DIRECTORY FOR WRITING OUTPUT FILES | |
my $fasta = $_[2]; # THE GENOME FASTA FILE | |
my $splicefasta = "none";# THE SPLICE SITE FASTA FILE | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $line; # | |
my @temp; # | |
my $seq = ""; # SEQUENCE OF A CONTIG/CHROMOSOME | |
my %SEQ = (); # HASH TABLE FOR STORING SEQUENCES | |
my $name = "none";# NAME OF A CONTIG/CHROMOSOME | |
my $random_number; # RANDOM NUMBER FOR TEMPORARY FILE NAME | |
my $chunk_start; # START OF THE CHUNK OF DNA WE WANT | |
my $chunk_end; # END OF THE CHUNK OF DNA WE WANT | |
my $chunk_length; # LENGTH OF THE CHUNK OF DNA WE WANT | |
my $seq_length; # LENGTH OF THE CONTIG/CHROMOSOME SEQUENCE | |
my $chunk_seq; # SEQUENCE OF THE CHUNK OF DNA NEAR THE SPLICE SITE | |
my $chunk_name; # NAME OF THE CHUNK OF DNA NEAR THE SPLICE SITE | |
my $offset; # A COUNTER USED IN PRINTING OUT THE SEQUENCE $chunk_seq | |
my $seqline; # A LINE OF THE $chunk_seq | |
my $strand; # STRAND OF THE INTRON | |
# READ IN THE GENOME SEQUENCE FILE: | |
open(FASTA,"$fasta") || die "ERROR: make_splicesite_fasta: cannot open fasta file $fasta.\n"; | |
while(<FASTA>) | |
{ | |
$line = $_; | |
chomp $line; | |
if (substr($line,0,1) eq '>') | |
{ | |
if ($name ne "none") | |
{ | |
$seq =~ tr/[a-z]/[A-Z]/; | |
$seq =~ s/\s+//g; | |
if ($SEQ{$name}) | |
{ | |
$errormsg = "ERROR: make_splicesite_fasta: already have sequence for $name.\n"; | |
$errorcode = 7; | |
return($splicefasta,$errorcode,$errormsg); | |
} | |
$SEQ{$name} = $seq; | |
$seq = ""; | |
} | |
@temp = split(/\s+/,$line); | |
$name = $temp[0]; | |
$name = substr($name,1,length($name)-1); | |
$name =~ tr/[a-z]/[A-Z]/; | |
} | |
else | |
{ | |
$seq = $seq.$line; | |
} | |
} | |
close(FASTA); | |
# STORE THE LAST SEQUENCE: | |
if ($name ne "none") | |
{ | |
$seq =~ tr/[a-z]/[A-Z]/; | |
if ($SEQ{$name}) | |
{ | |
$errormsg = "ERROR: make_splicesite_fasta: already have sequence for $name.\n"; | |
$errorcode = 7; | |
return($splicefasta,$errorcode,$errormsg); | |
} | |
$SEQ{$name} = $seq; | |
$seq = ""; | |
} | |
# READ IN THE GFF FILE, AND PRINT THE OUTPUT FILE: | |
$random_number = rand(); | |
$splicefasta = $outputdir."/tmp".$random_number; | |
open(SPLICEFASTA,">$splicefasta") || die "ERROR: make_splicesite_fasta: cannot open $splicefasta\n"; | |
# READ IN THE GFF FILE: | |
open(GFF,$splicegff) || die "ERROR: make_splicesite_fasta: cannot open $splicegff.\n"; | |
while(<GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
$name = $temp[0]; | |
$name =~ tr/[a-z]/[A-Z]/; | |
$chunk_start = $temp[3]; | |
$chunk_end = $temp[4]; | |
$strand = $temp[6]; | |
$chunk_name = $temp[8]; | |
$chunk_length = $chunk_end - $chunk_start + 1; | |
if ($chunk_length < 0) | |
{ | |
$errormsg = "ERROR: make_splicesite_fasta: chunk_length $chunk_length line $line\n"; | |
$errorcode = 8; # ERRORCODE=8 | |
return($splicefasta,$errorcode,$errormsg); | |
} | |
if (!($SEQ{$name})) | |
{ | |
$errormsg = "ERROR: make_splicesite_fasta: do not know sequence for $name.\n"; | |
$errorcode = 9; # ERRORCODE=9 | |
return($splicefasta,$errorcode,$errormsg); | |
} | |
$seq = $SEQ{$name}; | |
$seq_length = length($seq); | |
if ($seq_length == 0) | |
{ | |
$errormsg = "ERROR: make_splicesite_fasta: seq_length $seq_length for name $name.\n"; | |
$errorcode = 10; # ERRORCODE=10 | |
return($splicefasta,$errorcode,$errormsg); | |
} | |
$chunk_seq = substr($seq,$chunk_start-1,$chunk_length); | |
# FIND THE REVERSE-COMPLEMENT OF THE BASES IF $strand IS "-": | |
if ($strand eq '-') | |
{ | |
($chunk_seq,$errorcode,$errormsg) = &reverse_complement($chunk_seq); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
} | |
# PRINT THE CHUNK $chunk_name IN THE OUTPUT FILE: | |
print SPLICEFASTA ">$chunk_name\n"; | |
# PRINT OUT THE $chunk_seq IN LINES THAT ARE 60 bp LONG: | |
$offset = 0; | |
while ($offset < $chunk_length) | |
{ | |
$seqline = substr($chunk_seq,$offset,60); | |
print SPLICEFASTA "$seqline\n"; | |
$offset = $offset + 60; | |
} | |
} | |
close(GFF); | |
close(SPLICEFASTA); | |
return($splicefasta,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
sub test_reverse_complement | |
{ | |
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR | |
my $seq; # SEQUENCE TO FIND THE REVERSE COMPLEMENT OF | |
my $revcomp; # THE REVERSE COMPLEMENT SEQUENCE | |
($revcomp,$errorcode,$errormsg) = &reverse_complement("AACCTTGG"); | |
if ($errorcode != 0 && $revcomp ne 'CCAAGGTT') { print STDERR "ERROR: test_reverse_complement: failed test1\n"; exit;} | |
($revcomp,$errorcode,$errormsg) = &reverse_complement(''); | |
if ($errorcode != 15) { print STDERR "ERROR: test_reverse_complement: failed test2\n"; exit;} | |
} | |
#------------------------------------------------------------------# | |
# GET THE REVERSE COMPLEMENT OF A SEQUENCE: | |
# SUBROUTINE SYNOPSIS: reverse_complement(): finds the reverse complement of a sequence | |
sub reverse_complement | |
{ | |
my $seq = $_[0]; ## SEQUENCE THAT WE WANT TO FIND THE COMPLEMENT OF | |
my $complement = ""; ## COMPLEMENT OF SEQUENCE $seq | |
my $errorcode = 0; ## RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";## RETURNED AS 'none' IF THERE IS NO ERROR | |
if ($seq eq '') | |
{ | |
$errormsg = "ERROR: find_complement: seq is $seq\n"; | |
$errorcode = 15; # ERRORCODE=15 | |
return($complement,$errorcode,$errormsg); | |
} | |
$seq =~ tr/[a-z]/[A-Z]/; # CHANGE TO UPPERCASE | |
$complement = $seq; | |
# SWAP As WITH Ts: | |
$complement =~ s/A/1/g; # SUBSTITUTE '1' FOR 'A' | |
$complement =~ s/T/A/g; # SUBSTITUTE 'A' FOR 'T' | |
$complement =~ s/1/T/g; # SUBSTITUTE 'T' FOR '1' | |
# SWAP Gs WITH Cs: | |
$complement =~ s/G/1/g; # SUBSTITUTE '1' FOR 'G' | |
$complement =~ s/C/G/g; # SUBSITTUTE 'G' FOR 'C' | |
$complement =~ s/1/C/g; # SUBSTITUTE 'C' FOR '1' | |
# FIND THE REVERSE COMPLEMENT: | |
$complement = reverse $complement; | |
if ($complement eq '') | |
{ | |
$errormsg = "ERROR: reverse_complement: complement $complement (seq $seq)\n"; | |
$errorcode = 16; # ERRORCODE=16 | |
return($complement,$errorcode,$errormsg); | |
} | |
return($complement,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &make_splicesite_gff | |
sub test_make_splicesite_gff | |
{ | |
my $outputdir = $_[0]; ## DIRECTORY FOR WRITING OUTPUT FILES IN | |
my $random_number; ## RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES | |
my $expected_splicegff; ## EXPECTED SPLICE SITE GFF | |
my $errorcode; ## RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; ## RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR | |
my $differences; ## DIFFERENCES BETWEEN $new_intron_gff AND $expected_new_intron_gff | |
my $length_differences; ## LENGTH OF $differences | |
my $line; ## | |
my $intron_gff; ## THE INTRON GFF FILE | |
my $splicegff; ## THE SPLICE SITE GFF FILE | |
$random_number = rand(); | |
$intron_gff = $outputdir."/tmp".$random_number; | |
open(INTRONGFF,">$intron_gff") || die "ERROR: test_remove_duplicate_introns: cannot open $intron_gff\n"; | |
print INTRONGFF "scaffold1\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.f.1\n"; | |
print INTRONGFF "scaffold1\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.f.2\n"; | |
print INTRONGFF "scaffold1\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.f.3\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t116\t119\t.\t.\t.\tgene1.f.1\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t126\t129\t.\t.\t.\tgene1.f.2\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t136\t139\t.\t.\t.\tgene1.f.3\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene2.f.1\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene2.f.2\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene2.f.3\n"; | |
close(INTRONGFF); | |
($splicegff,$errorcode,$errormsg) = &make_splicesite_gff($intron_gff,$outputdir,'threeprime',9,4); | |
if ($errorcode != 0) { print STDERR "ERROR: test_make_splicesite_gff: failed test1 (errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
$random_number = rand(); | |
$expected_splicegff = $outputdir."/tmp".$random_number; | |
open(EXPECTED,">$expected_splicegff") || die "ERROR: test_make_splicesite_gff: cannot open $expected_splicegff\n"; | |
print EXPECTED "scaffold1\tcurated\t3_splice_site\t10\t23\t.\t+\t.\tGENE1.F.1_3splice\n"; | |
print EXPECTED "scaffold1\tcurated\t3_splice_site\t20\t33\t.\t+\t.\tGENE1.F.2_3splice\n"; | |
print EXPECTED "scaffold1\tcurated\t3_splice_site\t30\t43\t.\t+\t.\tGENE1.F.3_3splice\n"; | |
print EXPECTED "scaffold2\tcurated\t3_splice_site\t110\t123\t.\t+\t.\tGENE1.F.1_3splice\n"; | |
print EXPECTED "scaffold2\tcurated\t3_splice_site\t120\t133\t.\t+\t.\tGENE1.F.2_3splice\n"; | |
print EXPECTED "scaffold2\tcurated\t3_splice_site\t130\t143\t.\t+\t.\tGENE1.F.3_3splice\n"; | |
print EXPECTED "scaffold2\tcurated\t3_splice_site\t10\t23\t.\t+\t.\tGENE2.F.1_3splice\n"; | |
print EXPECTED "scaffold2\tcurated\t3_splice_site\t20\t33\t.\t+\t.\tGENE2.F.2_3splice\n"; | |
print EXPECTED "scaffold2\tcurated\t3_splice_site\t30\t43\t.\t+\t.\tGENE2.F.3_3splice\n"; | |
close(EXPECTED); | |
$differences = ""; | |
open(TEMP,"diff $splicegff $expected_splicegff |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0) { print STDERR "ERROR: test_make_splicesite_gff: failed test1 (files $splicegff $expected_splicegff)\n"; exit;} | |
system "rm -f $splicegff"; | |
system "rm -f $expected_splicegff"; | |
system "rm -f $intron_gff"; | |
$random_number = rand(); | |
$intron_gff = $outputdir."/tmp".$random_number; | |
open(INTRONGFF,">$intron_gff") || die "ERROR: test_remove_duplicate_introns: cannot open $intron_gff\n"; | |
print INTRONGFF "scaffold1\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.f.1\n"; | |
print INTRONGFF "scaffold1\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.f.2\n"; | |
print INTRONGFF "scaffold1\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.f.3\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t116\t119\t.\t.\t.\tgene1.f.1\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t126\t129\t.\t.\t.\tgene1.f.2\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t136\t139\t.\t.\t.\tgene1.f.3\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene2.f.1\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene2.f.2\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene2.f.3\n"; | |
close(INTRONGFF); | |
($splicegff,$errorcode,$errormsg) = &make_splicesite_gff($intron_gff,$outputdir,'prime',9,4); | |
if ($errorcode != 6) { print STDERR "ERROR: test_make_splicesite_gff: failed test2 (errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
system "rm -f $intron_gff"; | |
$random_number = rand(); | |
$intron_gff = $outputdir."/tmp".$random_number; | |
open(INTRONGFF,">$intron_gff") || die "ERROR: test_remove_duplicate_introns: cannot open $intron_gff\n"; | |
print INTRONGFF "scaffold1\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.1\n"; | |
print INTRONGFF "scaffold1\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.2\n"; | |
print INTRONGFF "scaffold1\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.3\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t116\t119\t.\t.\t.\tgene1.1\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t126\t129\t.\t.\t.\tgene1.2\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t136\t139\t.\t.\t.\tgene1.3\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene2.1\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene2.2\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene2.3\n"; | |
close(INTRONGFF); | |
($splicegff,$errorcode,$errormsg) = &make_splicesite_gff($intron_gff,$outputdir,'threeprime',9,4); | |
if ($errorcode != 5) { print STDERR "ERROR: test_make_splicesite_gff: failed test3 (errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
system "rm -f $intron_gff"; | |
$random_number = rand(); | |
$intron_gff = $outputdir."/tmp".$random_number; | |
open(INTRONGFF,">$intron_gff") || die "ERROR: test_remove_duplicate_introns: cannot open $intron_gff\n"; | |
print INTRONGFF "scaffold1\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.f.1\n"; | |
print INTRONGFF "scaffold1\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.f.2\n"; | |
print INTRONGFF "scaffold1\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.f.3\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t116\t119\t.\t.\t.\tgene1.f.1\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t126\t129\t.\t.\t.\tgene1.f.2\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t136\t139\t.\t.\t.\tgene1.f.3\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene2.f.1\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene2.f.2\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene2.f.3\n"; | |
close(INTRONGFF); | |
($splicegff,$errorcode,$errormsg) = &make_splicesite_gff($intron_gff,$outputdir,'fiveprime',4,9); | |
if ($errorcode != 0) { print STDERR "ERROR: test_make_splicesite_gff: failed test4 (errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
$random_number = rand(); | |
$expected_splicegff = $outputdir."/tmp".$random_number; | |
open(EXPECTED,">$expected_splicegff") || die "ERROR: test_make_splicesite_gff: cannot open $expected_splicegff\n"; | |
print EXPECTED "scaffold1\tcurated\t5_splice_site\t12\t25\t.\t+\t.\tGENE1.F.1_5splice\n"; | |
print EXPECTED "scaffold1\tcurated\t5_splice_site\t22\t35\t.\t+\t.\tGENE1.F.2_5splice\n"; | |
print EXPECTED "scaffold1\tcurated\t5_splice_site\t32\t45\t.\t+\t.\tGENE1.F.3_5splice\n"; | |
print EXPECTED "scaffold2\tcurated\t5_splice_site\t112\t125\t.\t+\t.\tGENE1.F.1_5splice\n"; | |
print EXPECTED "scaffold2\tcurated\t5_splice_site\t122\t135\t.\t+\t.\tGENE1.F.2_5splice\n"; | |
print EXPECTED "scaffold2\tcurated\t5_splice_site\t132\t145\t.\t+\t.\tGENE1.F.3_5splice\n"; | |
print EXPECTED "scaffold2\tcurated\t5_splice_site\t12\t25\t.\t+\t.\tGENE2.F.1_5splice\n"; | |
print EXPECTED "scaffold2\tcurated\t5_splice_site\t22\t35\t.\t+\t.\tGENE2.F.2_5splice\n"; | |
print EXPECTED "scaffold2\tcurated\t5_splice_site\t32\t45\t.\t+\t.\tGENE2.F.3_5splice\n"; | |
close(EXPECTED); | |
$differences = ""; | |
open(TEMP,"diff $splicegff $expected_splicegff |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0) { print STDERR "ERROR: test_make_splicesite_gff: failed test4 (files $splicegff $expected_splicegff)\n"; exit;} | |
system "rm -f $splicegff"; | |
system "rm -f $expected_splicegff"; | |
system "rm -f $intron_gff"; | |
} | |
#------------------------------------------------------------------# | |
# MAKE A GFF FILE WITH COORDINATES OF REGIONS AROUND INTRON SPLICE SITES: | |
sub make_splicesite_gff | |
{ | |
my $intron_gff = $_[0]; # THE INTRON GFF FILE | |
my $outputdir = $_[1]; # DIRECTORY FOR WRITING OUTPUT FILES TO | |
my $spliceend = $_[2]; # WHICH SPLICE SITE TO USE ('threeprime' OR 'fiveprime') | |
my $fiveprimebp = $_[3]; # BASES TO TAKE 5' OF THE SPLICE SITE (DEFINED BY 3'-MOST BASE OF INTRON FOR 3' SPLICE SITE, OR 5'-MOST FOR 5' SPLICE) | |
my $threeprimebp = $_[4]; # BASES TO TAKE 3' OF THE SPLICE SITE (DEFINED BY 3'-MOST BASE OF INTRON FOR 3' SPLICE SITE, OR 5'-MOST FOR 5' SPLICE) | |
my $line; # | |
my @temp; # | |
my $chromosome; # CHROMOSOME OF THE INTRON | |
my $start; # START OF THE INTRON | |
my $end; # END OF THE INTRON | |
my $intron_strand; # STRAND OF THE INTRON | |
my $intron; # NAME OF THE INTRON | |
my $intron_no; # NUMBER OF THE INTRON IN THE GENE | |
my $strand; # STRAND OF THE GENE, INFERRED FROM THE GENE NAME | |
my $gene; # NAME OF THE GENE | |
my $splicegff = "none";# SPLICE SITE GFF FILE | |
my $splice_start; # START OF THE SPLICE SITE REGION | |
my $splice_end; # END OF THE SPLICE SITE REGION | |
my $splice; # NAME OF THE SPLICE SITE REGION | |
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAME | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $i; # | |
# CHECK THAT $spliceend IS 'threeprime' OR 'fiveprime' | |
if ($spliceend ne 'threeprime' && $spliceend ne 'fiveprime') | |
{ | |
$errormsg = "ERROR: make_splicesite_gff: spliceend $spliceend (should be threeprime/fiveprime)\n"; | |
$errorcode = 6; # ERRORCODE=6 | |
return($splicegff,$errorcode,$errormsg); | |
} | |
# READ THROUGH THE INTRON GFF FILE: | |
$random_number = rand(); | |
$splicegff = $outputdir."/tmp".$random_number; | |
open(SPLICEGFF,">$splicegff") || die "ERROR: make_splicesite_gff: cannot open $splicegff\n"; | |
open(INTRON_GFF,"$intron_gff") || die "ERROR: make_splicesite_gff: cannot open $intron_gff.\n"; | |
while(<INTRON_GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
$chromosome = $temp[0]; | |
$start = $temp[3]; | |
$end = $temp[4]; | |
$intron = $temp[8]; | |
@temp = split(/\./,$intron); | |
$intron_no = $temp[$#temp]; | |
# FIND THE INTRON STRAND FROM THE INTRON NAME: | |
if ($intron =~ /\.r/ || $intron =~ /\.f/) | |
{ | |
$strand = $temp[$#temp-1]; | |
$gene = ""; | |
for ($i = 0; $i < $#temp-1; $i++) { $gene = $gene.".".$temp[$i];} | |
$gene = substr($gene,1,length($gene)-1); | |
$gene =~ tr/[a-z]/[A-Z]/; | |
$intron = $gene.".".$strand.".".$intron_no; | |
$intron =~ tr/[a-z]/[A-Z]/; | |
# DEFINE THE SPLICE SITE REGION | |
if ($strand eq 'f') | |
{ | |
# THE SPLICE SITE IS DEFINED BY 3'-MOST BASE OF INTRON, FOR 3' SPLICE SITE: | |
if ($spliceend eq 'threeprime') | |
{ | |
$splice_start= $end - $fiveprimebp; | |
$splice_end = $end + $threeprimebp; | |
} | |
# THE SPLICE SITE IS DEFINED BY 5'-MOST BASE OF INTRON, FOR 5' SPLICE SITE: | |
elsif ($spliceend eq 'fiveprime') | |
{ | |
$splice_start = $start - $fiveprimebp; | |
$splice_end = $start + $threeprimebp; | |
} | |
$intron_strand = "+"; | |
} | |
elsif ($strand eq 'r' || $strand eq 'R') | |
{ | |
# THE SPLICE SITE IS DEFINED BY 3'-MOST BASE OF INTRON, FOR 3' SPLICE SITE: | |
if ($spliceend eq 'threeprime') | |
{ | |
$splice_start = $start - $threeprimebp; | |
$splice_end = $start + $fiveprimebp; | |
} | |
# THE SPLICE SITE IS DEFINED BY 5'-MOST BASE OF INTRON, FOR 5' SPLICE SITE: | |
elsif ($spliceend eq 'fiveprime') | |
{ | |
$splice_start = $end - $threeprimebp; | |
$splice_end = $end + $fiveprimebp; | |
} | |
$intron_strand = "-"; | |
} | |
if ($spliceend eq 'threeprime') { $splice = $intron."_3splice"; } | |
elsif ($spliceend eq 'fiveprime') { $splice = $intron."_5splice"; } | |
if ($spliceend eq 'threeprime') | |
{ | |
print SPLICEGFF "$chromosome\tcurated\t3_splice_site\t$splice_start\t$splice_end\t.\t$intron_strand\t.\t$splice\n"; | |
} | |
elsif ($spliceend eq 'fiveprime') | |
{ | |
print SPLICEGFF "$chromosome\tcurated\t5_splice_site\t$splice_start\t$splice_end\t.\t$intron_strand\t.\t$splice\n"; | |
} | |
} | |
else | |
{ | |
$errormsg = "ERROR: make_splice_site_gff: intron name $intron\n"; | |
$errorcode = 5; # ERRORCODE=5 | |
return($splicegff,$errorcode,$errormsg); | |
} | |
} | |
close(INTRON_GFF); | |
close(SPLICEGFF); | |
return($splicegff,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &remove_duplicate_introns | |
sub test_remove_duplicate_introns | |
{ | |
my $outputdir = $_[0]; ## DIRECTORY FOR WRITING OUTPUT FILES IN | |
my $random_number; ## RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES | |
my $expected_new_intron_gff; ## EXPECTED NEW INTRON GFF | |
my $errorcode; ## RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; ## RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR | |
my $differences; ## DIFFERENCES BETWEEN $new_intron_gff AND $expected_new_intron_gff | |
my $length_differences; ## LENGTH OF $differences | |
my $line; ## | |
my $intron_gff; ## THE INTRON GFF FILE | |
my $new_intron_gff; ## THE NEW INTRON GFF FILE | |
$random_number = rand(); | |
$intron_gff = $outputdir."/tmp".$random_number; | |
open(INTRONGFF,">$intron_gff") || die "ERROR: test_remove_duplicate_introns: cannot open $intron_gff\n"; | |
print INTRONGFF "scaffold1\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.f.1\n"; | |
print INTRONGFF "scaffold1\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.f.2\n"; | |
print INTRONGFF "scaffold1\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.f.3\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t116\t119\t.\t.\t.\tgene1.f.1\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t126\t129\t.\t.\t.\tgene1.f.2\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t136\t139\t.\t.\t.\tgene1.f.3\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene2.f.1\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene2.f.2\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene2.f.3\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene3.f.1\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene3.f.2\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene3.f.3\n"; | |
close(INTRONGFF); | |
($new_intron_gff,$errorcode,$errormsg) = &remove_duplicate_introns($intron_gff,$outputdir); | |
if ($errorcode != 0) { print STDERR "ERROR: test_remove_duplicate_introns: failed test1 (errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
$random_number = rand(); | |
$expected_new_intron_gff = $outputdir."/tmp".$random_number; | |
open(EXPECTED,">$expected_new_intron_gff") || die "ERROR: test_remove_duplicate_introns: cannot open $expected_new_intron_gff\n"; | |
print EXPECTED "scaffold1\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.f.1\n"; | |
print EXPECTED "scaffold1\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.f.2\n"; | |
print EXPECTED "scaffold1\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.f.3\n"; | |
print EXPECTED "scaffold2\tcurated\tintron\t116\t119\t.\t.\t.\tgene1.f.1\n"; | |
print EXPECTED "scaffold2\tcurated\tintron\t126\t129\t.\t.\t.\tgene1.f.2\n"; | |
print EXPECTED "scaffold2\tcurated\tintron\t136\t139\t.\t.\t.\tgene1.f.3\n"; | |
print EXPECTED "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene2.f.1\n"; | |
print EXPECTED "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene2.f.2\n"; | |
print EXPECTED "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene2.f.3\n"; | |
close(EXPECTED); | |
$differences = ""; | |
open(TEMP,"diff $new_intron_gff $expected_new_intron_gff |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0) { print STDERR "ERROR: test_remove_duplicate_introns: failed test1 (files $new_intron_gff $expected_new_intron_gff)\n"; exit;} | |
system "rm -f $new_intron_gff"; | |
system "rm -f $expected_new_intron_gff"; | |
system "rm -f $intron_gff"; | |
$random_number = rand(); | |
$intron_gff = $outputdir."/tmp".$random_number; | |
open(INTRONGFF,">$intron_gff") || die "ERROR: test_remove_duplicate_introns: cannot open $intron_gff\n"; | |
print INTRONGFF "scaffold1\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.f.1\n"; | |
print INTRONGFF "scaffold1\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.f.2\n"; | |
print INTRONGFF "\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.f.3\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.f.1\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.f.2\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.f.3\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene2.f.1\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene2.f.2\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene2.f.3\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene3.f.1\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene3.f.2\n"; | |
print INTRONGFF "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene3.f.3\n"; | |
close(INTRONGFF); | |
($new_intron_gff,$errorcode,$errormsg) = &remove_duplicate_introns($intron_gff,$outputdir); | |
if ($errorcode != 4) { print STDERR "ERROR: test_remove_duplicate_introns: failed test2 (errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
system "rm -f $new_intron_gff"; | |
system "rm -f $intron_gff"; | |
} | |
#------------------------------------------------------------------# | |
# REMOVE ANY DUPLICATE INTRONS (WITH THE SAME COORDINATES) FROM THE GFF FILE OF INTRONS: | |
# DUPLICATE INTRONS CAN ARISE IF THE TRAINING DATA INCLUDES SEVERAL SPLICE-FORMS OF THE | |
# SAME GENE, THAT SHARE INTRONS. | |
sub remove_duplicate_introns | |
{ | |
my $intron_gff = $_[0]; # GFF FILE OF INTRONS | |
my $outputdir = $_[1]; # DIRECTORY FOR WRITING OUTPUT FILES TO | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $new_intron_gff = "none";# A NEW INTRON GFF FILE, WITH DUPLICATE INTRONS REMOVED | |
my %SEEN = (); # A HASH TABLE TO RECORD WHETHER WE HAVE SEEN AN INTRON BEFORE | |
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES | |
my $line; # | |
my @temp; # | |
my $chrom; # CHROMOSOME OF AN INTRON | |
my $start; # START OF AN INTRON | |
my $end; # END OF AN INTRON | |
my $strand; # STRAND OF AN INTRON | |
my $coords; # COORDINATES OF AN INTRON | |
$random_number = rand(); | |
$new_intron_gff = $outputdir."/tmp".$random_number; | |
open(NEWINTRONGFF,">$new_intron_gff") || die "ERROR: remove_duplicate_introns: cannot open $new_intron_gff\n"; | |
open(INTRONGFF,"$intron_gff") || die "ERROR: remove_duplicate_introns: cannot open $intron_gff.\n"; | |
while(<INTRONGFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
$chrom = $temp[0]; | |
$start = $temp[3]; | |
$end = $temp[4]; | |
$strand = $temp[6]; | |
if ($chrom eq '' || $start eq '' || $end eq '' || $strand eq '') | |
{ | |
$errormsg = "ERROR: remove_duplicate_introns: chrom $chrom start $start end $end strand $strand line $line\n"; | |
$errorcode = 4; # ERRORCODE=4 | |
return($new_intron_gff,$errorcode,$errormsg); | |
} | |
$coords = $chrom."=".$start."=".$end."=".$strand; | |
if (!($SEEN{$coords})) | |
{ | |
print NEWINTRONGFF "$line\n"; | |
$SEEN{$coords} = 1; | |
} | |
} | |
close(INTRONGFF); | |
close(NEWINTRONGFF); | |
system "rm -f $intron_gff"; | |
return($new_intron_gff,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &make_intron_gff | |
sub test_make_intron_gff | |
{ | |
my $outputdir = $_[0]; ## DIRECTORY FOR WRITING OUTPUT FILES IN | |
my $random_number; ## RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES | |
my $exon_gff; ## SORTED GFF FILE OF EXONS IN THE TRAINING SET | |
my $expected_intron_gff; ## EXPECTED INTRON GFF | |
my $errorcode; ## RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; ## RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR | |
my $differences; ## DIFFERENCES BETWEEN $intron_gff AND $expected_intron_gff | |
my $length_differences; ## LENGTH OF $differences | |
my $line; ## | |
my $intron_gff; ## THE INTRON GFF FILE | |
# TEST CASE WHEN exons_names_sameas_genes = 'yes': | |
$random_number = rand(); | |
$exon_gff = $outputdir."/tmp".$random_number; | |
open(EXONGFF,">$exon_gff") || die "ERROR: test_sort_exon_gff: cannot open $exon_gff\n"; | |
print EXONGFF "scaffold1\tcegma\tFirst\t10\t15\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold1\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold1\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold1\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold2\tcegma\tFirst\t10\t15\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold2\tcegma\tFirst\t10\t15\t5.5\t+\t0\tgene2\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene2\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene2\n"; | |
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene2\n"; | |
close(EXONGFF); | |
($intron_gff,$errorcode,$errormsg) = &make_intron_gff($exon_gff,$outputdir,"yes"); | |
if ($errorcode != 0) { print STDERR "ERROR: test_make_intron_gff: failed test1 (errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
$random_number = rand(); | |
$expected_intron_gff = $outputdir."/tmp".$random_number; | |
open(EXPECTED,">$expected_intron_gff") || die "ERROR: test_make_intron_gff: cannot open $expected_intron_gff\n"; | |
print EXPECTED "scaffold1\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.f.1\n"; | |
print EXPECTED "scaffold1\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.f.2\n"; | |
print EXPECTED "scaffold1\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.f.3\n"; | |
print EXPECTED "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.f.1\n"; | |
print EXPECTED "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.f.2\n"; | |
print EXPECTED "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.f.3\n"; | |
print EXPECTED "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene2.f.1\n"; | |
print EXPECTED "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene2.f.2\n"; | |
print EXPECTED "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene2.f.3\n"; | |
close(EXPECTED); | |
$differences = ""; | |
open(TEMP,"diff $intron_gff $expected_intron_gff |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0) { print STDERR "ERROR: test_make_intron_gff: failed test1 (files $intron_gff $expected_intron_gff)\n"; exit;} | |
system "rm -f $intron_gff"; | |
system "rm -f $expected_intron_gff"; | |
system "rm -f $exon_gff"; | |
# TEST CASE WHEN exons_names_sameas_genes = 'no': | |
$random_number = rand(); | |
$exon_gff = $outputdir."/tmp".$random_number; | |
open(EXONGFF,">$exon_gff") || die "ERROR: test_sort_exon_gff: cannot open $exon_gff\n"; | |
print EXONGFF "scaffold1\tcegma\tFirst\t10\t15\t5.5\t+\t0\tgene1.1\n"; | |
print EXONGFF "scaffold1\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene1.2\n"; | |
print EXONGFF "scaffold1\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene1.3\n"; | |
print EXONGFF "scaffold1\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1.4\n"; | |
print EXONGFF "scaffold2\tcegma\tFirst\t10\t15\t5.5\t+\t0\tgene1.1\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene1.2\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene1.3\n"; | |
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1.4\n"; | |
print EXONGFF "scaffold2\tcegma\tFirst\t10\t15\t5.5\t+\t0\tgene2.1\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene2.2\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene2.3\n"; | |
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene2.4\n"; | |
close(EXONGFF); | |
($intron_gff,$errorcode,$errormsg) = &make_intron_gff($exon_gff,$outputdir,"no"); | |
if ($errorcode != 0) { print STDERR "ERROR: test_make_intron_gff: failed test2\n"; exit;} | |
$random_number = rand(); | |
$expected_intron_gff = $outputdir."/tmp".$random_number; | |
open(EXPECTED,">$expected_intron_gff") || die "ERROR: test_make_intron_gff: cannot open $expected_intron_gff\n"; | |
print EXPECTED "scaffold1\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.f.1\n"; | |
print EXPECTED "scaffold1\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.f.2\n"; | |
print EXPECTED "scaffold1\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.f.3\n"; | |
print EXPECTED "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene1.f.1\n"; | |
print EXPECTED "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene1.f.2\n"; | |
print EXPECTED "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene1.f.3\n"; | |
print EXPECTED "scaffold2\tcurated\tintron\t16\t19\t.\t.\t.\tgene2.f.1\n"; | |
print EXPECTED "scaffold2\tcurated\tintron\t26\t29\t.\t.\t.\tgene2.f.2\n"; | |
print EXPECTED "scaffold2\tcurated\tintron\t36\t39\t.\t.\t.\tgene2.f.3\n"; | |
close(EXPECTED); | |
$differences = ""; | |
open(TEMP,"diff $intron_gff $expected_intron_gff |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0) { print STDERR "ERROR: test_make_intron_gff: failed test2 (files $intron_gff $expected_intron_gff)\n"; exit;} | |
system "rm -f $intron_gff"; | |
system "rm -f $expected_intron_gff"; | |
system "rm -f $exon_gff"; | |
# TEST CASE WHEN exons_names_sameas_genes IS NOT 'yes' OR 'no': | |
$random_number = rand(); | |
$exon_gff = $outputdir."/tmp".$random_number; | |
open(EXONGFF,">$exon_gff") || die "ERROR: test_sort_exon_gff: cannot open $exon_gff\n"; | |
print EXONGFF "scaffold1\tcegma\tFirst\t10\t15\t5.5\t+\t0\tgene1.1\n"; | |
print EXONGFF "scaffold1\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene1.2\n"; | |
print EXONGFF "scaffold1\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene1.3\n"; | |
print EXONGFF "scaffold1\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1.4\n"; | |
print EXONGFF "scaffold2\tcegma\tFirst\t10\t15\t5.5\t+\t0\tgene1.1\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene1.2\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene1.3\n"; | |
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1.4\n"; | |
print EXONGFF "scaffold2\tcegma\tFirst\t10\t15\t5.5\t+\t0\tgene2.1\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene2.2\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene2.3\n"; | |
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene2.4\n"; | |
close(EXONGFF); | |
($intron_gff,$errorcode,$errormsg) = &make_intron_gff($exon_gff,$outputdir,"hello"); | |
if ($errorcode != 2) { print STDERR "ERROR: test_make_intron_gff: failed test3\n"; exit;} | |
system "rm -f $exon_gff"; | |
# TEST CASE WHEN INTRON LENGTHS ARE 0: | |
$random_number = rand(); | |
$exon_gff = $outputdir."/tmp".$random_number; | |
open(EXONGFF,">$exon_gff") || die "ERROR: test_sort_exon_gff: cannot open $exon_gff\n"; | |
print EXONGFF "scaffold1\tcegma\tFirst\t10\t20\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold1\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold1\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold1\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold2\tcegma\tFirst\t10\t15\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold2\tcegma\tFirst\t10\t15\t5.5\t+\t0\tgene2\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t20\t25\t5.5\t+\t0\tgene2\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t30\t35\t5.5\t+\t0\tgene2\n"; | |
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene2\n"; | |
close(EXONGFF); | |
($intron_gff,$errorcode,$errormsg) = &make_intron_gff($exon_gff,$outputdir,"yes"); | |
if ($errorcode != 3) { print STDERR "ERROR: test_make_intron_gff: failed test4\n"; exit;} | |
system "rm -f $intron_gff"; | |
system "rm -f $exon_gff"; | |
} | |
#------------------------------------------------------------------# | |
# MAKE A GFF FILE OF THE INTRONS: | |
sub make_intron_gff | |
{ | |
my $sorted_exon_gff = $_[0]; # SORTED GFF FILE OF EXONS | |
my $outputdir = $_[1]; # DIRECTORY FOR WRITING OUTPUT FILES IN | |
my $exons_names_sameas_genes = $_[2]; # SAYS WHETHER THE EXONS ARE NAMED THE SAME AS THE GENES IN THE INPUT GFF FILE | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none"; # RETURNED AS 'none' IF THERE IS NO ERROR | |
my $intron_gff = "none"; # THE INTRON GFF FILE | |
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES | |
my $line; # | |
my @temp; # | |
my $exon; # EXON NAME | |
my $gene; # GENE NAME | |
my @temp2; # | |
my $i; # | |
my $prev_gene = "none"; # PREVIOUS GENE NAME | |
my $chrom; # NAME OF THE CHROMOSOME | |
my $prev_chrom = "none"; # NAME OF THE PREVIOUS CHROMOSOME | |
my $intron_no = 0; # NUMBER OF AN INTRON ALONG A GENE | |
my $exon_start; # START COORDINATE OF EXON | |
my $exon_end; # END COORDINATE OF EXON | |
my $strand; # STRAND OF EXON | |
my $prev_exon_end; # END COORDINATE OF THE LAST EXON | |
my $intron_start; # START COORDINATE OF THE INTRON | |
my $intron_end; # END COORDINATE OF THE INTRON | |
my $intron; # NAME FOR AN INTRON | |
# CHECK $exons_names_sameas_genes IS 'yes' OR 'no': | |
if ($exons_names_sameas_genes ne 'yes' && $exons_names_sameas_genes ne 'no') | |
{ | |
$errormsg = "ERROR: make_intron_gff: exons_names_sameas_genes is $exons_names_sameas_genes (should be yes/no)\n"; | |
$errorcode = 2; # ERRORCODE=2 | |
return($intron_gff,$errorcode,$errormsg); | |
} | |
# OPEN AN OUTPUT INTRON GFF FILE: | |
$random_number = rand(); | |
$intron_gff = $outputdir."/tmp".$random_number; | |
open(INTRONGFF,">$intron_gff") || die "ERROR: make_intron_gff: cannot open $intron_gff\n"; | |
# READ THROUGH THE SORTED EXON GFF: | |
open(EXONGFF,"$sorted_exon_gff") || die "ERROR: make_intron_gff: cannot open $sorted_exon_gff\n"; | |
while(<EXONGFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
$chrom = $temp[0]; | |
$exon = $temp[8]; # eg. KOG0018.10 | |
$exon_start = $temp[3]; | |
$exon_end = $temp[4]; | |
$strand = $temp[6]; | |
if ($strand eq '+') { $strand = "f";} elsif ($strand eq '-') { $strand = "r";} | |
# GET THE NAME OF THE GENE: | |
if ($exon =~ /\"/) # SOME GFF FILES HAVE THE EXON NAME IN "" IN THE LAST COLUMN | |
{ | |
@temp2 = split(/\"/,$exon); | |
$exon = $temp2[1]; | |
} | |
if ($exon =~ /Parent=CDS:/) # WORMBASES USES THIS FORMAT IN SOME GFF FILES | |
{ | |
@temp2 = split(/Parent=CDS:/,$exon); | |
$exon = $temp2[1]; | |
} | |
# IF THE EXONS ARE JUST NAMED THE SAME AS THE GENES: | |
if ($exons_names_sameas_genes eq 'yes') | |
{ | |
$gene = $exon; | |
} | |
else # WE ASSUME THAT THE EXON NAMES ARE MADE BY ADDING .1,.2,.3, etc. TO THE GENE NAME, AS IN C. ELEGANS: | |
{ | |
$gene = ""; | |
@temp2 = split(/\./,$exon); | |
for ($i = 0; $i <= $#temp2-1; $i++) { $gene = $gene.".".$temp2[$i];} | |
if ($gene eq '') { $gene = $exon; } | |
else { $gene = substr($gene,1,length($gene)-1);} | |
} | |
# IF THE GENE NAME IS THE SAME AS THE NAME OF THE PREVIOUS GENE ON THIS CHROMOSOME, THE REGION BETWEEN IS ASSUMED TO BE AN INTRON: | |
if ($gene eq $prev_gene && $chrom eq $prev_chrom) | |
{ | |
$intron_no++; | |
$intron_start = $prev_exon_end + 1; | |
$intron_end = $exon_start - 1; | |
if ($intron_start > $intron_end) | |
{ | |
# THIS COULD HAPPEN BECAUSE OF AN ERROR IN THE GFF FILE WHERE THERE | |
# IS AN INTRON OF LENGTH 0: | |
$errormsg = "ERROR: make_intron_gff: intron_start $intron_start before end $intron_end for gene $gene on chrom $chrom.\n"; | |
$errorcode = 3; # ERRORCODE=3 | |
return($intron_gff,$errorcode,$errormsg); | |
} | |
else | |
{ | |
$intron = $gene.".".$strand.".".$intron_no; | |
print INTRONGFF "$chrom curated intron $intron_start $intron_end . . . $intron\n"; | |
} | |
} | |
else | |
{ | |
$intron_no = 0; | |
} | |
$prev_gene = $gene; | |
$prev_chrom = $chrom; | |
$prev_exon_end = $exon_end; | |
} | |
close(EXONGFF); | |
# CLOSE THE INTRON GFF FILE: | |
close(INTRONGFF); | |
# DELETE THE SORTED EXON GFF, IT IS NO LONGER NEEDED: | |
system "rm -f $sorted_exon_gff"; | |
return($intron_gff,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &sort_exon_gff | |
sub test_sort_exon_gff | |
{ | |
my $outputdir = $_[0]; # DIRECTORY FOR WRITING OUTPUT FILES IN | |
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAMES | |
my $exon_gff; # GFF FILE OF EXONS IN THE TRAINING SET | |
my $expected_sorted_gff; # EXPECTED SORTED EXON GFF | |
my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR | |
my $sorted_exon_gff; # SORTED EXON GFF FILE | |
my $differences; # DIFFERENCES BETWEEN $sorted_exon_gff AND $expected_sorted_gff | |
my $length_differences; # LENGTH OF $differences | |
my $line; # | |
$random_number = rand(); | |
$exon_gff = $outputdir."/tmp".$random_number; | |
open(EXONGFF,">$exon_gff") || die "ERROR: test_sort_exon_gff: cannot open $exon_gff\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t30\t40\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold1\tcegma\tFirst\t10\t20\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t20\t30\t5.5\t+\t0\tgene2\n"; | |
print EXONGFF "scaffold1\tcegma\tInternal\t20\t30\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold2\tcegma\tFirst\t10\t20\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold1\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t20\t30\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t30\t40\t5.5\t+\t0\tgene2\n"; | |
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold1\tcegma\tInternal\t30\t40\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold2\tcegma\tFirst\t10\t20\t5.5\t+\t0\tgene2\n"; | |
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene2\n"; | |
close(EXONGFF); | |
($sorted_exon_gff,$errorcode,$errormsg) = &sort_exon_gff($exon_gff,$outputdir); | |
if ($errorcode != 0) { print STDERR "ERROR: test_sort_exon_gff: failed test1 (errorcode $errorcode)\n"; exit;} | |
$random_number = rand(); | |
$expected_sorted_gff = $outputdir."/tmp".$random_number; | |
open(EXPECTED,">$expected_sorted_gff") || die "ERROR: test_sort_exon_gff: cannot open $expected_sorted_gff\n"; | |
print EXPECTED "scaffold1\tcegma\tFirst\t10\t20\t5.5\t+\t0\tgene1\n"; | |
print EXPECTED "scaffold1\tcegma\tInternal\t20\t30\t5.5\t+\t0\tgene1\n"; | |
print EXPECTED "scaffold1\tcegma\tInternal\t30\t40\t5.5\t+\t0\tgene1\n"; | |
print EXPECTED "scaffold1\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1\n"; | |
print EXPECTED "scaffold2\tcegma\tFirst\t10\t20\t5.5\t+\t0\tgene1\n"; | |
print EXPECTED "scaffold2\tcegma\tInternal\t20\t30\t5.5\t+\t0\tgene1\n"; | |
print EXPECTED "scaffold2\tcegma\tInternal\t30\t40\t5.5\t+\t0\tgene1\n"; | |
print EXPECTED "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1\n"; | |
print EXPECTED "scaffold2\tcegma\tFirst\t10\t20\t5.5\t+\t0\tgene2\n"; | |
print EXPECTED "scaffold2\tcegma\tInternal\t20\t30\t5.5\t+\t0\tgene2\n"; | |
print EXPECTED "scaffold2\tcegma\tInternal\t30\t40\t5.5\t+\t0\tgene2\n"; | |
print EXPECTED "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene2\n"; | |
close(EXPECTED); | |
$differences = ""; | |
open(TEMP,"diff $sorted_exon_gff $expected_sorted_gff |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0) { print STDERR "ERROR: test_sort_exon_gff: failed test1 (files $sorted_exon_gff $expected_sorted_gff)\n"; exit;} | |
system "rm -f $sorted_exon_gff"; | |
system "rm -f $expected_sorted_gff"; | |
system "rm -f $exon_gff"; | |
$random_number = rand(); | |
$exon_gff = $outputdir."/tmp".$random_number; | |
open(EXONGFF,">$exon_gff") || die "ERROR: test_sort_exon_gff: cannot open $exon_gff\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t30\t40\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold1\tcegma\tFirst\t10\t20\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t20\t30\t5.5\t+\t0\tgene2\n"; | |
print EXONGFF "scaffold1\tcegma\tInternal\t20\t30\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold2\tcegma\tCDS\t10\t20\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold1\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t20\t30\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold2\tcegma\tInternal\t30\t40\t5.5\t+\t0\tgene2\n"; | |
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold1\tcegma\tInternal\t30\t40\t5.5\t+\t0\tgene1\n"; | |
print EXONGFF "scaffold2\tcegma\tFirst\t10\t20\t5.5\t+\t0\tgene2\n"; | |
print EXONGFF "scaffold2\tcegma\tTerminal\t40\t50\t5.5\t+\t0\tgene2\n"; | |
close(EXONGFF); | |
($sorted_exon_gff,$errorcode,$errormsg) = &sort_exon_gff($exon_gff,$outputdir); | |
if ($errorcode != 1) { print STDERR "ERROR: test_sort_exon_gff: failed test2\n"; exit;} | |
system "rm -f $exon_gff"; | |
} | |
#------------------------------------------------------------------# | |
# SORT THE GFF FILE BY CHROMOSOME, THEN BY GENE, THEN BY START IN THE GENE: | |
sub sort_exon_gff | |
{ | |
my $exon_gff = $_[0]; # THE GFF FILE OF EXONS IN THE TRAINING SET | |
my $outputdir = $_[1]; # DIRECTORY FOR WRITING OUTPUT FILES TO | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $line; # | |
my @temp; # | |
my $feature_type; # FEATURE TYPE IN THE GFF FILE | |
my $sorted_exon_gff = "none";# SORTED EXON GFF FILE | |
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAME | |
my $cmd; # COMMAND TO RUN | |
# CHECK THAT THE $exon_gff FILE ONLY HAS FEATURES OF ONE TYPE 'Single', 'First', 'Internal', 'Terminal', 'Initial', or 'coding_exon' (DIFFERENT | |
# NAMES USED FOR CODING EXONS IN DIFFERENT GFF FILES): | |
open(EXONGFF,"$exon_gff") || die "ERROR: sort_exon_gff: cannot open $exon_gff\n"; | |
while(<EXONGFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
$feature_type = $temp[2]; | |
$feature_type =~ tr/[A-Z]/[a-z]/; | |
if ($feature_type ne 'first' && $feature_type ne 'internal' && $feature_type ne 'terminal' && $feature_type ne 'single' && | |
$feature_type ne 'coding_exon' && $feature_type ne 'initial') | |
{ | |
$errormsg = "ERROR: sort_exon_gff: $exon_gff has features of type $feature_type (should be first/initial/internal/terminal/single/coding_exon)\n"; | |
$errorcode = 1; # ERRORCODE=1 | |
return($sorted_exon_gff,$errorcode,$errormsg); | |
} | |
} | |
close(EXONGFF); | |
# SORT THE EXON GFF FILE: | |
$random_number = rand(); | |
$sorted_exon_gff = $outputdir."/tmp".$random_number; | |
$cmd = "sort $exon_gff -k1,1 -k9,9 -k4,4n > $sorted_exon_gff"; | |
system "$cmd"; | |
sleep(1); | |
return($sorted_exon_gff,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &print_error | |
sub test_print_error | |
{ | |
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR | |
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR | |
($errormsg,$errorcode) = &print_error(45,45,1); | |
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;} | |
($errormsg,$errorcode) = &print_error('My error message','My error message',1); | |
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;} | |
($errormsg,$errorcode) = &print_error('none',45,1); | |
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;} | |
($errormsg,$errorcode) = &print_error('My error message', 0, 1); | |
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;} | |
} | |
#------------------------------------------------------------------# | |
# PRINT OUT AN ERROR MESSAGE AND EXIT. | |
sub print_error | |
{ | |
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED. | |
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED. | |
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT | |
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; | |
exit; | |
} | |
} | |
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/)) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; | |
exit; | |
} | |
} | |
if ($errormsg eq 'none' || $errorcode == 0) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; | |
exit; | |
} | |
} | |
else | |
{ | |
print STDERR "$errormsg"; | |
exit; | |
} | |
return($errormsg,$errorcode); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment