Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created August 27, 2013 14:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/6354195 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/6354195 to your computer and use it in GitHub Desktop.
Perl script that finds the set of highest-scoring non-overlapping genes in a gff file.
#!/usr/bin/env perl
=head1 NAME
find_best_nonoverlapping_genes.pl
=head1 SYNOPSIS
find_best_nonoverlapping_genes.pl input_gff path_to_bedtools output_gff outputdir min_gene_score from_maker subtract_from
where input_gff is the input GeneWise gff file,
path_to_bedtools is the path to the bedtools installation,
output_gff is the output gff file,
outputdir is the output directory for writing output files,
min_gene_score is the minimum score for a gene to be retained,
from_maker says whether the gff file is from maker (yes/no),
substract_from is the number to subtract the score from (eg. 1 for maker, none if you don't want this option)
=head1 DESCRIPTION
This script takes a gff file of GeneWise predictions, and for each set
of overlapping predictions, it takes just the highest scoring gene prediction.
The highest scoring gene predictions are put in the output gff (<output_gff>).
Only genes with scores of >=min_gene_score are kept.
=head1 VERSION
Perl script last edited 9-Jan-2013.
=head1 CONTACT
alc@sanger.ac.uk (Avril Coghlan)
=cut
#
# Perl script find_best_nonoverlapping_genes.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 9-Jan-13.
# Last edited 9-Jan-2013.
# SCRIPT SYNOPSIS: find_best_nonoverlapping_genes.pl: finds set of highest-scoring non-overlapping genes in a gff file
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
use strict;
use warnings;
my $num_args = $#ARGV + 1;
if ($num_args != 7)
{
print "Usage of find_best_nonoverlapping_genes.pl\n\n";
print "perl find_best_nonoverlapping_genes.pl <input_gff> <path_to_bedtools> <output_gff> <outputdir> <min_gene_score> <from_maker> <subtract_from>\n";
print "where <input_gff> is the input GeneWise gff file,\n";
print " <path_to_bedtools> is the path to the bedtools installation,\n";
print " <output_gff> is the output gff file,\n";
print " <outputdir> is the output directory for writing output files,\n";
print " <min_gene_score> is the minimum score for a gene to be retained,\n";
print " <from_maker> says whether the gff file is from maker (yes/no)\n";
print " <subtract_from> says the number to subtract the score from (eg. 1 for maker, none if you don't want this option)\n";
print "For example, >perl find_best_nonoverlapping_genes.pl TF.gff\n";
print "/software/pathogen/external/apps/usr/bin/bedtools\n";
print "TF_output.gff /lustre/scratch108/parasites/alc/Genewise50Helminths/gff_file 10 no none\n";
exit;
}
# FIND THE PATH TO THE INPUT GFF FILE:
my $input_gff = $ARGV[0];
# FIND THE PATH TO THE BEDTOOLS INSTALLATION:
my $path_to_bedtools = $ARGV[1];
# FIND THE OUTPUT GFF FILE:
my $output_gff = $ARGV[2];
# FIND THE DIRECTORY TO USE FOR OUTPUT FILES:
my $outputdir = $ARGV[3];
# FIND THE MINIMUM SCORE FOR A GENE TO BE RETAINED:
my $min_gene_score = $ARGV[4];
# FIND OUT WHETHER THE GFF FILE IS FROM MAKER:
my $from_maker = $ARGV[5];
# FIND THE NUMBER TO SUBTRACT THE SCORE FROM:
my $subtract_from = $ARGV[6];
#------------------------------------------------------------------#
# TEST SUBROUTINES:
my $PRINT_TEST_DATA = 1; # SAYS WHETHER TO PRINT DATA USED DURING TESTING.
&test_read_maker_scores($outputdir);
&test_make_cds_gff($outputdir);
&test_print_error;
&test_read_genes_for_transcripts($outputdir);
&test_run_main_program($outputdir,$path_to_bedtools);
&test_write_output_gff($outputdir);
&test_find_overlapping_genes($outputdir,$path_to_bedtools);
&test_find_best_predictions;
print STDERR "Tests done, running main code...\n";
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
(my $errorcode,my $errormsg) = &run_main_program($outputdir,$input_gff,$output_gff,$path_to_bedtools,$min_gene_score,$from_maker,0,$subtract_from);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# TEST &run_main_program
sub test_run_main_program
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES INTO
my $path_to_bedtools = $_[1]; # PATH TO THE BEDTOOLS INSTALLATION
my $gff; # GFF FILE FOR THE SCAFFOLD
my $new_gff; # NEW GFF FILE FOR THIS SCAFFOLD
my $expected_new_gff; # FILE WITH EXPECTED CONTENTS OF $new_gff
my $differences; # DIFFERENES BETWEEN $new_gff AND $expected_new_gff
my $length_differences; # LENGTH OF $differences
my $line; #
my @temp; #
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
# IN THIS EXAMPLE, SHOULD JUST HAVE ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5 AND ACAC.scaffold.00050.383205-TF101001-genewise-prediction-5 AT THE END:
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_run_main_program: cannot open gff $gff\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tintron\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tCDS\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tCDS\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tCDS\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tCDS\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tCDS\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-5\n";
close(GFF);
($new_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
@temp = split(/\//,$new_gff);
$new_gff = $temp[$#temp];
($errorcode,$errormsg) = &run_main_program($outputdir,$gff,$new_gff,$path_to_bedtools,10,'no',1,'none');
if ($errorcode != 0) { print STDERR "ERROR: test_run_main_program: failed test1\n"; exit;}
($expected_new_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_new_gff") || die "ERROR: test_run_main_program: cannot open expected_new_gff $expected_new_gff\n";
print EXPECTED "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print EXPECTED "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print EXPECTED "ACAC.scaffold.00050.383204\tGeneWise\tintron\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print EXPECTED "ACAC.scaffold.00050.383205\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-5\n";
print EXPECTED "ACAC.scaffold.00050.383205\tGeneWise\tCDS\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-5\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $new_gff $expected_new_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_run_main_program: failed test1 (new_gff $new_gff expected_new_gff $expected_new_gff)\n"; exit;}
system "rm -f $new_gff";
system "rm -f $expected_new_gff";
system "rm -f $gff";
# MAKER TEST CASE:
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_run_main_program: cannot open gff $gff\n";
print GFF "Contig_000100 maker gene 6027 7030 . + . ID=DME_0000799101;Name=DME_0000799101\n";
print GFF "Contig_000100 maker mRNA 6027 7030 . + . ID=DME_0000799101-mRNA-1;Parent=DME_0000799101;Name=DME_0000799101-mRNA-1;_AED=0.15;_eAED=0.15;_QI=0|0|0|1|1|1|3|0|188\n";
print GFF "Contig_000100 maker exon 6027 6166 . + . ID=DME_0000799101-mRNA-1:exon:1;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker exon 6236 6410 . + . ID=DME_0000799101-mRNA-1:exon:2;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker exon 6779 7030 . + . ID=DME_0000799101-mRNA-1:exon:3;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6027 6166 . + 0 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6236 6410 . + 1 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6779 7030 . + 0 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker gene 6027 7030 . + . ID=DME_0000799102;Name=DME_0000799102\n";
print GFF "Contig_000100 maker mRNA 6027 7030 . + . ID=DME_0000799102-mRNA-1;Parent=DME_0000799102;Name=DME_0000799102-mRNA-1;_AED=0.12;_eAED=0.12;_QI=0|0|0|1|1|1|3|0|188\n";
print GFF "Contig_000100 maker exon 6027 6166 . + . ID=DME_0000799102-mRNA-1:exon:1;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker exon 6236 6410 . + . ID=DME_0000799102-mRNA-1:exon:2;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker exon 6779 7030 . + . ID=DME_0000799102-mRNA-1:exon:3;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6027 6166 . + 0 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6236 6410 . + 1 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6779 7030 . + 0 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "##FASTA\n";
print GFF ">seq1\n";
print GFF "ACAGAG\n";
close(GFF);
($new_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
@temp = split(/\//,$new_gff);
$new_gff = $temp[$#temp];
($errorcode,$errormsg) = &run_main_program($outputdir,$gff,$new_gff,$path_to_bedtools,0,'yes',1,'none');
if ($errorcode != 0) { print STDERR "ERROR: test_run_main_program: failed test2\n"; exit;}
($expected_new_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_new_gff") || die "ERROR: test_run_main_program: cannot open expected_new_gff $expected_new_gff\n";
print EXPECTED "Contig_000100 maker gene 6027 7030 . + . ID=DME_0000799101;Name=DME_0000799101\n";
print EXPECTED "Contig_000100 maker mRNA 6027 7030 . + . ID=DME_0000799101-mRNA-1;Parent=DME_0000799101;Name=DME_0000799101-mRNA-1;_AED=0.15;_eAED=0.15;_QI=0|0|0|1|1|1|3|0|188\n";
print EXPECTED "Contig_000100 maker exon 6027 6166 . + . ID=DME_0000799101-mRNA-1:exon:1;Parent=DME_0000799101-mRNA-1\n";
print EXPECTED "Contig_000100 maker exon 6236 6410 . + . ID=DME_0000799101-mRNA-1:exon:2;Parent=DME_0000799101-mRNA-1\n";
print EXPECTED "Contig_000100 maker exon 6779 7030 . + . ID=DME_0000799101-mRNA-1:exon:3;Parent=DME_0000799101-mRNA-1\n";
print EXPECTED "Contig_000100 maker CDS 6027 6166 . + 0 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print EXPECTED "Contig_000100 maker CDS 6236 6410 . + 1 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print EXPECTED "Contig_000100 maker CDS 6779 7030 . + 0 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print EXPECTED "##FASTA\n";
print EXPECTED ">seq1\n";
print EXPECTED "ACAGAG\n";
close(EXPECTED);
$differences = "";
$new_gff = $outputdir."/".$new_gff;
open(TEMP,"diff $new_gff $expected_new_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_run_main_program: failed test2 (new_gff $new_gff expected_new_gff $expected_new_gff)\n"; exit;}
system "rm -f $new_gff";
system "rm -f $expected_new_gff";
system "rm -f $gff";
# TEST ERRORCODE=3 (from_maker IS NOT yes/no):
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_run_main_program: cannot open gff $gff\n";
print GFF "SSTP.scaffold.00055.227916 maker gene 7006 10673 . - . ID=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 maker mRNA 7006 10673 . - . ID=1:SRAE_2028500;Parent=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|0|0|1|1|1|2|0|1206;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match 7006 10673 1000 - . ID=SSTP.scaffold.00055.227916:hit:3043:3.5.0.0;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52-mRNA-1;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match_part 10537 10673 1000 - . ID=SSTP.scaffold.00055.227916:hsp:5479:3.5.0.0;Parent=SSTP.scaffold.00055.227916:hit:3043:3.5.0.0;Target=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52-mRNA-1 3485 3621 +;Gap=M137\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match_part 7006 10489 1000 - . ID=SSTP.scaffold.00055.227916:hsp:5480:3.5.0.0;Parent=SSTP.scaffold.00055.227916:hit:3043:3.5.0.0;Target=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52-mRNA-1 1 3484 +;Gap=M3484\n";
print GFF "SSTP.scaffold.00055.227916 maker gene 6942 8068 . + . ID=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 maker mRNA 6942 8068 . + . ID=1:SRAE_2028600;Parent=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69-mRNA-1;_AED=1.00;_eAED=1.00;_QI=106|0|0|0|1|1|2|884|43;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match 6942 8068 1000 + . ID=SSTP.scaffold.00055.227916:hit:3060:3.5.0.0;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69-mRNA-1;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match_part 6942 6956 1000 + . ID=SSTP.scaffold.00055.227916:hsp:5520:3.5.0.0;Parent=SSTP.scaffold.00055.227916:hit:3060:3.5.0.0;Target=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69-mRNA-1 1 15 +;Gap=M15\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match_part 6962 8068 1000 + . ID=SSTP.scaffold.00055.227916:hsp:5521:3.5.0.0;Parent=SSTP.scaffold.00055.227916:hit:3060:3.5.0.0;Target=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69-mRNA-1 16 1122 +;Gap=M1107\n";
close(GFF);
($new_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
@temp = split(/\//,$new_gff);
$new_gff = $temp[$#temp];
($errorcode,$errormsg) = &run_main_program($outputdir,$gff,$new_gff,$path_to_bedtools,10,'hello',1,'none');
if ($errorcode != 3) { print STDERR "ERROR: test_run_main_program: failed test3\n"; exit;}
system "rm -f $new_gff";
system "rm -f $gff";
}
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
# TO FIND THE HIGHEST-SCORING NON-OVERLAPPING GENES, THIS DOES THE FOLLOWING:
# (i) SORTS THE GENES IN ORDER OF DESCENDING SCORES
# (ii) REMOVE ALL GENES FROM THE GFF THAT OVERLAP THE HIGHEST-SCORING GENE -> GIVES GFF1
# (iii) REMOVE ALL THE GENES FROM THE GFF1 THAT OVERLAP THE SECOND HIGHEST-SCORING GENE -> GIVES GFF2
# (iv) REMOVE ALL THE GENES FROM THE GFF2 THAT OVERLAP THE THIRD HIGHEST-SCORING GENE -> GIVES GFF3
# (v) etc.
sub run_main_program
{
my $outputdir = $_[0]; ## DIRECTORY TO PUT OUTPUT FILES IN.
my $input_gff = $_[1]; ## THE INPUT GFF FILE
my $output_gff = $_[2]; ## THE OUTPUT GFF FILE
my $path_to_bedtools = $_[3]; ## PATH TO THE BEDTOOLS INSTALLATION
my $min_gene_score = $_[4]; ## MINIMUM SCORE FOR A GENE TO BE RETAINED
my $from_maker = $_[5]; ## SAYS WHETHER THE GFF FILE IS FROM MAKER (yes/no)
my $testing = $_[6]; ## SAYS WHETHER THIS IS BEING CALLED FROM A TESTING SUBROUTINE
my $subtract_from = $_[7]; ## NUMBER TO SUBTRACT THE SCORE FROM (eg. 1 FOR MAKER)
my $errorcode = 0; ## RETURNED AS 0 IF THERE IS NO ERROR.
my $errormsg = "none";## RETURNED AS 'none' IF THERE IS NO ERROR.
my $scaffolds_file; ## A FILE WITH THE NAMES OF ALL SCAFFOLDS IN $new_gff
my $scaffold; ## NAME OF A SCAFFOLD
my $scaffold_gff; ## GFF FILE FOR SCAFFOLD $scaffold
my $scaffold_gff2; ## TEMPORARY GFF FILE FOR SCAFFOLD $scaffold
my $new_gff; ## GFF FILE OF GENES
my %SCORE = (); ## HASH TABLE WITH MAKER SCORES FOR GENES
my $GENE; ## HASH TABLE OF GENE NAMES FOR TRANSCRIPT NAMES
my $OVERLAP; ## HASH TABLE WITH OVERLAPPING GENES
my $DISCARD; ## HASH TABLE OF GENES TO DISCARD
my $DELETE_TRANSCRIPT; ## HASH TABLE OF TRANSCRIPTS TO DISCARD
# CHECK THAT $from_maker IS yes/no:
if ($from_maker ne 'yes' && $from_maker ne 'no')
{
$errormsg = "ERROR: run_main_program: from_maker $from_maker: should be yes/no\n";
$errorcode = 3; # ERRORCODE=3 (TESTED FOR)
return($errorcode,$errormsg);
}
# READ IN THE GENE NAME FOR EACH TRANSCRIPT IN THE INPUT GFF FILE:
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# MAKE A GFF FILE WITH JUST THE 'cds' FEATURES (NOT 'gene' OR 'intron' FEATURES):
($new_gff,$errorcode,$errormsg) = &make_cds_gff($input_gff,$outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# FIND THE CODING OVERLAPS BETWEEN ALL THE GENES, BY LOOKING FOR OVERLAPS IN 'cds' FEATURES:
($OVERLAP,$errorcode,$errormsg) = &find_overlapping_genes($new_gff,$path_to_bedtools,$outputdir,$GENE);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# READ IN THE GENE SCORES FOR THE GENES:
if ($from_maker eq 'yes') # IT IS A MAKER GFF FILE
{
($DELETE_TRANSCRIPT,$errorcode,$errormsg) = &read_maker_scores($input_gff,\%SCORE,$subtract_from,0);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
}
else # IT IS NOT A MAKER GFF FILE
{
($errorcode,$errormsg) = &read_scores($input_gff,\%SCORE,$subtract_from);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
}
# FILTER ON $min_gene_score
# FIND THE HIGHEST-SCORING NON-OVERLAPPING GENE PREDICTIONS:
($DISCARD,$errorcode,$errormsg) = &find_best_predictions($testing,$OVERLAP,\%SCORE,$min_gene_score);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# WRITE THE OUTPUT FILE:
$output_gff = $outputdir."/".$output_gff;
($errorcode,$errormsg) = &write_output_gff($input_gff,$GENE,$output_gff,$DISCARD,$from_maker,$DELETE_TRANSCRIPT);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
return($errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &write_output_gff
sub test_write_output_gff
{
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES IN
my $gff; # INPUT GFF FILE
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUCNCTION IF THERE IS NO ERROR
my %GENE = (); # HASH TABLE OF GENE NAMES FOR TRANSCRIPTS
my $output_gff; # OUTPUT GFF FILE
my %DISCARD = (); # HASH TABLE OF GENES TO DISCARD
my $expected_output_gff; # EXPECTED OUTPUT GFF FILE
my $differences; # DIFFERENCES BETWEEN $output_gff AND $expected_output_gff
my $length_differences; # LENGTH OF $differences
my $line; #
my %DELETE_TRANSCRIPT = (); # HASH TABLE OF TRANSCRIPTS TO DELETE
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_write_output_gff: cannot open gff $gff\n";
print GFF "Contig_000100 maker gene 6027 7030 . + . ID=DME_0000799101;Name=DME_0000799101\n";
print GFF "Contig_000100 maker mRNA 6027 7030 . + . ID=DME_0000799101-mRNA-1;Parent=DME_0000799101;Name=DME_0000799101-mRNA-1;_AED=0.15;_eAED=0.15;_QI=0|0|0|1|1|1|3|0|188\n";
print GFF "Contig_000100 maker exon 6027 6166 . + . ID=DME_0000799101-mRNA-1:exon:1;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker exon 6236 6410 . + . ID=DME_0000799101-mRNA-1:exon:2;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker exon 6779 7030 . + . ID=DME_0000799101-mRNA-1:exon:3;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6027 6166 . + 0 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6236 6410 . + 1 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6779 7030 . + 0 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker gene 6027 7030 . + . ID=DME_0000799102;Name=DME_0000799102\n";
print GFF "Contig_000100 maker mRNA 6027 7030 . + . ID=DME_0000799102-mRNA-1;Parent=DME_0000799102;Name=DME_0000799102-mRNA-1;_AED=0.12;_eAED=0.12;_QI=0|0|0|1|1|1|3|0|188\n";
print GFF "Contig_000100 maker exon 6027 6166 . + . ID=DME_0000799102-mRNA-1:exon:1;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker exon 6236 6410 . + . ID=DME_0000799102-mRNA-1:exon:2;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker exon 6779 7030 . + . ID=DME_0000799102-mRNA-1:exon:3;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6027 6166 . + 0 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6236 6410 . + 1 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6779 7030 . + 0 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "##FASTA\n";
print GFF ">seq1\n";
print GFF "ACAGAG\n";
close(GFF);
$GENE{"DME_0000799101-mRNA-1"} = "DME_0000799101";
$GENE{"DME_0000799102-mRNA-1"} = "DME_0000799102";
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
$DISCARD{"DME_0000799102"} = 1;
($errorcode,$errormsg) = &write_output_gff($gff,\%GENE,$output_gff,\%DISCARD,"yes",\%DELETE_TRANSCRIPT);
if ($errorcode != 0) { print STDERR "ERROR: test_write_output_gff: failed test1\n"; exit;}
($expected_output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_output_gff") || die "ERROR: test_write_output_gff: cannot open expected_output_gff $expected_output_gff\n";
print EXPECTED "Contig_000100 maker gene 6027 7030 . + . ID=DME_0000799101;Name=DME_0000799101\n";
print EXPECTED "Contig_000100 maker mRNA 6027 7030 . + . ID=DME_0000799101-mRNA-1;Parent=DME_0000799101;Name=DME_0000799101-mRNA-1;_AED=0.15;_eAED=0.15;_QI=0|0|0|1|1|1|3|0|188\n";
print EXPECTED "Contig_000100 maker exon 6027 6166 . + . ID=DME_0000799101-mRNA-1:exon:1;Parent=DME_0000799101-mRNA-1\n";
print EXPECTED "Contig_000100 maker exon 6236 6410 . + . ID=DME_0000799101-mRNA-1:exon:2;Parent=DME_0000799101-mRNA-1\n";
print EXPECTED "Contig_000100 maker exon 6779 7030 . + . ID=DME_0000799101-mRNA-1:exon:3;Parent=DME_0000799101-mRNA-1\n";
print EXPECTED "Contig_000100 maker CDS 6027 6166 . + 0 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print EXPECTED "Contig_000100 maker CDS 6236 6410 . + 1 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print EXPECTED "Contig_000100 maker CDS 6779 7030 . + 0 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print EXPECTED "##FASTA\n";
print EXPECTED ">seq1\n";
print EXPECTED "ACAGAG\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $output_gff $expected_output_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_write_output_gff: failed test1 (output_gff $output_gff expected_output_gff $expected_output_gff)\n"; exit;}
system "rm -f $gff";
system "rm -f $expected_output_gff";
system "rm -f $output_gff";
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_write_output_gff: cannot open gff $gff\n";
print GFF "Contig_000100 maker gene 6027 7030 . + . ID=DME_0000799101;Name=DME_0000799101\n";
print GFF "Contig_000100 maker mRNA 6027 7030 . + . ID=DME_0000799101-mRNA-1;Parent=DME_0000799101;Name=DME_0000799101-mRNA-1;_AED=0.15;_eAED=0.15;_QI=0|0|0|1|1|1|3|0|188\n";
print GFF "Contig_000100 maker exon 6027 6166 . + . ID=DME_0000799101-mRNA-1:exon:1;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker exon 6236 6410 . + . ID=DME_0000799101-mRNA-1:exon:2;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker exon 6779 7030 . + . ID=DME_0000799101-mRNA-1:exon:3;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6027 6166 . + 0 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6236 6410 . + 1 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6779 7030 . + 0 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker gene 6027 7030 . + . ID=DME_0000799102;Name=DME_0000799102\n";
print GFF "Contig_000100 maker mRNA 6027 7030 . + . ID=DME_0000799102-mRNA-1;Parent=DME_0000799102;Name=DME_0000799102-mRNA-1;_AED=0.12;_eAED=0.12;_QI=0|0|0|1|1|1|3|0|188\n";
print GFF "Contig_000100 maker exon 6027 6166 . + . ID=DME_0000799102-mRNA-1:exon:1;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker exon 6236 6410 . + . ID=DME_0000799102-mRNA-1:exon:2;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker exon 6779 7030 . + . ID=DME_0000799102-mRNA-1:exon:3;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6027 6166 . + 0 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6236 6410 . + 1 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6779 7030 . + 0 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "##FASTA\n";
print GFF ">seq1\n";
print GFF "ACAGAG\n";
close(GFF);
$GENE{"DME_0000799101-mRNA-1"} = "DME_0000799101";
$GENE{"DME_0000799102-mRNA-1"} = "DME_0000799102";
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
%DISCARD = ();
$DISCARD{"DME_0000799101"} = 1;
($errorcode,$errormsg) = &write_output_gff($gff,\%GENE,$output_gff,\%DISCARD,"yes",\%DELETE_TRANSCRIPT);
if ($errorcode != 0) { print STDERR "ERROR: test_write_output_gff: failed test2\n"; exit;}
($expected_output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_output_gff") || die "ERROR: test_write_output_gff: cannot open expected_output_gff $expected_output_gff\n";
print EXPECTED "Contig_000100 maker gene 6027 7030 . + . ID=DME_0000799102;Name=DME_0000799102\n";
print EXPECTED "Contig_000100 maker mRNA 6027 7030 . + . ID=DME_0000799102-mRNA-1;Parent=DME_0000799102;Name=DME_0000799102-mRNA-1;_AED=0.12;_eAED=0.12;_QI=0|0|0|1|1|1|3|0|188\n";
print EXPECTED "Contig_000100 maker exon 6027 6166 . + . ID=DME_0000799102-mRNA-1:exon:1;Parent=DME_0000799102-mRNA-1\n";
print EXPECTED "Contig_000100 maker exon 6236 6410 . + . ID=DME_0000799102-mRNA-1:exon:2;Parent=DME_0000799102-mRNA-1\n";
print EXPECTED "Contig_000100 maker exon 6779 7030 . + . ID=DME_0000799102-mRNA-1:exon:3;Parent=DME_0000799102-mRNA-1\n";
print EXPECTED "Contig_000100 maker CDS 6027 6166 . + 0 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print EXPECTED "Contig_000100 maker CDS 6236 6410 . + 1 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print EXPECTED "Contig_000100 maker CDS 6779 7030 . + 0 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print EXPECTED "##FASTA\n";
print EXPECTED ">seq1\n";
print EXPECTED "ACAGAG\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $output_gff $expected_output_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_write_output_gff: failed test2 (output_gff $output_gff expected_output_gff $expected_output_gff)\n"; exit;}
system "rm -f $gff";
system "rm -f $expected_output_gff";
system "rm -f $output_gff";
# TEST ERRORCODE=104 (DON'T KNOW GENE NAME FOR TRANSCRIPT):
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_write_output_gff: cannot open gff $gff\n";
print GFF "Contig_000100 maker gene 6027 7030 . + . ID=DME_0000799101;Name=DME_0000799101\n";
print GFF "Contig_000100 maker mRNA 6027 7030 . + . ID=DME_0000799101-mRNA-1;Parent=DME_0000799101;Name=DME_0000799101-mRNA-1;_AED=0.15;_eAED=0.15;_QI=0|0|0|1|1|1|3|0|188\n";
print GFF "Contig_000100 maker exon 6027 6166 . + . ID=DME_0000799101-mRNA-1:exon:1;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker exon 6236 6410 . + . ID=DME_0000799101-mRNA-1:exon:2;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker exon 6779 7030 . + . ID=DME_0000799101-mRNA-1:exon:3;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6027 6166 . + 0 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6236 6410 . + 1 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6779 7030 . + 0 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker gene 6027 7030 . + . ID=DME_0000799102;Name=DME_0000799102\n";
print GFF "Contig_000100 maker mRNA 6027 7030 . + . ID=DME_0000799102-mRNA-1;Parent=DME_0000799102;Name=DME_0000799102-mRNA-1;_AED=0.12;_eAED=0.12;_QI=0|0|0|1|1|1|3|0|188\n";
print GFF "Contig_000100 maker exon 6027 6166 . + . ID=DME_0000799102-mRNA-1:exon:1;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker exon 6236 6410 . + . ID=DME_0000799102-mRNA-1:exon:2;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker exon 6779 7030 . + . ID=DME_0000799102-mRNA-1:exon:3;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6027 6166 . + 0 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6236 6410 . + 1 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6779 7030 . + 0 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "##FASTA\n";
print GFF ">seq1\n";
print GFF "ACAGAG\n";
close(GFF);
%GENE = ();
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
%DISCARD = ();
$DISCARD{"DME_0000799101"} = 1;
($errorcode,$errormsg) = &write_output_gff($gff,\%GENE,$output_gff,\%DISCARD,"yes",\%DELETE_TRANSCRIPT);
if ($errorcode != 104) { print STDERR "ERROR: test_write_output_gff: failed test3\n"; exit;}
system "rm -f $gff";
system "rm -f $expected_output_gff";
system "rm -f $output_gff";
# TEST ERRORCODE=105 (UNUSUAL FEATURE IN GFF FILE):
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_write_output_gff: cannot open gff $gff\n";
print GFF "Contig_000100 maker gene 6027 7030 . + . ID=DME_0000799101;Name=DME_0000799101\n";
print GFF "Contig_000100 maker mRNA 6027 7030 . + . ID=DME_0000799101-mRNA-1;Parent=DME_0000799101;Name=DME_0000799101-mRNA-1;_AED=0.15;_eAED=0.15;_QI=0|0|0|1|1|1|3|0|188\n";
print GFF "Contig_000100 maker exon 6027 6166 . + . ID=DME_0000799101-mRNA-1:exon:1;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker exon 6236 6410 . + . ID=DME_0000799101-mRNA-1:exon:2;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker exon 6779 7030 . + . ID=DME_0000799101-mRNA-1:exon:3;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6027 6166 . + 0 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6236 6410 . + 1 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6779 7030 . + 0 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker hello 6027 7030 . + . ID=DME_0000799102;Name=DME_0000799102\n";
print GFF "Contig_000100 maker mRNA 6027 7030 . + . ID=DME_0000799102-mRNA-1;Parent=DME_0000799102;Name=DME_0000799102-mRNA-1;_AED=0.12;_eAED=0.12;_QI=0|0|0|1|1|1|3|0|188\n";
print GFF "Contig_000100 maker exon 6027 6166 . + . ID=DME_0000799102-mRNA-1:exon:1;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker exon 6236 6410 . + . ID=DME_0000799102-mRNA-1:exon:2;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker exon 6779 7030 . + . ID=DME_0000799102-mRNA-1:exon:3;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6027 6166 . + 0 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6236 6410 . + 1 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6779 7030 . + 0 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "##FASTA\n";
print GFF ">seq1\n";
print GFF "ACAGAG\n";
close(GFF);
$GENE{"DME_0000799101-mRNA-1"} = "DME_0000799101";
$GENE{"DME_0000799102-mRNA-1"} = "DME_0000799102";
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
%DISCARD = ();
$DISCARD{"DME_0000799101"} = 1;
($errorcode,$errormsg) = &write_output_gff($gff,\%GENE,$output_gff,\%DISCARD,"yes",\%DELETE_TRANSCRIPT);
if ($errorcode != 105) { print STDERR "ERROR: test_write_output_gff: failed test4\n"; exit;}
system "rm -f $gff";
system "rm -f $expected_output_gff";
system "rm -f $output_gff";
# TEST ERRORCODE=106 (WRONG NUMBER OF COLUMNS IN GFF FILE):
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_write_output_gff: cannot open gff $gff\n";
print GFF "Contig_000100 maker gene 6027 7030 . + . ID=DME_0000799101;Name=DME_0000799101\n";
print GFF "Contig_000100 maker mRNA 6027 7030 . + . ID=DME_0000799101-mRNA-1;Parent=DME_0000799101;Name=DME_0000799101-mRNA-1;_AED=0.15;_eAED=0.15;_QI=0|0|0|1|1|1|3|0|188\n";
print GFF "Contig_000100 maker exon 6027 6166 . + . ID=DME_0000799101-mRNA-1:exon:1;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker exon 6236 6410 . + . ID=DME_0000799101-mRNA-1:exon:2;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker exon 6779 7030 . + . ID=DME_0000799101-mRNA-1:exon:3;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6027 6166 . + 0 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6236 6410 . + 1 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6779 7030 . + 0 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker gene 6027 7030 . + . ID=DME_0000799102;Name=DME_0000799102\n";
print GFF "Contig_000100 maker mRNA 7030 . + . ID=DME_0000799102-mRNA-1;Parent=DME_0000799102;Name=DME_0000799102-mRNA-1;_AED=0.12;_eAED=0.12;_QI=0|0|0|1|1|1|3|0|188\n";
print GFF "Contig_000100 maker exon 6027 6166 . + . ID=DME_0000799102-mRNA-1:exon:1;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker exon 6236 6410 . + . ID=DME_0000799102-mRNA-1:exon:2;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker exon 6779 7030 . + . ID=DME_0000799102-mRNA-1:exon:3;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6027 6166 . + 0 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6236 6410 . + 1 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6779 7030 . + 0 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "##FASTA\n";
print GFF ">seq1\n";
print GFF "ACAGAG\n";
close(GFF);
$GENE{"DME_0000799101-mRNA-1"} = "DME_0000799101";
$GENE{"DME_0000799102-mRNA-1"} = "DME_0000799102";
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
%DISCARD = ();
$DISCARD{"DME_0000799101"} = 1;
($errorcode,$errormsg) = &write_output_gff($gff,\%GENE,$output_gff,\%DISCARD,"yes",\%DELETE_TRANSCRIPT);
if ($errorcode != 106) { print STDERR "ERROR: test_write_output_gff: failed test5 (errorcode=$errorcode errormsg=$errormsg)\n"; exit;}
system "rm -f $gff";
system "rm -f $expected_output_gff";
system "rm -f $output_gff";
# EXAMPLE WITH TWO TRANSCRIPTS IN A GENE, ONE WITH HIGHER SCORE:
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_write_output_gff: cannot open gff $gff\n";
open(GFF,">$gff") || die "ERROR: test_read_maker_scores: cannot open gff $gff\n";
print GFF "Sratt_Chr00_000002 maker gene 371193 372952 . + . ID=SRAE_0000033601;Name=SRAE_0000033601\n";
print GFF "Sratt_Chr00_000002 maker mRNA 371193 372952 . + . ID=SRAE_0000033601-mRNA-1;Parent=SRAE_0000033601;Name=SRAE_0000033601-mRNA-1;_AED=0.02;_eAED=0.02;_QI=550|1|1|1|0|0|3|60|350;score=3922\n";
print GFF "Sratt_Chr00_000002 maker mRNA 371415 372952 . + . ID=SRAE_0000033601-mRNA-2;Parent=SRAE_0000033601;Name=SRAE_0000033601-mRNA-2;_AED=0.07;_eAED=0.07;_QI=378|1|1|1|0|0|2|60|350;score=7307\n";
print GFF "Sratt_Chr00_000002 maker exon 371193 371384 . + . ID=SRAE_0000033601-mRNA-1:exon:1;Parent=SRAE_0000033601-mRNA-1\n";
print GFF "Sratt_Chr00_000002 maker exon 371435 371821 . + . ID=SRAE_0000033601-mRNA-1:exon:2;Parent=SRAE_0000033601-mRNA-1\n";
print GFF "Sratt_Chr00_000002 maker exon 371415 371821 . + . ID=SRAE_0000033601-mRNA-2:exon:1;Parent=SRAE_0000033601-mRNA-2\n";
print GFF "Sratt_Chr00_000002 maker five_prime_UTR 371193 371384 . + . ID=SRAE_0000033601-mRNA-1:five_prime_utr;Parent=SRAE_0000033601-mRNA-1\n";
print GFF "Sratt_Chr00_000002 maker five_prime_UTR 371435 371792 . + . ID=SRAE_0000033601-mRNA-1:five_prime_utr;Parent=SRAE_0000033601-mRNA-1\n";
print GFF "Sratt_Chr00_000002 maker CDS 371793 371821 . + 0 ID=SRAE_0000033601-mRNA-1:cds;Parent=SRAE_0000033601-mRNA-1\n";
print GFF "Sratt_Chr00_000002 maker CDS 371869 372892 . + 1 ID=SRAE_0000033601-mRNA-1:cds;Parent=SRAE_0000033601-mRNA-1\n";
print GFF "Sratt_Chr00_000002 maker three_prime_UTR 372893 372952 . + . ID=SRAE_0000033601-mRNA-1:three_prime_utr;Parent=SRAE_0000033601-mRNA-1\n";
print GFF "Sratt_Chr00_000002 maker five_prime_UTR 371415 371792 . + . ID=SRAE_0000033601-mRNA-2:five_prime_utr;Parent=SRAE_0000033601-mRNA-2\n";
print GFF "Sratt_Chr00_000002 maker CDS 371793 371821 . + 0 ID=SRAE_0000033601-mRNA-2:cds;Parent=SRAE_0000033601-mRNA-2\n";
print GFF "Sratt_Chr00_000002 maker CDS 371869 372892 . + 1 ID=SRAE_0000033601-mRNA-2:cds;Parent=SRAE_0000033601-mRNA-2\n";
print GFF "Sratt_Chr00_000002 maker three_prime_UTR 372893 372952 . + . ID=SRAE_0000033601-mRNA-2:three_prime_utr;Parent=SRAE_0000033601-mRNA-2\n";
print GFF "##FASTA\n";
print GFF ">seq1\n";
print GFF "ACAGAG\n";
close(GFF);
$GENE{"SRAE_0000033601-mRNA-1"} = "SRAE_0000033601";
$GENE{"SRAE_0000033601-mRNA-2"} = "SRAE_0000033601";
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
$DELETE_TRANSCRIPT{"SRAE_0000033601-mRNA-1"} = 1;
($errorcode,$errormsg) = &write_output_gff($gff,\%GENE,$output_gff,\%DISCARD,"yes",\%DELETE_TRANSCRIPT);
if ($errorcode != 0) { print STDERR "ERROR: test_write_output_gff: failed test1\n"; exit;}
($expected_output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_output_gff") || die "ERROR: test_write_output_gff: cannot open expected_output_gff $expected_output_gff\n";
print EXPECTED "Sratt_Chr00_000002 maker gene 371193 372952 . + . ID=SRAE_0000033601;Name=SRAE_0000033601\n";
print EXPECTED "Sratt_Chr00_000002 maker mRNA 371415 372952 . + . ID=SRAE_0000033601-mRNA-2;Parent=SRAE_0000033601;Name=SRAE_0000033601-mRNA-2;_AED=0.07;_eAED=0.07;_QI=378|1|1|1|0|0|2|60|350;score=7307\n";
print EXPECTED "Sratt_Chr00_000002 maker exon 371415 371821 . + . ID=SRAE_0000033601-mRNA-2:exon:1;Parent=SRAE_0000033601-mRNA-2\n";
print EXPECTED "Sratt_Chr00_000002 maker five_prime_UTR 371415 371792 . + . ID=SRAE_0000033601-mRNA-2:five_prime_utr;Parent=SRAE_0000033601-mRNA-2\n";
print EXPECTED "Sratt_Chr00_000002 maker CDS 371793 371821 . + 0 ID=SRAE_0000033601-mRNA-2:cds;Parent=SRAE_0000033601-mRNA-2\n";
print EXPECTED "Sratt_Chr00_000002 maker CDS 371869 372892 . + 1 ID=SRAE_0000033601-mRNA-2:cds;Parent=SRAE_0000033601-mRNA-2\n";
print EXPECTED "Sratt_Chr00_000002 maker three_prime_UTR 372893 372952 . + . ID=SRAE_0000033601-mRNA-2:three_prime_utr;Parent=SRAE_0000033601-mRNA-2\n";
print EXPECTED "##FASTA\n";
print EXPECTED ">seq1\n";
print EXPECTED "ACAGAG\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $output_gff $expected_output_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_write_output_gff: failed test6 (output_gff $output_gff expected_output_gff $expected_output_gff)\n"; exit;}
system "rm -f $gff";
system "rm -f $expected_output_gff";
system "rm -f $output_gff";
}
#------------------------------------------------------------------#
# WRITE THE OUTPUT FILE:
sub write_output_gff
{
my $input_gff = $_[0]; # INPUT GFF FILE
my $GENE = $_[1]; # HASH TABLE OF GENE NAMES FOR TRANSCRIPTS
my $output_gff = $_[2]; # OUTPUT GFF FILE
my $DISCARD = $_[3]; # HASH TABLE OF GENES TO DISCARD
my $from_maker = $_[4]; # SAYS WHETHER THE INPUT GFF IS FROM MAKER
my $DELETE_TRANSCRIPT = $_[5]; # HASH TABLE OF TRANSCRIPTS TO DELETE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $line; #
my @temp; #
my $end_of_gff = 0; # SAYS WHETHER WE HAVE REACHED THE END OF THE INPUT GFF
my $gene; # GENE NAME
my $feature; # FEATURE TYPE
my $transcript; # TRANSCRIPT NAME
my $name; # FEATURE NAME
# OPEN THE OUTPUT GFF:
open(OUTPUT,">$output_gff") || die "ERROR: write_output_gff: cannot open output_gff $output_gff\n";
# OPEN THE INPUT GFF:
open(INPUT_GFF,"$input_gff") || die "ERROR: write_output_gff: cannot open $input_gff\n";
while(<INPUT_GFF>)
{
$line = $_;
chomp $line;
if ($line eq '##FASTA') { $end_of_gff = 1;}
if ($end_of_gff == 0)
{
@temp = split(/\t+/,$line);
if ($#temp == 8)
{
$name = $temp[8];
$feature = $temp[2];
if ($feature eq 'mRNA' || $feature eq 'exon' || $feature eq 'three_prime_UTR' || $feature eq 'five_prime_UTR' || $feature eq 'CDS' || $feature eq 'gene' || $feature eq 'intron')
{
@temp = split(/ID=/,$name);
$name = $temp[1];
if ($feature eq 'mRNA' && $from_maker eq 'yes')
{
@temp = split(/Parent=/,$name);
$gene = $temp[1]; # eg. maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|0|0|1|1|1|2|0|1206;score=1000
@temp = split(/\;/,$gene);
$gene = $temp[0]; # eg. maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52
# GET THE TRANSCRIPT NAME:
@temp = split(/\;/,$name);
$transcript = $temp[0];
}
elsif (($feature eq 'exon' || $feature eq 'three_prime_UTR' || $feature eq 'five_prime_UTR' || $feature eq 'CDS') && $from_maker eq 'yes')
{
@temp = split(/Parent=/,$name);
$transcript = $temp[1]; # eg. augustus_masked-SSTP.scaffold.00055.227916-processed-gene-0.264-mRNA-1
@temp = split(/\;/,$transcript);
$transcript = $temp[0]; # eg. augustus_masked-SSTP.scaffold.00055.227916-processed-gene-0.264
# FIND THE GENE NAME FOR THIS TRANSCRIPT:
if (!($GENE->{$transcript}))
{
$errormsg = "ERROR: write_output_gff: do not know gene name for transcript $transcript (line $line)\n";
$errorcode = 104; # ERRORCODE=104 (TESTED FOR)
return($errorcode,$errormsg);
}
$gene = $GENE->{$transcript};
}
elsif ($feature eq 'gene')
{
@temp = split(/\;/,$name);
$gene = $temp[0];
$transcript = $gene;
}
if (!($DISCARD->{$gene}) && !($DELETE_TRANSCRIPT->{$transcript}))
{
print OUTPUT "$line\n";
}
}
else
{
if (!($feature eq 'protein_match' || $feature eq 'expressed_sequence_match' || $feature eq 'contig' || $feature eq 'match' || $feature eq 'match_part') && $from_maker eq 'yes')
{
$errormsg = "ERROR: write_output_gff: feature $feature line $line\n";
$errorcode = 105; # ERRORCODE=105 (TESTED FOR)
return($errorcode,$errormsg);
}
# protein_match AND expressed_sequence_match FEATURES ARE NOT PART OF MAKER GENE PREDICTIONS
if ($from_maker eq 'yes' && ($feature eq 'protein_match' || $feature eq 'expressed_sequence_match' || $feature eq 'contig' || $feature eq 'match_part' || $feature eq 'match'))
{
print OUTPUT "$line\n";
}
}
}
else
{
if (substr($line,0,1) ne '#')
{
$errormsg = "ERROR: write_output_gff: line $line\n";
$errorcode = 106; # ERRORCODE=106 (TESTED FOR)
return($errorcode,$errormsg);
}
print OUTPUT "$line\n";
}
}
else
{
print OUTPUT "$line\n";
}
}
close(INPUT_GFF);
close(OUTPUT);
return($errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &find_overlapping_genes
sub test_find_overlapping_genes
{
my $output_dir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES IN
my $path_to_bedtools = $_[1]; # PATH TO BEDTOOLS
my $OVERLAP; # HASH TABLE OF OVERLAPS FOR GENES
my $gff; # INPUT GFF FILE
my %GENE = (); # HASH TABLE OF GENE NAMES FOR TRANSCRIPTS
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_find_overlapping_genes: cannot open gff $gff\n";
print GFF "Contig_000100 maker CDS 6027 6166 . + 0 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6236 6410 . + 1 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6779 7030 . + 0 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6027 6166 . + 0 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6236 6410 . + 1 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6779 7030 . + 0 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6025 6167 . + 0 ID=DME_0000799103-mRNA-1:cds;Parent=DME_0000799103-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6237 6411 . + 1 ID=DME_0000799103-mRNA-1:cds;Parent=DME_0000799103-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6783 7035 . + 0 ID=DME_0000799103-mRNA-1:cds;Parent=DME_0000799103-mRNA-1\n";
close(GFF);
$GENE{"DME_0000799101-mRNA-1"} = "DME_0000799101";
$GENE{"DME_0000799102-mRNA-1"} = "DME_0000799102";
$GENE{"DME_0000799103-mRNA-1"} = "DME_0000799103";
($OVERLAP,$errorcode,$errormsg) = &find_overlapping_genes($gff,$path_to_bedtools,$outputdir,\%GENE);
if ($errorcode != 0) { print STDERR "ERROR: test_find_overlapping_genes: failed test1 (errorcode=$errorcode errormsg=$errormsg)\n"; exit;}
if ($OVERLAP->{"DME_0000799101"} ne "DME_0000799102,DME_0000799103") { print STDERR "ERROR: test_find_overlapping_genes: failed test1b\n"; exit;}
if ($OVERLAP->{"DME_0000799102"} ne "DME_0000799101,DME_0000799103") { print STDERR "ERROR: test_find_overlapping_genes: failed test1c\n"; exit;}
if ($OVERLAP->{"DME_0000799103"} ne "DME_0000799101,DME_0000799102") { print STDERR "ERROR: test_find_overlapping_genes: failed test1d\n"; exit;}
# TEST CASE WHERE THERE ARE NO OVERLAPS:
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_find_overlapping_genes: cannot open gff $gff\n";
print GFF "Contig_000100 maker CDS 6027 6166 . + 0 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6236 6410 . + 1 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6779 7030 . + 0 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000101 maker CDS 6025 6167 . + 0 ID=DME_0000799103-mRNA-1:cds;Parent=DME_0000799103-mRNA-1\n";
print GFF "Contig_000101 maker CDS 6237 6411 . + 1 ID=DME_0000799103-mRNA-1:cds;Parent=DME_0000799103-mRNA-1\n";
print GFF "Contig_000101 maker CDS 6783 7035 . + 0 ID=DME_0000799103-mRNA-1:cds;Parent=DME_0000799103-mRNA-1\n";
close(GFF);
($OVERLAP,$errorcode,$errormsg) = &find_overlapping_genes($gff,$path_to_bedtools,$outputdir,\%GENE);
if ($errorcode != 0) { print STDERR "ERROR: test_find_overlapping_genes: failed test2\n"; exit;}
if (defined($OVERLAP->{"DME_0000799101"})) { print STDERR "ERROR: test_find_overlapping_genes: failed test2b\n"; exit;}
if (defined($OVERLAP->{"DME_0000799103"})) { print STDERR "ERROR: test_find_overlapping_genes: failed test2c\n"; exit;}
# TEST ERRORCODE=103 (DO NOT KNOW GENE NAME FOR TRANSCRIPT):
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_find_overlapping_genes: cannot open gff $gff\n";
print GFF "Contig_000100 maker CDS 6027 6166 . + 0 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6236 6410 . + 1 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6779 7030 . + 0 ID=DME_0000799101-mRNA-1:cds;Parent=DME_0000799101-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6027 6166 . + 0 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6236 6410 . + 1 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6779 7030 . + 0 ID=DME_0000799102-mRNA-1:cds;Parent=DME_0000799102-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6025 6167 . + 0 ID=DME_0000799103-mRNA-1:cds;Parent=DME_0000799103-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6237 6411 . + 1 ID=DME_0000799103-mRNA-1:cds;Parent=DME_0000799103-mRNA-1\n";
print GFF "Contig_000100 maker CDS 6783 7035 . + 0 ID=DME_0000799103-mRNA-1:cds;Parent=DME_0000799103-mRNA-1\n";
close(GFF);
%GENE = ();
$GENE{"DME_0000799102-mRNA-1"} = "DME_0000799102";
($OVERLAP,$errorcode,$errormsg) = &find_overlapping_genes($gff,$path_to_bedtools,$outputdir,\%GENE);
if ($errorcode != 103) { print STDERR "ERROR: test_find_overlapping_genes: failed test3\n"; exit;}
}
#------------------------------------------------------------------#
# FIND THE OVERLAPS BETWEEN ALL THE GENE FEATURES:
sub find_overlapping_genes
{
my $gff = $_[0]; # INPUT GENE GFF FILE WITH 'cds' FEATURES
my $path_to_bedtools = $_[1]; # PATH TO BEDTOOLS EXECUTABLE
my $outputdir = $_[2]; # DIRECTORY TO PUT OUTPUT FILES INTO
my $GENE = $_[3]; # HASH TABLE WITH GENE NAMES FOR TRANSCRIPTS
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my %OVERLAP = (); # HASH TABLE OF OVERLAPPING GENES
my $overlaps; # FILE WITH OVERLAPS BETWEEN GENES
my $line; #
my @temp; #
my $cmd; # COMMAND TO RUN
my $gene1; # FIRST GENE
my $gene2; # SECOND GENE
my %SEEN = (); # HASH TABLE TO RECORD WHICH GENE PAIRS WE HAVE SEEN ALREADY
my $pair1; # A PAIR OF GENES
my $pair2; # A PAIR OF GENES
# FIND ALL THE OVERLAPPING GENES:
($overlaps,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
$cmd = $path_to_bedtools." intersect -loj -a $gff -b $gff > $overlaps";
system "$cmd";
open(OVERLAPS,"$overlaps") || die "ERROR: find_overlapping_genes: cannot open overlaps $overlaps\n";
while(<OVERLAPS>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
$gene1 = $temp[8]; # eg. ID=DME_0000799301-mRNA-1:cds;Parent=DME_0000799301-mRNA-1
$gene2 = $temp[17];
@temp = split(/ID=/,$gene1);
$gene1 = $temp[1];
if ($gene1 =~ /Parent=/)
{
@temp = split(/Parent=/,$gene1);
$gene1 = $temp[1];
# FIND THE GENE NAME FOR THIS TRANSCRIPT:
if (!($GENE->{$gene1}))
{
$errormsg = "ERROR: find_overlapping_genes: do not know gene name for transcript $gene1 (line $line)\n";
$errorcode = 103; # ERRORCODE=103 (TESTED FOR)
return(\%OVERLAP,$errorcode,$errormsg);
}
$gene1 = $GENE->{$gene1};
}
@temp = split(/ID=/,$gene2);
$gene2 = $temp[1];
if ($gene2 =~ /Parent=/)
{
@temp = split(/Parent=/,$gene2);
$gene2 = $temp[1];
# FIND THE GENE NAME FOR THIS TRANSCRIPT:
if (!($GENE->{$gene2}))
{
$errormsg = "ERROR: find_overlapping_genes: do not know gene name for transcript $gene2 (line $line)\n";
$errorcode = 103; # ERRORCODE=103 (TESTED FOR)
return(\%OVERLAP,$errorcode,$errormsg);
}
$gene2 = $GENE->{$gene2};
}
if ($gene1 ne $gene2)
{
$pair1 = $gene1."=".$gene2;
$pair2 = $gene2."=".$gene1;
if (!($SEEN{$pair1}) && !($SEEN{$pair2}))
{
# RECORD OVERLAPS FOR $gene1:
if (!($OVERLAP{$gene1})) { $OVERLAP{$gene1} = $gene2;}
else {$OVERLAP{$gene1} = $OVERLAP{$gene1}.",".$gene2;}
# RECORD OVERLAPS FOR $gene2:
if (!($OVERLAP{$gene2})) { $OVERLAP{$gene2} = $gene1;}
else {$OVERLAP{$gene2} = $OVERLAP{$gene2}.",".$gene1;}
$SEEN{$pair1} = 1;
$SEEN{$pair2} = 1;
}
}
}
close(OVERLAPS);
return(\%OVERLAP,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &read_genes_for_transcripts
sub test_read_genes_for_transcripts
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN
my $GENE; # HASH TABLE OF GENE NAMES FOR TRANSCRIPTS
my $input_gff; # INPUT GFF FILE
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_genes_for_transcripts: cannot open $input_gff\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">seq1\n";
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n";
close(INPUT_GFF);
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff);
if ($errorcode != 0) { print STDERR "ERROR: test_read_genes_for_transcripst: failed test1\n"; exit;}
if ($GENE->{"augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1"} ne "augustus_masked-scaff_000010-processed-gene-0.3") { print STDERR "ERROR: test_read_genes_for_transcripts: failed test1b\n"; exit;}
if ($GENE->{"augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1"} ne "augustus_masked-scaff_000010-processed-gene-0.4") { print STDERR "ERROR: test_read_genes_for_transcripts: failed test1c\n"; exit;}
system "rm -f $input_gff";
# TEST ERRORCODE=9 (mRNA APPEARS TWICE IN GFF):
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_genes_for_transcripts: cannot open $input_gff\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">seq1\n";
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n";
close(INPUT_GFF);
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff);
if ($errorcode != 9) { print STDERR "ERROR: test_read_genes_for_transcripts: failed test2\n"; exit;}
system "rm -f $input_gff";
# TEST ERRORCODE=3 (LINE DOES NOT HAVE 9 COLUMNS):
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_genes_for_transcripts: cannot open $input_gff\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">seq1\n";
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n";
close(INPUT_GFF);
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff);
if ($errorcode != 3) { print STDERR "ERROR: test_read_genes_for_transcripts: failed test3\n"; exit;}
system "rm -f $input_gff";
# TEST MAKER EXAMPLE:
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_genes_for_transcripts: cannot open $input_gff\n";
print INPUT_GFF "SSTP.scaffold.00055.227916\tmaker\tmRNA\t28133\t41313\t.\t+\t.\tID=1:SRAE_2027700;Parent=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.59;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.59-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|0|0|1|1|1|2|0|4380;score=1000\n";
print INPUT_GFF "SSTP.scaffold.00055.227916\tmaker\tmRNA\t49678\t50700\t.\t+\t.\tID=augustus_masked-SSTP.scaffold.00055.227916-processed-gene-0.264-mRNA-1;Parent=augustus_masked-SSTP.scaffold.00055.227916-processed-gene-0.264;Name=augustus_masked-SSTP.scaffold.00055.227916-processed-gene-0.264-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|340\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">seq1\n";
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n";
close(INPUT_GFF);
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff);
if ($errorcode != 0) { print STDERR "ERROR: test_read_genes_for_transcripst: failed test4\n"; exit;}
if ($GENE->{"1:SRAE_2027700"} ne "maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.59") { print STDERR "ERROR: test_read_genes_for_transcripts: failed test4b\n"; exit;}
if ($GENE->{"maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.59-mRNA-1"} ne "maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.59") { print STDERR "ERROR: test_read_genes_for_transcripts: failed test4c\n"; exit;}
if ($GENE->{"augustus_masked-SSTP.scaffold.00055.227916-processed-gene-0.264-mRNA-1"} ne "augustus_masked-SSTP.scaffold.00055.227916-processed-gene-0.264") { print STDERR "ERROR: test_read_genes_for_transcripts: failed test4d\n"; exit;}
system "rm -f $input_gff";
}
#------------------------------------------------------------------#
# READ IN THE GENE NAME FOR EACH TRANSCRIPT IN THE INPUT GFF FILE:
sub read_genes_for_transcripts
{
my $input_gff = $_[0]; # INPUT GFF FILE
my $errorcode = 0; # THIS IS RETURNED AS 0 IF NO ERROR OCCURRED.
my $errormsg = "none";# THIS IS RETURNED AS 'none' IF NO ERROR OCCURRED.
my %GENE = (); # HASH TABLE OF GENE NAMES FOR TRANSCRIPT NAMES.
my $line; #
my @temp; #
my $end_of_gff = 0; # SAYS WHETHER THE END OF THE GFF FILE HAS BEEN REACHED
my $name; # NAME OF mRNA
my $gene; # NAME OF GENE
my $name2; # NAME OF mRNA
open(INPUT_GFF,"$input_gff") || die "ERROR: read_genes_for_transcripts: cannot open $input_gff\n";
while(<INPUT_GFF>)
{
$line = $_;
chomp $line;
if ($line eq '##FASTA') { $end_of_gff = 1;}
if (substr($line,0,1) ne '#' && $end_of_gff == 0)
{
@temp = split(/\t+/,$line);
if ($#temp == 8)
{
if ($temp[2] eq 'mRNA')
{
# eg. scaff_000010 maker mRNA 88793 89828 . + . ID=maker-scaff_000010-augustus-gene-0.21-mRNA-1;Parent=maker-scaff_000010-augustus-gene-0.21;...
$name = $temp[8];
$gene = $temp[8];
@temp = split(/ID=/,$name);
$name = $temp[1]; # eg. maker-scaff_000010-augustus-gene-0.21-mRNA-1;...
@temp = split(/\;/,$name);
$name = $temp[0]; # eg. maker-scaff_000010-augustus-gene-0.21-mRNA-1
# GET THE NAME OF THE GENE:
@temp = split(/Parent=/,$gene);
$gene = $temp[1]; # eg. maker-scaff_000010-augustus-gene-0.21;...
@temp = split(/\;/,$gene);
$gene = $temp[0]; # eg. maker-scaff_000010-augustus-gene-0.21
# RECORD THE GENE NAME FOR THE mRNA:
if ($GENE{$name})
{
$errormsg= "ERROR: read_genes_for_transcripts: already have gene name for $name\n";
$errorcode= 9; # ERRORCODE=9 (TESTED FOR)
return(\%GENE,$errorcode,$errormsg);
}
$GENE{$name} = $gene;
# SEE WHAT NAME IS STORED IN THE Name= PART OF THE LINE:
@temp = split(/\t+/,$line);
$name2 = $temp[8];
@temp = split(/Name=/,$name2);
$name2 = $temp[1];
@temp = split(/\;/,$name2);
$name2 = $temp[0];
if ($name2 ne $name)
{
if ($GENE{$name2})
{
$errormsg= "ERROR: read_genes_for_transcripts: already have gene name for $name2\n";
$errorcode= 9; # ERRORCODE=9 (TESTED FOR)
return(\%GENE,$errorcode,$errormsg);
}
$GENE{$name2} = $gene;
}
}
}
else
{
$errormsg = "ERROR: read_genes_for_transcripts: line does not have 9 columns: $line\n";
$errorcode = 3; # ERRORCODE=3 (TESTED FOR)
return(\%GENE,$errorcode,$errormsg);
}
}
}
close(INPUT_GFF);
return(\%GENE,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &read_maker_scores
sub test_read_maker_scores
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN
my $gff; # GFF FILE
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
my %SCORE = (); # HASH TABLE WITH SCORES FOR GENES
my $DELETE_TRANSCRIPT; # HASH TABLE OF TRANSCRIPTS TO DELETE
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_read_maker_scores: cannot open gff $gff\n";
print GFF "SSTP.scaffold.00055.227916 maker gene 7006 10673 . - . ID=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 maker mRNA 7006 10673 . - . ID=1:SRAE_2028500;Parent=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|0|0|1|1|1|2|0|1206;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match 7006 10673 1000 - . ID=SSTP.scaffold.00055.227916:hit:3043:3.5.0.0;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52-mRNA-1;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match_part 10537 10673 1000 - . ID=SSTP.scaffold.00055.227916:hsp:5479:3.5.0.0;Parent=SSTP.scaffold.00055.227916:hit:3043:3.5.0.0;Target=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52-mRNA-1 3485 3621 +;Gap=M137\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match_part 7006 10489 1000 - . ID=SSTP.scaffold.00055.227916:hsp:5480:3.5.0.0;Parent=SSTP.scaffold.00055.227916:hit:3043:3.5.0.0;Target=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52-mRNA-1 1 3484 +;Gap=M3484\n";
print GFF "SSTP.scaffold.00055.227916 maker gene 6942 8068 . + . ID=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 maker mRNA 6942 8068 . + . ID=1:SRAE_2028600;Parent=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69-mRNA-1;_AED=1.00;_eAED=1.00;_QI=106|0|0|0|1|1|2|884|43;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match 6942 8068 1000 + . ID=SSTP.scaffold.00055.227916:hit:3060:3.5.0.0;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69-mRNA-1;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match_part 6942 6956 1000 + . ID=SSTP.scaffold.00055.227916:hsp:5520:3.5.0.0;Parent=SSTP.scaffold.00055.227916:hit:3060:3.5.0.0;Target=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69-mRNA-1 1 15 +;Gap=M15\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match_part 6962 8068 1000 + . ID=SSTP.scaffold.00055.227916:hsp:5521:3.5.0.0;Parent=SSTP.scaffold.00055.227916:hit:3060:3.5.0.0;Target=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69-mRNA-1 16 1122 +;Gap=M1107\n";
close(GFF);
($DELETE_TRANSCRIPT,$errorcode,$errormsg) = &read_maker_scores($gff,\%SCORE,"none",1);
if ($errorcode != 0) { print STDERR "ERROR: test_read_maker_scores: failed test1\n"; exit;}
if ($SCORE{"maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69"} ne '1.00') { print STDERR "ERROR: test_read_maker_scores: failed test1b\n"; exit;}
if ($SCORE{"maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52"} != 0) { print STDERR "ERROR: test_read_maker_scores: failed test1c\n"; exit;}
system "rm -f $gff";
# TEST FOR ERRORCODE=1 (mRNA DOESN'T HAVE AED SCORE):
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_read_maker_scores: cannot open gff $gff\n";
print GFF "SSTP.scaffold.00055.227916 maker gene 7006 10673 . - . ID=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 maker mRNA 7006 10673 . - . ID=1:SRAE_2028500;Parent=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52-mRNA-1;_QI=0|0|0|1|1|1|2|0|1206;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match 7006 10673 1000 - . ID=SSTP.scaffold.00055.227916:hit:3043:3.5.0.0;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52-mRNA-1;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match_part 10537 10673 1000 - . ID=SSTP.scaffold.00055.227916:hsp:5479:3.5.0.0;Parent=SSTP.scaffold.00055.227916:hit:3043:3.5.0.0;Target=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52-mRNA-1 3485 3621 +;Gap=M137\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match_part 7006 10489 1000 - . ID=SSTP.scaffold.00055.227916:hsp:5480:3.5.0.0;Parent=SSTP.scaffold.00055.227916:hit:3043:3.5.0.0;Target=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52-mRNA-1 1 3484 +;Gap=M3484\n";
print GFF "SSTP.scaffold.00055.227916 maker gene 6942 8068 . + . ID=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 maker mRNA 6942 8068 . + . ID=1:SRAE_2028600;Parent=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69-mRNA-1;_AED=1.00;_eAED=1.00;_QI=106|0|0|0|1|1|2|884|43;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match 6942 8068 1000 + . ID=SSTP.scaffold.00055.227916:hit:3060:3.5.0.0;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69-mRNA-1;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match_part 6942 6956 1000 + . ID=SSTP.scaffold.00055.227916:hsp:5520:3.5.0.0;Parent=SSTP.scaffold.00055.227916:hit:3060:3.5.0.0;Target=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69-mRNA-1 1 15 +;Gap=M15\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match_part 6962 8068 1000 + . ID=SSTP.scaffold.00055.227916:hsp:5521:3.5.0.0;Parent=SSTP.scaffold.00055.227916:hit:3060:3.5.0.0;Target=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69-mRNA-1 16 1122 +;Gap=M1107\n";
close(GFF);
($DELETE_TRANSCRIPT,$errorcode,$errormsg) = &read_maker_scores($gff,\%SCORE,"none",1);
if ($errorcode != 1) { print STDERR "ERROR: test_read_maker_scores: failed test2 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
system "rm -f $gff";
# TEST ERRORCODE=4 (GFF FILE DOES NOT HAVE 8 COLUMNS):
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_read_maker_scores: cannot open gff $gff\n";
print GFF "SSTP.scaffold.00055.227916 maker gene 7006 10673 . - . ID=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 maker mRNA 7006 10673 . - . ID=1:SRAE_2028500;Parent=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|0|0|1|1|1|2|0|1206;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match 7006 10673 1000 - . ID=SSTP.scaffold.00055.227916:hit:3043:3.5.0.0;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52-mRNA-1;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match_part 10537 10673 1000 - . ID=SSTP.scaffold.00055.227916:hsp:5479:3.5.0.0;Parent=SSTP.scaffold.00055.227916:hit:3043:3.5.0.0;Target=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52-mRNA-1 3485 3621 +;Gap=M137\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match_part 7006 10489 1000 - . ID=SSTP.scaffold.00055.227916:hsp:5480:3.5.0.0;Parent=SSTP.scaffold.00055.227916:hit:3043:3.5.0.0;Target=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.52-mRNA-1 1 3484 +;Gap=M3484\n";
print GFF "SSTP.scaffold.00055.227916 maker gene 6942 8068 . + . ID=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 maker mRNA 6942 8068 . + . ID=1:SRAE_2028600;Parent=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69-mRNA-1;_AED=1.00;_eAED=1.00;_QI=106|0|0|0|1|1|2|884|43;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl 6942 8068 1000 + . ID=SSTP.scaffold.00055.227916:hit:3060:3.5.0.0;Name=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69-mRNA-1;score=1000\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match_part 6942 6956 1000 + . ID=SSTP.scaffold.00055.227916:hsp:5520:3.5.0.0;Parent=SSTP.scaffold.00055.227916:hit:3060:3.5.0.0;Target=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69-mRNA-1 1 15 +;Gap=M15\n";
print GFF "SSTP.scaffold.00055.227916 pred_gff:embl match_part 6962 8068 1000 + . ID=SSTP.scaffold.00055.227916:hsp:5521:3.5.0.0;Parent=SSTP.scaffold.00055.227916:hit:3060:3.5.0.0;Target=maker-SSTP.scaffold.00055.227916-pred_gff_embl-gene-0.69-mRNA-1 16 1122 +;Gap=M1107\n";
close(GFF);
($DELETE_TRANSCRIPT,$errorcode,$errormsg) = &read_maker_scores($gff,\%SCORE,"none",1);
if ($errorcode != 4) { print STDERR "ERROR: test_read_maker_scores: failed test3\n"; exit;}
system "rm -f $gff";
# EXAMPLE WITH TWO TRANSCRIPTS IN A GENE, ONE WITH HIGHER SCORE:
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_read_maker_scores: cannot open gff $gff\n";
print GFF "Sratt_Chr00_000002 maker gene 371193 372952 . + . ID=SRAE_0000033601;Name=SRAE_0000033601\n";
print GFF "Sratt_Chr00_000002 maker mRNA 371193 372952 . + . ID=SRAE_0000033601-mRNA-1;Parent=SRAE_0000033601;Name=SRAE_0000033601-mRNA-1;_AED=0.02;_eAED=0.02;_QI=550|1|1|1|0|0|3|60|350;score=3922\n";
print GFF "Sratt_Chr00_000002 maker mRNA 371415 372952 . + . ID=SRAE_0000033601-mRNA-2;Parent=SRAE_0000033601;Name=SRAE_0000033601-mRNA-2;_AED=0.07;_eAED=0.07;_QI=378|1|1|1|0|0|2|60|350;score=7307\n";
print GFF "Sratt_Chr00_000002 maker exon 371193 371384 . + . ID=SRAE_0000033601-mRNA-1:exon:1;Parent=SRAE_0000033601-mRNA-1\n";
print GFF "Sratt_Chr00_000002 maker exon 371435 371821 . + . ID=SRAE_0000033601-mRNA-1:exon:2;Parent=SRAE_0000033601-mRNA-1\n";
print GFF "Sratt_Chr00_000002 maker exon 371415 371821 . + . ID=SRAE_0000033601-mRNA-2:exon:1;Parent=SRAE_0000033601-mRNA-2\n";
print GFF "Sratt_Chr00_000002 maker five_prime_UTR 371193 371384 . + . ID=SRAE_0000033601-mRNA-1:five_prime_utr;Parent=SRAE_0000033601-mRNA-1\n";
print GFF "Sratt_Chr00_000002 maker five_prime_UTR 371435 371792 . + . ID=SRAE_0000033601-mRNA-1:five_prime_utr;Parent=SRAE_0000033601-mRNA-1\n";
print GFF "Sratt_Chr00_000002 maker CDS 371793 371821 . + 0 ID=SRAE_0000033601-mRNA-1:cds;Parent=SRAE_0000033601-mRNA-1\n";
print GFF "Sratt_Chr00_000002 maker CDS 371869 372892 . + 1 ID=SRAE_0000033601-mRNA-1:cds;Parent=SRAE_0000033601-mRNA-1\n";
print GFF "Sratt_Chr00_000002 maker three_prime_UTR 372893 372952 . + . ID=SRAE_0000033601-mRNA-1:three_prime_utr;Parent=SRAE_0000033601-mRNA-1\n";
print GFF "Sratt_Chr00_000002 maker five_prime_UTR 371415 371792 . + . ID=SRAE_0000033601-mRNA-2:five_prime_utr;Parent=SRAE_0000033601-mRNA-2\n";
print GFF "Sratt_Chr00_000002 maker CDS 371793 371821 . + 0 ID=SRAE_0000033601-mRNA-2:cds;Parent=SRAE_0000033601-mRNA-2\n";
print GFF "Sratt_Chr00_000002 maker CDS 371869 372892 . + 1 ID=SRAE_0000033601-mRNA-2:cds;Parent=SRAE_0000033601-mRNA-2\n";
print GFF "Sratt_Chr00_000002 maker three_prime_UTR 372893 372952 . + . ID=SRAE_0000033601-mRNA-2:three_prime_utr;Parent=SRAE_0000033601-mRNA-2\n";
close(GFF);
($DELETE_TRANSCRIPT,$errorcode,$errormsg) = &read_maker_scores($gff,\%SCORE,"none",1);
if ($errorcode != 0) { print STDERR "ERROR: test_read_maker_scores: failed test4\n"; exit;}
my $score = $SCORE{"SRAE_0000033601"};
if ($SCORE{"SRAE_0000033601"} ne '0.07') { print STDERR "ERROR: test_read_maker_scores: failed test4b\n"; exit;}
system "rm -f $gff";
}
#------------------------------------------------------------------#
# IF IT IS A MAKER GFF FILE, READ IN THE SCORE INFO. FROM THE GFF FILE:
sub read_maker_scores
{
my $input_gff = $_[0]; # INPUT GFF FILE
my $SCORE = $_[1]; # HASH TABLE OF SCORES FOR MAKER GENES
my $subtract_from = $_[2]; # NUMBER TO SUBTRACT THE MAKER SCORES FROM
my $testing = $_[3]; # SAYS WHETHER THIS IS BEING CALLED FROM A TESTING SUBROUTINE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $line; #
my @temp; #
my $name; # NAME FOR mRNA
my $gene; # GENE NAME
my $score; # SCORE FOR GENE
my $end_of_gff = 0; # SAYS WHETHER WE HAVE REACHED THE END OF THE GFF FILE
my $current_score; # CURRENTLY STORED SCORE FOR A GENE
my $transcript; # TRANSCRIPT NAME
my $current_transcript; # CURRENTLY STORED HIGHEST-SCORING TRANSCRIPT FOR A GENE
my %DELETE_TRANSCRIPT = (); # HASH TABLE OF TRANSCRIPTS TO DELETE
my %SCORE2 = (); # HASH TABLE OF SCORES
# THE MAKER OUTPUT FILE HAS AN AED SCORE FOR EACH mRNA FEATURE.
# IF THERE ARE SEVERAL mRNAs IN ONE GENE, TAKE THE MAXIMUM SCORE OF ALL THE mRNAs IN THE GENE AS THE GENE SCORE.
open(INPUT_GFF,"$input_gff") || die "ERROR: read_maker_scores: cannot open $input_gff\n";
while(<INPUT_GFF>)
{
$line = $_;
chomp $line;
if ($line eq '##FASTA') { $end_of_gff = 1;} # END OF GFF FILE
if (substr($line,0,1) ne '#' && $end_of_gff == 0)
{
@temp = split(/\t+/,$line);
if ($#temp == 8)
{
if ($temp[2] eq 'mRNA') # mRNA FEATURE
{
# eg. SSTP.scaffold.00055.227916 maker mRNA 49678 50700 . + . ID=augustus_masked-SSTP.scaffold.00055.227916-processed-gene-0.264-mRNA-1;Parent=augustus_masked-SSTP.scaffold.00055.227916-processed-gene-0.264;Name=augustus_masked-SSTP.scaffold.00055.227916-processed-gene-0.264-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|340
$name = $temp[8]; # eg. ID=augustus_masked-SSTP.scaffold.00055.227916-processed-gene-0.264-mRNA-1;Parent=augustus_masked-SSTP.scaffold.00055.227916-processed-gene-0.264;Name=augustus_masked-SSTP.scaffold.00055.227916-processed-gene-0.264-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|340
@temp = split(/Parent=/,$name);
$gene = $temp[1]; # eg. augustus_masked-SSTP.scaffold.00055.227916-processed-gene-0.264;Name=augustus_masked-SSTP.scaffold.00055.227916-processed-gene-0.264-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|340
@temp = split(/\;/,$gene);
$gene = $temp[0]; # eg. augustus_masked-SSTP.scaffold.00055.227916-processed-gene-0.264;
@temp = split(/ID=/,$name);
$transcript = $temp[1]; # eg. augustus_masked-SSTP.scaffold.00055.227916-processed-gene-0.264-mRNA-1;Pa...
@temp = split(/\;/,$transcript);
$transcript = $temp[0]; # eg. augustus_masked-SSTP.scaffold.00055.227916-processed-gene-0.264-mRNA-1
# NOW GET THE SCORE:
if (!($name =~ /AED=/))
{
$errormsg = "ERROR: read_maker_scores: name $name does not seem to have AED score\n";
$errorcode = 1; # ERRORCODE=1 (TESTED FOR)
return(\%DELETE_TRANSCRIPT,$errorcode,$errormsg);
}
@temp = split(/AED=/,$name);
$score = $temp[1]; # eg. 0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|340
@temp = split(/\;/,$score);
$score = $temp[0]; # eg. 0.00
if ($subtract_from ne 'none') # eg. FOR MAKER, WE WANT TO SUBTRACT THE SCORE FROM 1
{
$score = $subtract_from - $score;
}
if (!($SCORE2{$gene})) # THERE IS NO SCORE STORED FOR THIS GENE YET
{
$SCORE->{$gene} = $score;
$SCORE2{$gene} = $score."=".$transcript;
}
else
{
$current_score = $SCORE2{$gene};
@temp = split(/=/,$current_score);
$current_score = $temp[0];
$current_transcript = $temp[1];
if ($score > $current_score)
{
$SCORE->{$gene} = $score;
$SCORE2{$gene} = $score."=".$transcript;
$DELETE_TRANSCRIPT{$current_transcript} = 1; # DELETE $current_transcript, AS $transcript HAS A HIGHER SCORE
if ($testing == 0) { print STDERR "WARNING: discarding transcript $current_transcript, as $transcript has a higher score...\n";}
}
else
{
$DELETE_TRANSCRIPT{$transcript} = 1; # DELETE $transcript, AS $current_transcript HAS A HIGHER SCORE
if ($testing == 0) { print STDERR "WARNING: discarding transcript $transcript, as $current_transcript has a higher score...\n";}
}
}
}
}
else
{
$errormsg = "ERROR: read_maker_scores: gff line $line does not seem to have 8 columns\n";
$errorcode = 4; # ERRORCODE=4 (TESTED FOR)
return(\%DELETE_TRANSCRIPT,$errorcode,$errormsg);
}
}
}
close(INPUT_GFF);
return(\%DELETE_TRANSCRIPT,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &find_best_predictions
sub test_find_best_predictions
{
my %OVERLAP = (); # HASH TABLE WITH OVERLAPS BETWEEN GENES
my %SCORE = (); # HASH TABLE WITH SCORES OF GENES
my $DISCARD; # HASH TABLE OF GENES TO DISCARD
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
$SCORE{"gene1"} = 10;
$SCORE{"gene2"} = 9.5;
$SCORE{"gene3"} = 9.0;
$SCORE{"gene4"} = 8.5;
$SCORE{"gene5"} = 8.0;
$SCORE{"gene6"} = 1.0;
# ---gene1------
# -----gene2----
# -----gene3----
# ---gene4---
# ---gene5---
# --gene6--
$OVERLAP{"gene1"} = "gene2";
$OVERLAP{"gene2"} = "gene1,gene3";
$OVERLAP{"gene3"} = "gene2";
$OVERLAP{"gene4"} = "gene5";
$OVERLAP{"gene5"} = "gene4";
($DISCARD,$errorcode,$errormsg) = &find_best_predictions(1,\%OVERLAP,\%SCORE,2);
if ($errorcode != 0) { print STDERR "ERROR: test_find_best_predictions: failed test1\n"; exit;}
if ($DISCARD->{"gene1"}) { print STDERR "ERROR: test_find_best_predictions: failed test1b\n"; exit;}
if (!($DISCARD->{"gene2"})) { print STDERR "ERROR: test_find_best_predictions: failed test1c\n"; exit;}
if ($DISCARD->{"gene3"}) { print STDERR "ERROR: test_find_best_predictions: failed test1d\n"; exit;}
if ($DISCARD->{"gene4"}) { print STDERR "ERROR: test_find_best_predictions: failed test1e\n"; exit;}
if (!($DISCARD->{"gene5"})) { print STDERR "ERROR: test_find_best_predictions: failed test1f\n"; exit;}
if (!($DISCARD->{"gene6"})) { print STDERR "ERROR: test_find_best_predictions: failed test1g\n"; exit;}
# TEST ERRORCODE=107 (SCORE NOT DEFINED FOR A GENE):
%SCORE = ();
# ---gene1------
# -----gene2----
# -----gene3----
# ---gene4---
# ---gene5---
# --gene6--
$SCORE{"gene1"} = 10;
$OVERLAP{"gene1"} = "gene2";
$OVERLAP{"gene2"} = "gene1,gene3";
$OVERLAP{"gene3"} = "gene2";
$OVERLAP{"gene4"} = "gene5";
$OVERLAP{"gene5"} = "gene4";
($DISCARD,$errorcode,$errormsg) = &find_best_predictions(1,\%OVERLAP,\%SCORE,2);
if ($errorcode != 107) { print STDERR "ERROR: test_find_best_predictions: failed test2 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
}
#------------------------------------------------------------------#
# FIND THE HIGHEST-SCORING NON-OVERLAPPING GENE PREDICTIONS:
sub find_best_predictions
{
my $testing = $_[0]; # SAYS WHETHER THIS IS BEING CALLED FROM A TESTING SUBROUTINE
my $OVERLAP = $_[1]; # HASH TABLE OF OVERLAPPING GENES
my $SCORE = $_[2]; # HASH TABLE OF GENE SCORES
my $min_gene_score = $_[3]; # MINIMUM SCORE OF GENES TO TAKE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my @genes; # ARRAY OF ALL THE GENES
my $i; #
my $gene; # GENE
my $score; # GENE SCORE
my $overlaps; # OVERLAPPING GENES FOR $gene
my @overlaps; # OVERLAPPING GENES FOR $gene
my $j; #
my $overlap; # AN OVERLAPPING GENE FOR $gene
my $overlap_score; # SCORE FOR $overlap
my %DISCARD = (); # HASH TABLE OF GENES TO DISCARD
# GET AN ARRAY OF ALL THE GENES:
@genes = keys %{$SCORE};
# NOW SORT THE ARRAY BY SCORE:
@genes = sort { $SCORE->{$a} <=> $SCORE->{$b} } @genes;
# NOW LOOP THROUGH THE GENES, TAKING THE HIGHEST SCORING GENE FIRST:
for ($i = $#genes; $i >= 0; $i--)
{
$gene = $genes[$i];
$score = $SCORE->{$gene};
# IGNORE THIS GENE IF WE HAVE DECIDED TO DISCARD IT:
if (!($DISCARD{$gene}))
{
# FIND THE GENES OVERLAPPING $gene:
if ($OVERLAP->{$gene})
{
$overlaps = $OVERLAP->{$gene};
@overlaps = split(/\,/,$overlaps);
for ($j = 0; $j <= $#overlaps; $j++)
{
$overlap = $overlaps[$j];
if (!(defined($SCORE->{$overlap})))
{
$errorcode = 107; # ERRORCODE=107 (TESTED FOR)
$errormsg = "ERROR: find_best_predictions: do not know score for $overlap\n";
return(\%DISCARD,$errorcode,$errormsg);
}
$overlap_score = $SCORE->{$overlap};
# IF $overlap HAS A LOWER SCORE, WE WANT TO DELETE IT:
if ($overlap_score < $score)
{
$DISCARD{$overlap} = 1;
if ($testing == 0) { print STDERR "Discarding $overlap ($overlap_score) as it overlaps $gene ($score)...\n";}
}
}
}
}
# IF THE SCORE IS < $min_gene_score, WE WANT TO DISCARD THE GENE:
if ($score < $min_gene_score)
{
$DISCARD{$gene} = 1;
if ($testing == 0) { print STDERR "Discarding $gene has it has score $score (<$min_gene_score)...\n";}
}
}
return(\%DISCARD,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &read_scores
sub test_read_scores
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
my $gff; # INPUT GFF FILE
my %SCORE = (); # HASH TABLE OF GENE SCORES
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_run_main_program: cannot open gff $gff\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tintron\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tCDS\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tCDS\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tCDS\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tCDS\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tCDS\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-5\n";
close(GFF);
($errorcode,$errormsg) = &read_scores($input_gff,\%SCORE,0);
if ($errorcode != 0) { print STDERR "ERROR: test_read_scores: failed test1\n"; exit;}
if ($SCORE{"ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1"} != -24.36) { print STDERR "ERROR: test_read_scores: failed test1b\n"; exit;}
if ($SCORE{"ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2"} != -3.52) { print STDERR "ERROR: test_read_scores: failed test1c\n"; exit;}
if ($SCORE{"ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3"} != -3.42) { print STDERR "ERROR: test_read_scores: failed test1d\n"; exit;}
if ($SCORE{"ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4"} != 200.15) { print STDERR "ERROR: test_read_scores: failed test1e\n"; exit;}
if ($SCORE{"ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5"} != 252.15) { print STDERR "ERROR: test_read_scores: failed test1f\n"; exit;}
if ($SCORE{"ACAC.scaffold.00050.383205-TF101001-genewise-prediction-1"} != -24.36) { print STDERR "ERROR: test_read_scores: failed test1g\n"; exit;}
if ($SCORE{"ACAC.scaffold.00050.383205-TF101001-genewise-prediction-2"} != -3.52) { print STDERR "ERROR: test_read_scores: failed test1h\n"; exit;}
if ($SCORE{"ACAC.scaffold.00050.383205-TF101001-genewise-prediction-3"} != -3.42) { print STDERR "ERROR: test_read_scores: failed test1i\n"; exit;}
if ($SCORE{"ACAC.scaffold.00050.383205-TF101001-genewise-prediction-4"} != 200.15) { print STDERR "ERROR: test_read_scores: failed test1j\n"; exit;}
if ($SCORE{"ACAC.scaffold.00050.383205-TF101001-genewise-prediction-5"} != 252.15) { print STDERR "ERROR: test_read_scores: failed test1k\n"; exit;}
system "rm -f $gff";
# TEST ERRORCODE=101 (DO NOT KNOW SCORE FOR GENE):
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_run_main_program: cannot open gff $gff\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tintron\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tCDS\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tCDS\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tCDS\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tCDS\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tCDS\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-5\n";
close(GFF);
$SCORE{"ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1"} = 101;
($errorcode,$errormsg) = &read_scores($input_gff,\%SCORE,0);
if ($errorcode != 101) { print STDERR "ERROR: test_read_scores: failed test2\n"; exit;}
system "rm -f $gff";
# TEST ERRORCODE=102 (GFF DOES NOT HAVE 8 COLUMNS):
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_run_main_program: cannot open gff $gff\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tintron\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tCDS\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tCDS\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tCDS\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tCDS\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tCDS\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383205-TF101001-genewise-prediction-5\n";
close(GFF);
($errorcode,$errormsg) = &read_scores($input_gff,\%SCORE,0);
if ($errorcode != 102) { print STDERR "ERROR: test_read_scores: failed test3\n"; exit;}
system "rm -f $gff";
}
#------------------------------------------------------------------#
# READ IN THE GENE SCORES:
sub read_scores
{
my $input_gff = $_[0]; # INPUT GFF FILE
my $SCORE = $_[1]; # HASH TABLE FOR STORING GENE SCORES
my $subtract_from = $_[2]; # SUBTRACT THE SCORE FROM THIS NUMBER
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $end_of_gff = 0; # SAYS WHETHER WE HAVE REACHED THE END OF THE GFF FILE
my $score; # GENE SCORE
my $gene; # GENE NAME
my $line; #
my @temp; #
open(INPUT_GFF,"$input_gff") || die "ERROR: read_scores: cannot open $input_gff\n";
while(<INPUT_GFF>)
{
$line = $_;
chomp $line;
if ($line eq '##FASTA') { $end_of_gff = 1;} # END OF GFF FILE
if (substr($line,0,1) ne '#' && $end_of_gff == 0)
{
@temp = split(/\t+/,$line);
if ($#temp == 8)
{
if ($temp[2] eq 'gene') # gene FEATURE
{
$score = $temp[5];
$gene = $temp[8];
@temp = split(/ID=/,$gene);
$gene = $temp[1];
if ($subtract_from ne 'none') # eg. FOR MAKER, WE WANT TO SUBTRACT THE SCORE FROM 1
{
$score = $subtract_from - $score;
}
if ($SCORE->{$gene})
{
$errormsg = "ERROR: read_scores: already have score for $gene line $line\n";
$errorcode = 101; # ERRORCODE=101 (TESTED FOR)
return($errorcode,$errormsg);
}
$SCORE->{$gene} = $score;
}
}
else
{
$errormsg = "ERROR: read_scores: gff line $line does not seem to have 8 columns\n";
$errorcode = 102; # ERRORCODE=102 (TESTED FOR)
return($errorcode,$errormsg);
}
}
}
close(INPUT_GFF);
return($errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &make_cds_gff
sub test_make_cds_gff
{
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES INTO
my $input_gff; # INPUT GFF FILE
my $new_gff; # NEW GFF FILE
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETUREND AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
my $expected_new_gff; # FILE WITH EXPECTED CONTENTS OF $new_gff
my $differences; # DIFFERENCES BETWEEN $new_gff AND $expected_new_gff
my $length_differences; # LENGTH OF $differences
my $line; #
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_make_cds_gff: cannot open input_gff $input_gff\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise gene 185915 185917 -24.36 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise CDS 185915 185917 0.00 + 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise gene 185920 185931 -3.42 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise CDS 185920 185931 0.00 + 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise gene 206829 214134 252.15 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise CDS 214021 214134 0.00 - 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise intron 213951 214020 0.00 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise CDS 213864 213950 0.00 - 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise intron 213848 213863 0.00 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise CDS 213764 213847 0.00 - 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise intron 211581 213763 0.00 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise CDS 211467 211580 0.00 - 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise intron 211407 211466 0.00 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise CDS 211285 211406 0.00 - 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise intron 211224 211284 0.00 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise CDS 211098 211223 0.00 - 1 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise intron 211035 211097 0.00 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise CDS 210932 211034 0.00 - 1 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise intron 210844 210931 0.00 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise CDS 210731 210843 0.00 - 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise intron 207010 210730 0.00 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise CDS 206829 207009 0.00 - 1 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
close(INPUT_GFF);
($new_gff,$errorcode,$errormsg) = &make_cds_gff($input_gff,$outputdir);
if ($errorcode != 0) { print STDERR "ERROR: test_make_cds_gff: failed test1\n"; exit;}
($expected_new_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_new_gff") || die "ERROR: test_make_cds_gff: cannot open $expected_new_gff\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise CDS 185915 185917 0.00 + 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise CDS 185920 185931 0.00 + 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise CDS 214021 214134 0.00 - 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise CDS 213864 213950 0.00 - 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise CDS 213764 213847 0.00 - 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise CDS 211467 211580 0.00 - 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise CDS 211285 211406 0.00 - 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise CDS 211098 211223 0.00 - 1 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise CDS 210932 211034 0.00 - 1 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise CDS 210731 210843 0.00 - 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise CDS 206829 207009 0.00 - 1 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $new_gff $expected_new_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_make_cds_gff: failed test1 (new_gff $new_gff expected_new_gff $expected_new_gff)\n"; exit;}
system "rm -f $input_gff";
system "rm -f $new_gff";
system "rm -f $expected_new_gff";
}
#------------------------------------------------------------------#
# MAKE A GFF FILE WITH JUST THE 'cds' FEATURES (NOT 'gene' OR 'intron' FEATURES):
sub make_cds_gff
{
my $input_gff = $_[0]; # INPUT GFF FILE
my $outputdir = $_[1]; # DIRECTORY FOR PUTTING OUTPUT FILES INTO
my $new_gff; # NEW GFF FILE WITH JUST THE 'cds' FEATURES
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $line; #
my @temp; #
my $end_of_gff = 0; # SAYS WHETHER WE HAVE REACHED THE END OF THE GFF FILE
# JUST TAKE THE 'cds' FEATURES FROM $input_gff:
($new_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(NEW_GFF,">$new_gff") || die "ERROR: make_cds_gff: cannot open $new_gff\n";
open(INPUT_GFF,"$input_gff") || die "ERROR: make_cds_gff: cannot open $input_gff\n";
while(<INPUT_GFF>)
{
$line = $_;
chomp $line;
if ($line eq '##FASTA') { $end_of_gff = 1;} # WE HAVE REACHED THE END OF THE GFF FILE
if (substr($line,0,1) ne '#' && $end_of_gff == 0)
{
@temp = split(/\t+/,$line);
if ($temp[2] eq 'CDS') { print NEW_GFF "$line\n";}
}
}
close(INPUT_GFF);
close(NEW_GFF);
return($new_gff,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# SUBROUTINE TO MAKE A FILE NAME FOR A TEMPORARY FILE:
sub make_filename
{
my $outputdir = $_[0]; # OUTPUT DIRECTORY TO WRITE TEMPORARY FILE NAME TO
my $found_name = 0; # SAYS WHETHER WE HAVE FOUND A FILE NAME YET
my $filename = "none";# NEW FILE NAME TO USE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $poss_filename; # POSSIBLE FILE NAME TO USE
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAME
while($found_name == 0)
{
$random_number = rand();
$poss_filename = $outputdir."/tmp".$random_number;
if (!(-e $poss_filename))
{
$filename = $poss_filename;
$found_name = 1;
}
}
if ($found_name == 0 || $filename eq 'none')
{
$errormsg = "ERROR: make_filename: found_name $found_name filename $filename\n";
$errorcode = 6; # ERRORCODE=6
return($filename,$errorcode,$errormsg);
}
return($filename,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &print_error
sub test_print_error
{
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR
($errormsg,$errorcode) = &print_error(45,45,1);
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message','My error message',1);
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;}
($errormsg,$errorcode) = &print_error('none',45,1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message', 0, 1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;}
}
#------------------------------------------------------------------#
# PRINT OUT AN ERROR MESSAGE AND EXIT.
sub print_error
{
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED.
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED.
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/)
{
if ($called_from_test == 1)
{
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n";
exit;
}
}
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/))
{
if ($called_from_test == 1)
{
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n";
exit;
}
}
if ($errormsg eq 'none' || $errorcode == 0)
{
if ($called_from_test == 1)
{
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n";
exit;
}
}
else
{
print STDERR "$errormsg";
exit;
}
return($errormsg,$errorcode);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment