Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save srividya22/322549d6f0cdaa10aea41c87c702cdfd to your computer and use it in GitHub Desktop.
Save srividya22/322549d6f0cdaa10aea41c87c702cdfd to your computer and use it in GitHub Desktop.
Perl script that renames genes in the maker gff files so that they have unique names.
#!/usr/bin/env perl
=head1 NAME
rename_genes_in_maker_gff.pl
=head1 SYNOPSIS
rename_genes_in_maker_gff.pl input_gff output_gff outputdir species
where input_gff is the input gff file,
output_gff is the output gff file,
outputdir is the output directory for writing output files,
species is the abbreviation for the species name, eg. BPAG.
=head1 DESCRIPTION
This script takes an input gff file from maker (<input_gff>), and renames the genes
and transcripts, and writes the output gff file (<output_gff>) in directory
<outputdir>. This script can handle multiple transcripts per gene.
=head1 VERSION
Perl script last edited 9-May-2013.
=head1 CONTACT
alc@sanger.ac.uk (Avril Coghlan)
=cut
#
# Perl script rename_genes_in_maker_gff.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 9-May-13.
# Last edited 9-May-2013.
# SCRIPT SYNOPSIS: rename_genes_in_maker_gff.pl: renames genes in the maker gff files so that they have unique names.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
use strict;
use warnings;
my $num_args = $#ARGV + 1;
if ($num_args != 4)
{
print "Usage of rename_genes_in_maker_gff.pl\n\n";
print "perl rename_genes_in_maker_gff.pl <input_gff> <output_gff> <outputdir> <species>\n";
print "where <input_gff> is the input gff file,\n";
print " <output_gff> is the output gff file,\n";
print " <outputdir> is the output directory for writing output files,\n";
print " <species> is the the abbreviation for the species name\n";
print "For example, >perl rename_genes_in_maker_gff.pl round3.gff new_round3.gff\n";
print "/lustre/scratch108/parasites/alc/ BPAG\n";
exit;
}
# FIND THE PATH TO THE INPUT GFF FILE:
my $input_gff = $ARGV[0];
# FIND THE PATH TO THE OUTPUT GFF FILE:
my $output_gff = $ARGV[1];
# FIND THE DIRECTORY TO USE FOR OUTPUT FILES:
my $outputdir = $ARGV[2];
# FIND THE ABBREVIATION FOR THE SPECIES NAME:
my $species = $ARGV[3];
#------------------------------------------------------------------#
# TEST SUBROUTINES:
my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING.
&test_check_whether_new_lines_complete;
&test_find_overlapping_features($outputdir);
&test_sort_gene_gff($outputdir);
&test_print_error;
&test_make_gene_gff($outputdir);
&test_read_genes_for_transcripts($outputdir);
&test_get_new_genename;
&test_assign_exon_numbers($outputdir);
&test_fix_gff_lines_for_gene($outputdir);
&test_count_lines($outputdir);
&test_make_gene_gff2($outputdir);
&test_run_main_program($outputdir);
print STDERR "Finished tests, running main code now...\n";
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
&run_main_program($outputdir,$input_gff,$output_gff,$species,0);
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# TEST THE MAIN PROGRAM:
sub test_run_main_program
{
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES IN
my $input_gff; # INPUT GFF FILE
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
my $output_gff; # OUTPUT GFF FILE
my $expected_output_gff; # FILE WITH EXPECTED CONTENTS OF $output_gff
my $differences; # DIFFERENCES BETWEEN $output_gff AND $expected_output_gff
my $length_differences; # LENGTH OF $differences
my $line; #
my @temp; #
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_run_main_program: cannot open $input_gff\n";
print INPUT_GFF "##gff-version 3\n";
print INPUT_GFF "BPAG.contig.00154.117101 . contig 1 117101 . . . ID=BPAG.contig.00154.117101;Name=BPAG.contig.00154.117101\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:0;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 36179 36341 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 37052 37204 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:2;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 38326 38535 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:3;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 38781 38871 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:4;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 39087 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:5;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 36179 36341 . + 1 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 37052 37204 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 38326 38535 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 38781 38871 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 39087 39253 . + 2 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker gene 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker mRNA 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1;_AED=0.06;_eAED=0.06;_QI=0|0|0|1|1|1|3|0|276\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 103190 104650 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1:exon:1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 blastx protein_match 1346 1510 223 + . ID=BPAG.contig.00154.117101:hit:88:2.10.0.0;Name=tr|J9EFX0|J9EFX0_WUCBA\n";
print INPUT_GFF "BPAG.contig.00154.117101 blastx match_part 1346 1510 223 + . ID=BPAG.contig.00154.117101:hsp:88:2.10.0.0;Parent=BPAG.contig.00154.117101:hit:88:2.10.0.0;Target=tr|J9EFX0|J9EFX0_WUCBA 1 56;Gap=M24 I1 M31\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">BPAG.contig.10610.1000\n";
print INPUT_GFF "CCAAACGATGATTGAAAATTTATTCATTTCTTTGCTTTTAAAAGGTAATAGTTTAAAAGCAACATTCTCTTTCTTATATTTTCAAAGTTTAGAAGAAAGAAAATTTTAATAATTTTCATTAAAATAAAATTAGGTAAGATATTAGAAAGGTTCAGTTGAAGTATTCCAAAATTATTGCTATCCATTTATTATTTATTTAAAAAAAAAAAAAAAGATTGTTAATTACTTTCATTTCTTAAATTTGATTAATTATATCAAAGATTATGAAGAAAAGAAAAATATGCTTTTTGTATTTTTTAAATTTCTGTTTTCAATTATTTTACAAAAATTTTTGTTTATTTTATTAAATCATTTCTCTTTTTTTTTTTTTGCCATTTGATTTTGCCTATTATTTATACAAATAATCATAATTCACTTTAAATATTAATGGATGAATATTTAAAATTATTATTCCATTTGAATTTGAATGTTAGAAAAAAAAATTCTATATATGAAATGATGATTTCATATTATTAATTAATTTTTTTTTAATTAATTAATACTGAAATTATATTAGACGCAAATGCATTTGAAATATTTACAGGCATGAAGATAGGAATATATTAGGTGGCGTACAAATAATTGACGTTACGTAATACACGTAACTTGTTAAGAACGTCCATTCTGCAATGTTTCTATGCAAAAATATACCAAACGATAAACCGTAAATGTTGTGATAATAGCATAAAATTTGCAATATTAAATTTGGGATAAAAAATACGAACAATATTTAGAAGAAAAATATTTACATAAAAATTAAAAATCAAATTTGAACTTTTGCAAATATTTATTAGACAACAATTTTAAAATTACAACTTAACATAGCAAACATTGATTAAAAGTAAAAGCTCAAAGTAAACAAAAAAAAAAGAATTCATTTCTTCAGTTTAATTTTAAGCCCTGTAAATAGGAAAACAAAATTCTAAAATTATTGCAAATTTTCTAGAAATTTTTTAAAAAA\n";
close(INPUT_GFF);
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
@temp = split(/\//,$output_gff);
$output_gff = $temp[$#temp];
&run_main_program($outputdir,$input_gff,$output_gff,'SRAE',1);
($expected_output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_output_gff") || die "ERROR: test_run_main_program: failed test1\n";
print EXPECTED "##gff-version 3\n";
print EXPECTED "BPAG.contig.00154.117101 . contig 1 117101 . . . ID=BPAG.contig.00154.117101;Name=BPAG.contig.00154.117101\n";
print EXPECTED "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=SRAE_0000000001;Name=SRAE_0000000001\n";
print EXPECTED "BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=SRAE_0000000001-mRNA-1;Parent=SRAE_0000000001;Name=SRAE_0000000001-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=SRAE_0000000001-mRNA-1:exon:1;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 36179 36341 . + . ID=SRAE_0000000001-mRNA-1:exon:2;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 37052 37204 . + . ID=SRAE_0000000001-mRNA-1:exon:3;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 38326 38535 . + . ID=SRAE_0000000001-mRNA-1:exon:4;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 38781 38871 . + . ID=SRAE_0000000001-mRNA-1:exon:5;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 39087 39253 . + . ID=SRAE_0000000001-mRNA-1:exon:6;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=SRAE_0000000001-mRNA-1:cds;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 36179 36341 . + 1 ID=SRAE_0000000001-mRNA-1:cds;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 37052 37204 . + 0 ID=SRAE_0000000001-mRNA-1:cds;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 38326 38535 . + 0 ID=SRAE_0000000001-mRNA-1:cds;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 38781 38871 . + 0 ID=SRAE_0000000001-mRNA-1:cds;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 39087 39253 . + 2 ID=SRAE_0000000001-mRNA-1:cds;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker gene 103190 104650 . + . ID=SRAE_0000000101;Name=SRAE_0000000101\n";
print EXPECTED "BPAG.contig.00154.117101 maker mRNA 103190 104650 . + . ID=SRAE_0000000101-mRNA-1;Parent=SRAE_0000000101;Name=SRAE_0000000101-mRNA-1;_AED=0.06;_eAED=0.06;_QI=0|0|0|1|1|1|3|0|276\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 103190 104650 . + 0 ID=SRAE_0000000101-mRNA-1:exon:1;Parent=SRAE_0000000101-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 103190 104650 . + . ID=SRAE_0000000101-mRNA-1:cds;Parent=SRAE_0000000101-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 blastx protein_match 1346 1510 223 + . ID=BPAG.contig.00154.117101:hit:88:2.10.0.0;Name=tr|J9EFX0|J9EFX0_WUCBA\n";
print EXPECTED "BPAG.contig.00154.117101 blastx match_part 1346 1510 223 + . ID=BPAG.contig.00154.117101:hsp:88:2.10.0.0;Parent=BPAG.contig.00154.117101:hit:88:2.10.0.0;Target=tr|J9EFX0|J9EFX0_WUCBA 1 56;Gap=M24 I1 M31\n";
print EXPECTED "##FASTA\n";
print EXPECTED ">BPAG.contig.10610.1000\n";
print EXPECTED "CCAAACGATGATTGAAAATTTATTCATTTCTTTGCTTTTAAAAGGTAATAGTTTAAAAGCAACATTCTCTTTCTTATATTTTCAAAGTTTAGAAGAAAGAAAATTTTAATAATTTTCATTAAAATAAAATTAGGTAAGATATTAGAAAGGTTCAGTTGAAGTATTCCAAAATTATTGCTATCCATTTATTATTTATTTAAAAAAAAAAAAAAAGATTGTTAATTACTTTCATTTCTTAAATTTGATTAATTATATCAAAGATTATGAAGAAAAGAAAAATATGCTTTTTGTATTTTTTAAATTTCTGTTTTCAATTATTTTACAAAAATTTTTGTTTATTTTATTAAATCATTTCTCTTTTTTTTTTTTTGCCATTTGATTTTGCCTATTATTTATACAAATAATCATAATTCACTTTAAATATTAATGGATGAATATTTAAAATTATTATTCCATTTGAATTTGAATGTTAGAAAAAAAAATTCTATATATGAAATGATGATTTCATATTATTAATTAATTTTTTTTTAATTAATTAATACTGAAATTATATTAGACGCAAATGCATTTGAAATATTTACAGGCATGAAGATAGGAATATATTAGGTGGCGTACAAATAATTGACGTTACGTAATACACGTAACTTGTTAAGAACGTCCATTCTGCAATGTTTCTATGCAAAAATATACCAAACGATAAACCGTAAATGTTGTGATAATAGCATAAAATTTGCAATATTAAATTTGGGATAAAAAATACGAACAATATTTAGAAGAAAAATATTTACATAAAAATTAAAAATCAAATTTGAACTTTTGCAAATATTTATTAGACAACAATTTTAAAATTACAACTTAACATAGCAAACATTGATTAAAAGTAAAAGCTCAAAGTAAACAAAAAAAAAAGAATTCATTTCTTCAGTTTAATTTTAAGCCCTGTAAATAGGAAAACAAAATTCTAAAATTATTGCAAATTTTCTAGAAATTTTTTAAAAAA\n";
close(EXPECTED);
$output_gff = $outputdir."/".$output_gff;
$differences = "";
open(TEMP,"diff $output_gff $expected_output_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_run_main_program: failed test1 (output_gff $output_gff expected_output_gff $expected_output_gff)\n"; exit;}
system "rm -f $output_gff";
system "rm -f $input_gff";
system "rm -f $expected_output_gff";
# EXAMPLE WITH MULTIPLE TRANSCRIPTS IN THE SAME GENE:
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_run_main_program: cannot open $input_gff\n";
print INPUT_GFF "##gff-version 3\n";
print INPUT_GFF "BPAG.contig.00154.117101 . contig 1 117101 . . . ID=BPAG.contig.00154.117101;Name=BPAG.contig.00154.117101\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:0;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 36179 36341 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 37052 37204 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:2;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 38326 38535 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:3;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 38781 38871 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:4;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 39087 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:5;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 36179 36341 . + 1 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 37052 37204 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 38326 38535 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 38781 38871 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 39087 39253 . + 2 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2:exon:0;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 36179 36341 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2:exon:1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 37052 37204 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2:exon:2;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 38326 38535 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2:exon:3;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 38781 38871 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2:exon:4;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 39087 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2:exon:5;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 36179 36341 . + 1 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 37052 37204 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 38326 38535 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 38781 38871 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 39087 39253 . + 2 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-2\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker gene 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker mRNA 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1;_AED=0.06;_eAED=0.06;_QI=0|0|0|1|1|1|3|0|276\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 103190 104650 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1:exon:1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 blastx protein_match 1346 1510 223 + . ID=BPAG.contig.00154.117101:hit:88:2.10.0.0;Name=tr|J9EFX0|J9EFX0_WUCBA\n";
print INPUT_GFF "BPAG.contig.00154.117101 blastx match_part 1346 1510 223 + . ID=BPAG.contig.00154.117101:hsp:88:2.10.0.0;Parent=BPAG.contig.00154.117101:hit:88:2.10.0.0;Target=tr|J9EFX0|J9EFX0_WUCBA 1 56;Gap=M24 I1 M31\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">BPAG.contig.10610.1000\n";
print INPUT_GFF "CCAAACGATGATTGAAAATTTATTCATTTCTTTGCTTTTAAAAGGTAATAGTTTAAAAGCAACATTCTCTTTCTTATATTTTCAAAGTTTAGAAGAAAGAAAATTTTAATAATTTTCATTAAAATAAAATTAGGTAAGATATTAGAAAGGTTCAGTTGAAGTATTCCAAAATTATTGCTATCCATTTATTATTTATTTAAAAAAAAAAAAAAAGATTGTTAATTACTTTCATTTCTTAAATTTGATTAATTATATCAAAGATTATGAAGAAAAGAAAAATATGCTTTTTGTATTTTTTAAATTTCTGTTTTCAATTATTTTACAAAAATTTTTGTTTATTTTATTAAATCATTTCTCTTTTTTTTTTTTTGCCATTTGATTTTGCCTATTATTTATACAAATAATCATAATTCACTTTAAATATTAATGGATGAATATTTAAAATTATTATTCCATTTGAATTTGAATGTTAGAAAAAAAAATTCTATATATGAAATGATGATTTCATATTATTAATTAATTTTTTTTTAATTAATTAATACTGAAATTATATTAGACGCAAATGCATTTGAAATATTTACAGGCATGAAGATAGGAATATATTAGGTGGCGTACAAATAATTGACGTTACGTAATACACGTAACTTGTTAAGAACGTCCATTCTGCAATGTTTCTATGCAAAAATATACCAAACGATAAACCGTAAATGTTGTGATAATAGCATAAAATTTGCAATATTAAATTTGGGATAAAAAATACGAACAATATTTAGAAGAAAAATATTTACATAAAAATTAAAAATCAAATTTGAACTTTTGCAAATATTTATTAGACAACAATTTTAAAATTACAACTTAACATAGCAAACATTGATTAAAAGTAAAAGCTCAAAGTAAACAAAAAAAAAAGAATTCATTTCTTCAGTTTAATTTTAAGCCCTGTAAATAGGAAAACAAAATTCTAAAATTATTGCAAATTTTCTAGAAATTTTTTAAAAAA\n";
close(INPUT_GFF);
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
@temp = split(/\//,$output_gff);
$output_gff = $temp[$#temp];
&run_main_program($outputdir,$input_gff,$output_gff,'SRAE',1);
($expected_output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_output_gff") || die "ERROR: test_run_main_program: failed test2\n";
print EXPECTED "##gff-version 3\n";
print EXPECTED "BPAG.contig.00154.117101 . contig 1 117101 . . . ID=BPAG.contig.00154.117101;Name=BPAG.contig.00154.117101\n";
print EXPECTED "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=SRAE_0000000001;Name=SRAE_0000000001\n";
print EXPECTED "BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=SRAE_0000000001-mRNA-1;Parent=SRAE_0000000001;Name=SRAE_0000000001-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=SRAE_0000000001-mRNA-1:exon:1;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 36179 36341 . + . ID=SRAE_0000000001-mRNA-1:exon:2;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 37052 37204 . + . ID=SRAE_0000000001-mRNA-1:exon:3;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 38326 38535 . + . ID=SRAE_0000000001-mRNA-1:exon:4;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 38781 38871 . + . ID=SRAE_0000000001-mRNA-1:exon:5;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 39087 39253 . + . ID=SRAE_0000000001-mRNA-1:exon:6;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=SRAE_0000000001-mRNA-1:cds;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 36179 36341 . + 1 ID=SRAE_0000000001-mRNA-1:cds;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 37052 37204 . + 0 ID=SRAE_0000000001-mRNA-1:cds;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 38326 38535 . + 0 ID=SRAE_0000000001-mRNA-1:cds;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 38781 38871 . + 0 ID=SRAE_0000000001-mRNA-1:cds;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 39087 39253 . + 2 ID=SRAE_0000000001-mRNA-1:cds;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=SRAE_0000000001-mRNA-2;Parent=SRAE_0000000001;Name=SRAE_0000000001-mRNA-2;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=SRAE_0000000001-mRNA-2:exon:1;Parent=SRAE_0000000001-mRNA-2\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 36179 36341 . + . ID=SRAE_0000000001-mRNA-2:exon:2;Parent=SRAE_0000000001-mRNA-2\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 37052 37204 . + . ID=SRAE_0000000001-mRNA-2:exon:3;Parent=SRAE_0000000001-mRNA-2\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 38326 38535 . + . ID=SRAE_0000000001-mRNA-2:exon:4;Parent=SRAE_0000000001-mRNA-2\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 38781 38871 . + . ID=SRAE_0000000001-mRNA-2:exon:5;Parent=SRAE_0000000001-mRNA-2\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 39087 39253 . + . ID=SRAE_0000000001-mRNA-2:exon:6;Parent=SRAE_0000000001-mRNA-2\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=SRAE_0000000001-mRNA-2:cds;Parent=SRAE_0000000001-mRNA-2\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 36179 36341 . + 1 ID=SRAE_0000000001-mRNA-2:cds;Parent=SRAE_0000000001-mRNA-2\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 37052 37204 . + 0 ID=SRAE_0000000001-mRNA-2:cds;Parent=SRAE_0000000001-mRNA-2\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 38326 38535 . + 0 ID=SRAE_0000000001-mRNA-2:cds;Parent=SRAE_0000000001-mRNA-2\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 38781 38871 . + 0 ID=SRAE_0000000001-mRNA-2:cds;Parent=SRAE_0000000001-mRNA-2\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 39087 39253 . + 2 ID=SRAE_0000000001-mRNA-2:cds;Parent=SRAE_0000000001-mRNA-2\n";
print EXPECTED "BPAG.contig.00154.117101 maker gene 103190 104650 . + . ID=SRAE_0000000101;Name=SRAE_0000000101\n";
print EXPECTED "BPAG.contig.00154.117101 maker mRNA 103190 104650 . + . ID=SRAE_0000000101-mRNA-1;Parent=SRAE_0000000101;Name=SRAE_0000000101-mRNA-1;_AED=0.06;_eAED=0.06;_QI=0|0|0|1|1|1|3|0|276\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 103190 104650 . + 0 ID=SRAE_0000000101-mRNA-1:exon:1;Parent=SRAE_0000000101-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 103190 104650 . + . ID=SRAE_0000000101-mRNA-1:cds;Parent=SRAE_0000000101-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 blastx protein_match 1346 1510 223 + . ID=BPAG.contig.00154.117101:hit:88:2.10.0.0;Name=tr|J9EFX0|J9EFX0_WUCBA\n";
print EXPECTED "BPAG.contig.00154.117101 blastx match_part 1346 1510 223 + . ID=BPAG.contig.00154.117101:hsp:88:2.10.0.0;Parent=BPAG.contig.00154.117101:hit:88:2.10.0.0;Target=tr|J9EFX0|J9EFX0_WUCBA 1 56;Gap=M24 I1 M31\n";
print EXPECTED "##FASTA\n";
print EXPECTED ">BPAG.contig.10610.1000\n";
print EXPECTED "CCAAACGATGATTGAAAATTTATTCATTTCTTTGCTTTTAAAAGGTAATAGTTTAAAAGCAACATTCTCTTTCTTATATTTTCAAAGTTTAGAAGAAAGAAAATTTTAATAATTTTCATTAAAATAAAATTAGGTAAGATATTAGAAAGGTTCAGTTGAAGTATTCCAAAATTATTGCTATCCATTTATTATTTATTTAAAAAAAAAAAAAAAGATTGTTAATTACTTTCATTTCTTAAATTTGATTAATTATATCAAAGATTATGAAGAAAAGAAAAATATGCTTTTTGTATTTTTTAAATTTCTGTTTTCAATTATTTTACAAAAATTTTTGTTTATTTTATTAAATCATTTCTCTTTTTTTTTTTTTGCCATTTGATTTTGCCTATTATTTATACAAATAATCATAATTCACTTTAAATATTAATGGATGAATATTTAAAATTATTATTCCATTTGAATTTGAATGTTAGAAAAAAAAATTCTATATATGAAATGATGATTTCATATTATTAATTAATTTTTTTTTAATTAATTAATACTGAAATTATATTAGACGCAAATGCATTTGAAATATTTACAGGCATGAAGATAGGAATATATTAGGTGGCGTACAAATAATTGACGTTACGTAATACACGTAACTTGTTAAGAACGTCCATTCTGCAATGTTTCTATGCAAAAATATACCAAACGATAAACCGTAAATGTTGTGATAATAGCATAAAATTTGCAATATTAAATTTGGGATAAAAAATACGAACAATATTTAGAAGAAAAATATTTACATAAAAATTAAAAATCAAATTTGAACTTTTGCAAATATTTATTAGACAACAATTTTAAAATTACAACTTAACATAGCAAACATTGATTAAAAGTAAAAGCTCAAAGTAAACAAAAAAAAAAGAATTCATTTCTTCAGTTTAATTTTAAGCCCTGTAAATAGGAAAACAAAATTCTAAAATTATTGCAAATTTTCTAGAAATTTTTTAAAAAA\n";
close(EXPECTED);
$output_gff = $outputdir."/".$output_gff;
$differences = "";
open(TEMP,"diff $output_gff $expected_output_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_run_main_program: failed test2 (output_gff $output_gff expected_output_gff $expected_output_gff)\n"; exit;}
system "rm -f $output_gff";
system "rm -f $input_gff";
system "rm -f $expected_output_gff";
# EXAMPLE WITH TWO OVERLAPPING GENES WITH THE SAME NAME, ON OPPOSITE STRANDS:
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_run_main_program: cannot open $input_gff\n";
print INPUT_GFF "BPAG.contig.08311.1317.reaprBIN . contig 1 1317 . . . ID=BPAG.contig.08311.1317.reaprBIN;Name=BPAG.contig.08311.1317.reaprBIN\n";
print INPUT_GFF "BPAG.contig.08311.1317.reaprBIN maker gene 835 1248 . + . ID=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0;Name=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0\n";
print INPUT_GFF "BPAG.contig.08311.1317.reaprBIN maker mRNA 835 1248 . + . ID=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0-mRNA-1;Parent=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0;Name=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0-mRNA-1;_AED=0.03;_eAED=0.03;_QI=0|0|0|0.5|0|0|2|0|77\n";
print INPUT_GFF "BPAG.contig.08311.1317.reaprBIN maker exon 835 837 . + . ID=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0-mRNA-1:exon:9;Parent=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.08311.1317.reaprBIN maker exon 1018 1248 . + . ID=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0-mRNA-1:exon:10;Parent=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.08311.1317.reaprBIN maker CDS 835 837 . + 0 ID=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0-mRNA-1:cds;Parent=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.08311.1317.reaprBIN maker CDS 1018 1248 . + 0 ID=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0-mRNA-1:cds;Parent=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.08311.1317.reaprBIN maker gene 1035 1265 . - . ID=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0;Name=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0\n";
print INPUT_GFF "BPAG.contig.08311.1317.reaprBIN maker mRNA 1035 1265 . - . ID=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0-mRNA-1;Parent=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0;Name=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0-mRNA-1;_AED=1.00;_eAED=1.00;_QI=0|-1|0|0|-1|1|1|0|76\n";
print INPUT_GFF "BPAG.contig.08311.1317.reaprBIN maker exon 1035 1265 . - . ID=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0-mRNA-1:exon:11;Parent=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.08311.1317.reaprBIN maker CDS 1035 1265 . - 0 ID=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0-mRNA-1:cds;Parent=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.08311.1317.reaprBIN augustus_masked match 1035 1265 0.37 - . ID=BPAG.contig.08311.1317.reaprBIN:hit:56:3.5.0.0;Name=augustus_masked-BPAG.contig.08311.1317.reaprBIN-abinit-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.08311.1317.reaprBIN augustus_masked match_part 1035 1265 0.37 - . ID=BPAG.contig.08311.1317.reaprBIN:hsp:116:3.5.0.0;Parent=BPAG.contig.08311.1317.reaprBIN:hit:56:3.5.0.0;Target=augustus_masked-BPAG.contig.08311.1317.reaprBIN-abinit-gene-0.0-mRNA-1 1 231 +;Gap=M231\n";
print INPUT_GFF "BPAG.contig.08311.1317.reaprBIN model_gff:maker match 835 1248 . + . ID=BPAG.contig.08311.1317.reaprBIN:hit:57:3.5.0.0;Name=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.08311.1317.reaprBIN model_gff:maker match_part 835 837 . + . ID=BPAG.contig.08311.1317.reaprBIN:hsp:117:3.5.0.0;Parent=BPAG.contig.08311.1317.reaprBIN:hit:57:3.5.0.0;Target=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0-mRNA-1 1 3 +;Gap=M3\n";
print INPUT_GFF "BPAG.contig.08311.1317.reaprBIN model_gff:maker match_part 1018 1248 . + . ID=BPAG.contig.08311.1317.reaprBIN:hsp:118:3.5.0.0;Parent=BPAG.contig.08311.1317.reaprBIN:hit:57:3.5.0.0;Target=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0-mRNA-1 4 234 +;Gap=M231\n";
print INPUT_GFF "BPAG.contig.08311.1317.reaprBIN repeatmasker match 104 144 17 + . ID=BPAG.contig.08311.1317.reaprBIN:hit:37:1.3.0.0;Name=species:%2528ACCT%2529n|genus:Simple_repeat;Target=species:%2528ACCT%2529n|genus:Simple_repeat 1 41 +\n";
print INPUT_GFF "BPAG.contig.08311.1317.reaprBIN repeatmasker match_part 104 144 17 + . ID=BPAG.contig.08311.1317.reaprBIN:hsp:97:1.3.0.0;Parent=BPAG.contig.08311.1317.reaprBIN:hit:37:1.3.0.0;Target=species:%2528ACCT%2529n|genus:Simple_repeat 1 41 +\n";
print INPUT_GFF "BPAG.contig.08311.1317.reaprBIN blastx protein_match 1015 1239 287 + . ID=BPAG.contig.08311.1317.reaprBIN:hit:38:2.10.0.0;Name=BUX_s00036.29.1\n";
print INPUT_GFF "BPAG.contig.08311.1317.reaprBIN blastx match_part 1015 1239 287 + . ID=BPAG.contig.08311.1317.reaprBIN:hsp:98:2.10.0.0;Parent=BPAG.contig.08311.1317.reaprBIN:hit:38:2.10.0.0;Target=BUX_s00036.29.1 1 74;Gap=M38 D1 M36\n";
print INPUT_GFF "BPAG.contig.08311.1317.reaprBIN blastx protein_match 1018 1242 272 + . ID=BPAG.contig.08311.1317.reaprBIN:hit:39:2.10.0.0;Name=UniProtKB:H3I027\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">BPAG.contig.08311.1317.reaprBIN\n";
close(INPUT_GFF);
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
@temp = split(/\//,$output_gff);
$output_gff = $temp[$#temp];
&run_main_program($outputdir,$input_gff,$output_gff,'SRAE',1);
($expected_output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_output_gff") || die "ERROR: test_run_main_program: failed test3\n";
print EXPECTED "BPAG.contig.08311.1317.reaprBIN . contig 1 1317 . . . ID=BPAG.contig.08311.1317.reaprBIN;Name=BPAG.contig.08311.1317.reaprBIN\n";
print EXPECTED "BPAG.contig.08311.1317.reaprBIN maker gene 835 1248 . + . ID=SRAE_0000000001;Name=SRAE_0000000001\n";
print EXPECTED "BPAG.contig.08311.1317.reaprBIN maker mRNA 835 1248 . + . ID=SRAE_0000000001-mRNA-1;Parent=SRAE_0000000001;Name=SRAE_0000000001-mRNA-1;_AED=0.03;_eAED=0.03;_QI=0|0|0|0.5|0|0|2|0|77\n";
print EXPECTED "BPAG.contig.08311.1317.reaprBIN maker exon 835 837 . + . ID=SRAE_0000000001-mRNA-1:exon:1;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.08311.1317.reaprBIN maker exon 1018 1248 . + . ID=SRAE_0000000001-mRNA-1:exon:2;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.08311.1317.reaprBIN maker CDS 835 837 . + 0 ID=SRAE_0000000001-mRNA-1:cds;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.08311.1317.reaprBIN maker CDS 1018 1248 . + 0 ID=SRAE_0000000001-mRNA-1:cds;Parent=SRAE_0000000001-mRNA-1\n";
print EXPECTED "BPAG.contig.08311.1317.reaprBIN maker gene 1035 1265 . - . ID=SRAE_0000000101;Name=SRAE_0000000101\n";
print EXPECTED "BPAG.contig.08311.1317.reaprBIN maker mRNA 1035 1265 . - . ID=SRAE_0000000101-mRNA-1;Parent=SRAE_0000000101;Name=SRAE_0000000101-mRNA-1;_AED=1.00;_eAED=1.00;_QI=0|-1|0|0|-1|1|1|0|76\n";
print EXPECTED "BPAG.contig.08311.1317.reaprBIN maker exon 1035 1265 . - . ID=SRAE_0000000101-mRNA-1:exon:1;Parent=SRAE_0000000101-mRNA-1\n";
print EXPECTED "BPAG.contig.08311.1317.reaprBIN maker CDS 1035 1265 . - 0 ID=SRAE_0000000101-mRNA-1:cds;Parent=SRAE_0000000101-mRNA-1\n";
print EXPECTED "BPAG.contig.08311.1317.reaprBIN augustus_masked match 1035 1265 0.37 - . ID=BPAG.contig.08311.1317.reaprBIN:hit:56:3.5.0.0;Name=augustus_masked-BPAG.contig.08311.1317.reaprBIN-abinit-gene-0.0-mRNA-1\n";
print EXPECTED "BPAG.contig.08311.1317.reaprBIN augustus_masked match_part 1035 1265 0.37 - . ID=BPAG.contig.08311.1317.reaprBIN:hsp:116:3.5.0.0;Parent=BPAG.contig.08311.1317.reaprBIN:hit:56:3.5.0.0;Target=augustus_masked-BPAG.contig.08311.1317.reaprBIN-abinit-gene-0.0-mRNA-1 1 231 +;Gap=M231\n";
print EXPECTED "BPAG.contig.08311.1317.reaprBIN model_gff:maker match 835 1248 . + . ID=BPAG.contig.08311.1317.reaprBIN:hit:57:3.5.0.0;Name=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0-mRNA-1\n";
print EXPECTED "BPAG.contig.08311.1317.reaprBIN model_gff:maker match_part 835 837 . + . ID=BPAG.contig.08311.1317.reaprBIN:hsp:117:3.5.0.0;Parent=BPAG.contig.08311.1317.reaprBIN:hit:57:3.5.0.0;Target=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0-mRNA-1 1 3 +;Gap=M3\n";
print EXPECTED "BPAG.contig.08311.1317.reaprBIN model_gff:maker match_part 1018 1248 . + . ID=BPAG.contig.08311.1317.reaprBIN:hsp:118:3.5.0.0;Parent=BPAG.contig.08311.1317.reaprBIN:hit:57:3.5.0.0;Target=augustus_masked-BPAG.contig.08311.1317.reaprBIN-processed-gene-0.0-mRNA-1 4 234 +;Gap=M231\n";
print EXPECTED "BPAG.contig.08311.1317.reaprBIN repeatmasker match 104 144 17 + . ID=BPAG.contig.08311.1317.reaprBIN:hit:37:1.3.0.0;Name=species:%2528ACCT%2529n|genus:Simple_repeat;Target=species:%2528ACCT%2529n|genus:Simple_repeat 1 41 +\n";
print EXPECTED "BPAG.contig.08311.1317.reaprBIN repeatmasker match_part 104 144 17 + . ID=BPAG.contig.08311.1317.reaprBIN:hsp:97:1.3.0.0;Parent=BPAG.contig.08311.1317.reaprBIN:hit:37:1.3.0.0;Target=species:%2528ACCT%2529n|genus:Simple_repeat 1 41 +\n";
print EXPECTED "BPAG.contig.08311.1317.reaprBIN blastx protein_match 1015 1239 287 + . ID=BPAG.contig.08311.1317.reaprBIN:hit:38:2.10.0.0;Name=BUX_s00036.29.1\n";
print EXPECTED "BPAG.contig.08311.1317.reaprBIN blastx match_part 1015 1239 287 + . ID=BPAG.contig.08311.1317.reaprBIN:hsp:98:2.10.0.0;Parent=BPAG.contig.08311.1317.reaprBIN:hit:38:2.10.0.0;Target=BUX_s00036.29.1 1 74;Gap=M38 D1 M36\n";
print EXPECTED "BPAG.contig.08311.1317.reaprBIN blastx protein_match 1018 1242 272 + . ID=BPAG.contig.08311.1317.reaprBIN:hit:39:2.10.0.0;Name=UniProtKB:H3I027\n";
print EXPECTED "##FASTA\n";
print EXPECTED ">BPAG.contig.08311.1317.reaprBIN\n";
close(EXPECTED);
$output_gff = $outputdir."/".$output_gff;
$differences = "";
open(TEMP,"diff $output_gff $expected_output_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_run_main_program: failed test3 (output_gff $output_gff expected_output_gff $expected_output_gff)\n"; exit;}
system "rm -f $output_gff";
system "rm -f $input_gff";
system "rm -f $expected_output_gff";
# EXAMPLE WITH OVERLAPPING GENE PREDICTIONS ON THE SAME STRAND, WHERE ONE OF THE GENES HAS + AND - EXONS:
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_run_main_program: cannot open $input_gff\n";
print INPUT_GFF "##gff-version 3\n";
print INPUT_GFF "AG00002 maker gene 683300 686711 . - . ID=maker-AG00002-pred_gff_protein_coding-gene-7.12;Name=maker-AG00002-pred_gff_protein_coding-gene-7.12\n";
print INPUT_GFF "AG00002 maker mRNA 683300 686711 . + . ID=F07C4.9;Parent=maker-AG00002-pred_gff_protein_coding-gene-7.12;Name=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1;_AED=1.00;_eAED=1.00;_QI=84|0|0|0|1|1|6|0|282\n";
print INPUT_GFF "AG00002 model_gff:maker match 683300 686711 . + . ID=AG00002:hit:2251:4.5.0.7;Name=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1\n";
print INPUT_GFF "AG00002 model_gff:maker match_part 683300 683550 . - . ID=AG00002:hsp:8272:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 1 251 +;Gap=M251\n";
print INPUT_GFF "AG00002 model_gff:maker match_part 683670 683878 . - . ID=AG00002:hsp:8273:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 252 460 +;Gap=M209\n";
print INPUT_GFF "AG00002 model_gff:maker match_part 684544 684548 . - . ID=AG00002:hsp:8274:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 461 465 +;Gap=M5\n";
print INPUT_GFF "AG00002 model_gff:maker match_part 685452 685459 . + . ID=AG00002:hsp:8275:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 466 473 +;Gap=M8\n";
print INPUT_GFF "AG00002 model_gff:maker match_part 686133 686341 . + . ID=AG00002:hsp:8276:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 474 682 +;Gap=M209\n";
print INPUT_GFF "AG00002 model_gff:maker match_part 686461 686711 . + . ID=AG00002:hsp:8277:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 683 933 +;Gap=M251\n";
print INPUT_GFF "AG00002 maker exon 684544 684548 . - . ID=F07C4.9:exon:255;Parent=F07C4.9\n";
print INPUT_GFF "AG00002 maker exon 683670 683878 . - . ID=F07C4.9:exon:254;Parent=F07C4.9\n";
print INPUT_GFF "AG00002 maker exon 683300 683550 . - . ID=F07C4.9:exon:253;Parent=F07C4.9\n";
print INPUT_GFF "AG00002 maker exon 685452 685459 . + . ID=F07C4.9:exon:256;Parent=F07C4.9\n";
print INPUT_GFF "AG00002 maker exon 686133 686341 . + . ID=F07C4.9:exon:257;Parent=F07C4.9\n";
print INPUT_GFF "AG00002 maker exon 686461 686711 . + . ID=F07C4.9:exon:258;Parent=F07C4.9\n";
print INPUT_GFF "AG00002 maker five_prime_UTR 683550 683633 . + . ID=F07C4.9:five_prime_utr;Parent=F07C4.9\n";
print INPUT_GFF "AG00002 maker CDS 683300 683634 . + 0 ID=F07C4.9:cds;Parent=F07C4.9\n";
print INPUT_GFF "AG00002 maker CDS 683670 683878 . + 1 ID=F07C4.9:cds;Parent=F07C4.9\n";
print INPUT_GFF "AG00002 maker CDS 684544 684548 . + 2 ID=F07C4.9:cds;Parent=F07C4.9\n";
print INPUT_GFF "AG00002 maker CDS 685452 685459 . + 0 ID=F07C4.9:cds;Parent=F07C4.9\n";
print INPUT_GFF "AG00002 maker CDS 686133 686341 . + 1 ID=F07C4.9:cds;Parent=F07C4.9\n";
print INPUT_GFF "AG00002 maker CDS 686461 686711 . + 2 ID=F07C4.9:cds;Parent=F07C4.9\n";
print INPUT_GFF "AG00002 maker gene 683300 684111 . - . ID=augustus_masked-AG00002-processed-gene-6.5;Name=augustus_masked-AG00002-processed-gene-6.5\n";
print INPUT_GFF "AG00002 maker mRNA 683300 684111 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1;Parent=augustus_masked-AG00002-processed-gene-6.5;Name=augustus_masked-AG00002-processed-gene-6.5-mRNA-1;_AED=0.23;_eAED=0.29;_QI=54|0|0|0.66|1|1|3|0|162\n";
print INPUT_GFF "AG00002 maker exon 684077 684111 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:exon:210;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1\n";
print INPUT_GFF "AG00002 maker exon 683646 683902 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:exon:209;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1\n";
print INPUT_GFF "AG00002 maker exon 683300 683550 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:exon:208;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1\n";
print INPUT_GFF "AG00002 maker five_prime_UTR 684077 684111 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:five_prime_utr;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1\n";
print INPUT_GFF "AG00002 maker five_prime_UTR 683884 683902 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:five_prime_utr;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1\n";
print INPUT_GFF "AG00002 maker CDS 683646 683883 . - 0 ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:cds;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1\n";
print INPUT_GFF "AG00002 maker CDS 683300 683550 . - 2 ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:cds;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1\n";
close(INPUT_GFF);
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
@temp = split(/\//,$output_gff);
$output_gff = $temp[$#temp];
&run_main_program($outputdir,$input_gff,$output_gff,'ASG',1);
($expected_output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# WE DON'T KNOW THE GENE NAME FOR THE TRANSCRIPTS FOR GENE maker-AG00002-pred_gff_protein_coding-gene-7.12, BECAUSE THE mRNA LINE IS ON ANOTHER STRAND. AS A RESULT WE DON'T GET THE EXONS/UTR/CDS/mRNA/gene IN THE OUTPUT GFF,
# BECAUSE WE EXCLUDE GENES WITHOUT gene+CDS+exon+mRNA FROM THE OUTPUT GFF.
open(EXPECTED,">$expected_output_gff") || die "ERROR: test_run_main_program: failed test4\n";
print EXPECTED "##gff-version 3\n";
print EXPECTED "AG00002 maker gene 683300 684111 . - . ID=ASG_0000000001;Name=ASG_0000000001\n";
print EXPECTED "AG00002 maker mRNA 683300 684111 . - . ID=ASG_0000000001-mRNA-1;Parent=ASG_0000000001;Name=ASG_0000000001-mRNA-1;_AED=0.23;_eAED=0.29;_QI=54|0|0|0.66|1|1|3|0|162\n";
print EXPECTED "AG00002 maker exon 684077 684111 . - . ID=ASG_0000000001-mRNA-1:exon:1;Parent=ASG_0000000001-mRNA-1\n";
print EXPECTED "AG00002 maker exon 683646 683902 . - . ID=ASG_0000000001-mRNA-1:exon:2;Parent=ASG_0000000001-mRNA-1\n";
print EXPECTED "AG00002 maker exon 683300 683550 . - . ID=ASG_0000000001-mRNA-1:exon:3;Parent=ASG_0000000001-mRNA-1\n";
print EXPECTED "AG00002 maker five_prime_UTR 684077 684111 . - . ID=ASG_0000000001-mRNA-1:five_prime_utr;Parent=ASG_0000000001-mRNA-1\n";
print EXPECTED "AG00002 maker five_prime_UTR 683884 683902 . - . ID=ASG_0000000001-mRNA-1:five_prime_utr;Parent=ASG_0000000001-mRNA-1\n";
print EXPECTED "AG00002 maker CDS 683646 683883 . - 0 ID=ASG_0000000001-mRNA-1:cds;Parent=ASG_0000000001-mRNA-1\n";
print EXPECTED "AG00002 maker CDS 683300 683550 . - 2 ID=ASG_0000000001-mRNA-1:cds;Parent=ASG_0000000001-mRNA-1\n";
print EXPECTED "AG00002 model_gff:maker match 683300 686711 . + . ID=AG00002:hit:2251:4.5.0.7;Name=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1\n";
print EXPECTED "AG00002 model_gff:maker match_part 683300 683550 . - . ID=AG00002:hsp:8272:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 1 251 +;Gap=M251\n";
print EXPECTED "AG00002 model_gff:maker match_part 683670 683878 . - . ID=AG00002:hsp:8273:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 252 460 +;Gap=M209\n";
print EXPECTED "AG00002 model_gff:maker match_part 684544 684548 . - . ID=AG00002:hsp:8274:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 461 465 +;Gap=M5\n";
print EXPECTED "AG00002 model_gff:maker match_part 685452 685459 . + . ID=AG00002:hsp:8275:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 466 473 +;Gap=M8\n";
print EXPECTED "AG00002 model_gff:maker match_part 686133 686341 . + . ID=AG00002:hsp:8276:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 474 682 +;Gap=M209\n";
print EXPECTED "AG00002 model_gff:maker match_part 686461 686711 . + . ID=AG00002:hsp:8277:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 683 933 +;Gap=M251\n";
close(EXPECTED);
$output_gff = $outputdir."/".$output_gff;
$differences = "";
open(TEMP,"diff $output_gff $expected_output_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_run_main_program: failed test4 (output_gff $output_gff expected_output_gff $expected_output_gff)\n"; exit;}
system "rm -f $output_gff";
system "rm -f $input_gff";
system "rm -f $expected_output_gff";
}
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
sub run_main_program
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN.
my $input_gff = $_[1]; # THE INPUT GFF FILE
my $output_gff = $_[2]; # THE OUTPUT GFF FILE
my $species = $_[3]; # SPECIES ABBREVIATION
my $testing = $_[4]; # SAYS WHETHER IT'S CALLED FROM A TESTING SUBROUTINE
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR
my $line; #
my @temp; #
my $end_of_gff = 0; # SAYS WHETHER WE HAVE REACHED THE END OF THE GFF
my $gene_count = 1; # THE GENE COUNT, USED TO MAKE THE NEW GENE NAME
my $new_gene_name; # NEW NAME FOR A GENE
my $new_lines; # NEW GFF FILE LINES, WITH NEW NAMES
my $gene_gff; # GFF FILE WITH JUST THE FEATURES PERTAINING TO GENES
my $gene; # NAME OF THE GENE
my $input_gff_linecnt; # NUMBER OF LINES IN $input_gff
my $output_gff_linecnt; # NUMBER OF LINES IN $output_gff
my $gene_gff2; # GFF FILE WITH JUST 'gene' FEATURES
my $FEATURES; # HASH TABLE OF OVERLAPPING FEATURES BETWEEN $gene_gff2, AND $gene_gff2
my $features; # OVERLAPPING FEATURES FOR GENE $gene
my $scaffold; # SCAFFOLD NAME
my $comment_lines = 0; # NUMBER OF COMMENT LINES TO IGNORE
my $new_lines_complete; # SAYS WHETHER THE NEW GFF LINES FOR A GENE HAVE 'gene','exon','CDS','mRNA' FEATURES
# MAKE A TEMPORARY GFF FILE WITH JUST THE FEATURES PERTAINING TO GENES ('exon', 'CDS', 'mRNA', etc.):
($gene_gff,$errorcode,$errormsg) = &make_gene_gff($input_gff);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# MAKE A TEMPORARY GFF FILE WITH JUST THE 'GENE' FEATURES:
($gene_gff2,$errorcode,$errormsg) = &make_gene_gff2($gene_gff);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# FIND FEATURES THAT OVERLAP BETWEEN EACH GENE IN $gene_gff2, AND ALL THE GENE FEATURES ('exon', 'CDS', etc.)
# IN $gene_gff:
($FEATURES,$errorcode,$errormsg) = &find_overlapping_features($gene_gff,$gene_gff2);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# SORT $gene_gff BY CHROMOSOME, THEN START POINT IN THE CHROMOSOME:
# (WE NEED TO DO THIS SO THAT GENES ARE NUMBERED 1,2,3,etc. IN ORDER ALONG A CHROMOSOME)
($gene_gff2,$errorcode,$errormsg) = &sort_gene_gff($outputdir,$gene_gff2);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# OPEN THE OUTPUT GFF FILE:
$output_gff = $outputdir."/".$output_gff;
open(OUTPUT_GFF,">$output_gff") || die "ERROR: run_main_program: cannot open output $output_gff\n";
# READ IN THE $input_gff FILE AND GET COMMENT LINES AND 'contig' LINES:
open(INPUT_GFF,"$input_gff") || die "ERROR: run_main_program: cannot open $input_gff\n";
while(<INPUT_GFF>)
{
$line = $_;
chomp $line;
if ($line eq '##FASTA') { $end_of_gff = 1;}
if (substr($line,0,1) ne '#' && $end_of_gff == 0)
{
@temp = split(/\t+/,$line);
if ($temp[2] eq 'contig')
{
print OUTPUT_GFF "$line\n";
}
}
else
{
if ($line ne '###')
{
if ($end_of_gff == 0) { print OUTPUT_GFF "$line\n";} # PRINT COMMENT LINES
}
else { $comment_lines++;} # KEEP A COUNT OF COMMENT LINES TO IGNORE
}
}
close(INPUT_GFF);
# READ IN THE GENES IN THE SORTED 'gene' GFF FILE:
open(SORTED_GENES_GFF,"$gene_gff2") || die "ERROR: run_main_program: cannot open input $gene_gff2\n";
while(<SORTED_GENES_GFF>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
$scaffold = $temp[0];
# FIND THE GENE NAME:
# BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110
$gene = $temp[8]; # eg. ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110
@temp = split(/ID=/,$gene);
$gene = $temp[1];
@temp = split(/\;/,$gene);
$gene = $temp[0];
# GET THE NEW NAME FOR THIS GENE:
($new_gene_name,$errorcode,$errormsg) = &get_new_genename($gene_count,$species);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
if ($testing == 0) { print STDERR "Getting new gene name for gene $gene_count in $species (scaffold $scaffold) ... $gene (old) = $new_gene_name (new)\n";}
# FIND THE OVERLAPPING GFF LINES FROM $gene_gff FOR THE GENE:
if (!($FEATURES->{$line})) { print STDERR "ERROR: do not know overlapping features for gene $gene (line $line)\n"; exit;}
$features = $FEATURES->{$line};
# RENAME THIS GENE AND ALL ITS EXONS etc.:
($new_lines,$errorcode,$errormsg) = &fix_gff_lines_for_gene($features,$new_gene_name,$gene);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# CHECK THAT $new_lines INCLUDES 'gene', 'mRNA', 'exon', AND 'CDS' FEATURES, ALL ON THE SAME STRAND:
($new_lines_complete,$errorcode,$errormsg) = &check_whether_new_lines_complete($new_lines);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
if ($new_lines_complete == 1)
{
print OUTPUT_GFF "$new_lines";
$gene_count = $gene_count + 100; # THE GENE COUNT, USED TO MAKE THE NEW GENE NAME
}
else
{
if ($testing == 0) { print STDERR "WARNING: discarding gene $gene because it doesn't have all types of features (gene,mRNA,CDS,exon) on the same strand\n";}
}
}
close(SORTED_GENES_GFF);
# NOW ADD BACK THE FEATURE TYPES SUCH AS match, protein_match etc.
$end_of_gff = 0;
open(INPUT_GFF,"$input_gff") || die "ERROR: run_main_program: cannot open input $input_gff\n";
while(<INPUT_GFF>)
{
$line = $_;
chomp $line;
if ($line eq '##FASTA') { $end_of_gff = 1;}
if (substr($line,0,1) ne '#' && $end_of_gff == 0)
{
@temp = split(/\t+/,$line);
if ($temp[2] eq 'protein_match' || $temp[2] eq 'match_part' || $temp[2] eq 'match' || $temp[2] eq 'expressed_sequence_match')
{
print OUTPUT_GFF "$line\n";
}
else
{
if ($temp[2] ne 'gene' && $temp[2] ne 'contig' && $temp[2] ne 'exon' && $temp[2] ne 'CDS' && $temp[2] ne 'five_prime_UTR' &&
$temp[2] ne 'three_prime_UTR' && $temp[2] ne 'mRNA')
{
print STDERR "ERROR: temp[2] $temp[2] in line $line of $input_gff\n";
exit;
}
}
}
}
close(INPUT_GFF);
# NOW ADD BACK THE FASTA SEQUENCE TO THE END OF THE GFF:
$end_of_gff = 0;
open(INPUT_GFF,"$input_gff") || die "ERROR: run_main_program: cannot open input $input_gff\n";
while(<INPUT_GFF>)
{
$line = $_;
chomp $line;
if ($line eq '##FASTA') { $end_of_gff = 1;}
if ($end_of_gff == 1)
{
print OUTPUT_GFF "$line\n";
}
}
close(OUTPUT_GFF);
# NOTE: I USED TO CHECK THAT THE INPUT AND OUTPUT GFF HAS THE SAME NUMBER OF LINES, ie. $output_gff_linecnt != $input_gff_linecnt - $comment_lines
# HOWEVER, THIS ASSUMPTION DOESN'T HOLD AS GENES THAT HAVE A MIX OF + AND - STRAND FEATURES ARE NOW DISCARDED.
# DELETE TEMPORARY FILES:
system "rm -f $gene_gff";
system "rm -f $gene_gff2";
}
#------------------------------------------------------------------#
# TEST &check_whether_new_lines_complete
sub test_check_whether_new_lines_complete
{
my $new_lines; # NEW GFF LINES FOR GENE
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
my $new_lines_complete; # SAYS WHETHER $new_lines INCLUDES gene,mRNA,CDS+exon FEATURES
$new_lines = "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=SRAE_0000000001;Name=SRAE_0000000001\n";
$new_lines = $new_lines."BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=SRAE_0000000001-mRNA-1;Parent=SRAE_0000000001;Name=SRAE_0000000001-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
$new_lines = $new_lines."BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=SRAE_0000000001-mRNA-1:exon:1;Parent=SRAE_0000000001-mRNA-1\n";
$new_lines = $new_lines."BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=SRAE_0000000001-mRNA-1:cds;Parent=SRAE_0000000001-mRNA-1";
($new_lines_complete,$errorcode,$errormsg) = &check_whether_new_lines_complete($new_lines);
if ($errorcode != 0) { print STDERR "ERROR: test_check_whether_new_lines_complete: failed test1\n"; exit;}
if ($new_lines_complete != 1) { print STDERR "ERROR: test_whether_new_lines_complete: failed test1b (new_lines_complete $new_lines_complete)\n"; exit;}
$new_lines = "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=SRAE_0000000001;Name=SRAE_0000000001";
($new_lines_complete,$errorcode,$errormsg) = &check_whether_new_lines_complete($new_lines);
if ($errorcode != 0) { print STDERR "ERROR: test_check_whether_new_lines_complete: failed test2\n"; exit;}
if ($new_lines_complete != 0) { print STDERR "ERROR: test_whether_new_lines_complete: failed test2b\n"; exit;}
# AN EXAMPLE WHERE STRANDS DISAGREE:
$new_lines = "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=SRAE_0000000001;Name=SRAE_0000000001\n";
$new_lines = $new_lines."BPAG.contig.00154.117101 maker mRNA 35582 39253 . - . ID=SRAE_0000000001-mRNA-1;Parent=SRAE_0000000001;Name=SRAE_0000000001-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
$new_lines = $new_lines."BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=SRAE_0000000001-mRNA-1:exon:1;Parent=SRAE_0000000001-mRNA-1\n";
$new_lines = $new_lines."BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=SRAE_0000000001-mRNA-1:cds;Parent=SRAE_0000000001-mRNA-1";
($new_lines_complete,$errorcode,$errormsg) = &check_whether_new_lines_complete($new_lines);
if ($errorcode != 0) { print STDERR "ERROR: test_check_whether_new_lines_complete: failed test3\n"; exit;}
if ($new_lines_complete != 0) { print STDERR "ERROR: test_whether_new_lines_complete: failed test3b (new_lines_complete $new_lines_complete)\n"; exit;}
# AN EXAMPLE WHERE STRANDS DISAGREE:
$new_lines = "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=SRAE_0000000001;Name=SRAE_0000000001\n";
$new_lines = $new_lines."BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=SRAE_0000000001-mRNA-1;Parent=SRAE_0000000001;Name=SRAE_0000000001-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
$new_lines = $new_lines."BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=SRAE_0000000001-mRNA-1:exon:1;Parent=SRAE_0000000001-mRNA-1\n";
$new_lines = $new_lines."BPAG.contig.00154.117101 maker CDS 35582 35727 . - 0 ID=SRAE_0000000001-mRNA-1:cds;Parent=SRAE_0000000001-mRNA-1";
($new_lines_complete,$errorcode,$errormsg) = &check_whether_new_lines_complete($new_lines);
if ($errorcode != 0) { print STDERR "ERROR: test_check_whether_new_lines_complete: failed test4\n"; exit;}
if ($new_lines_complete != 0) { print STDERR "ERROR: test_whether_new_lines_complete: failed test4b (new_lines_complete $new_lines_complete)\n"; exit;}
}
#------------------------------------------------------------------#
# CHECK WHETHER THE NEW GFF LINES FOR A GENE CONTAIN 'gene', 'mRNA', 'exon', 'CDS' FEATURES, ALL ON THE SAME STRAND:
sub check_whether_new_lines_complete
{
my $new_lines = $_[0]; # NEW GFF LINES FOR A GENE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $new_lines_complete = 0; # SAYS WHETHER WE HAVE SEEN 'gene','mRNA','exon','CDS' FEATURES
my @new_lines; # ARRAY OF NEW LINES
my $i; #
my $new_line; # GFF LINE
my $feature_type; # FEATURE TYPE
my @temp; #
my $seen_gene = 0; # SAYS WHETHER WE SAW A 'gene' FEATURE
my $seen_mRNA = 0; # SAYS WHETHER WE SAW A 'mRNA' FEATURE
my $seen_exon = 0; # SAYS WHETHER WE SAW AN 'exon' FEATURE
my $seen_CDS = 0; # SAYS WHETHER WE SAW A 'CDS' FEATURE
my $the_strand = "none";# STRAND OF THE GENE
my $strands_agree = 1; # SAYS WHETHER ALL THE FEATURES HAVE THE SAME STRAND
my $strand; # STRAND OF A FEATURE
@new_lines = split(/\n+/,$new_lines);
for ($i = 0; $i <= $#new_lines; $i++)
{
$new_line = $new_lines[$i];
@temp = split(/\t+/,$new_line);
$feature_type = $temp[2];
$strand = $temp[6];
if ($the_strand eq 'none') { $the_strand = $strand;}
if ($feature_type eq 'gene') { $seen_gene = 1;}
elsif ($feature_type eq 'mRNA') { $seen_mRNA = 1;}
elsif ($feature_type eq 'exon') { $seen_exon = 1;}
elsif ($feature_type eq 'CDS') { $seen_CDS = 1;}
if ($strand ne $the_strand) { $strands_agree = 0;}
}
if ($seen_gene == 1 && $seen_mRNA == 1 && $seen_exon == 1 && $seen_CDS == 1) { $new_lines_complete = 1;}
if ($strands_agree == 0) { $new_lines_complete = 0;}
return($new_lines_complete,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &sort_gene_gff
sub test_sort_gene_gff
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
my $input_gff; # INPUT GFF FILE NAME
my $expected_input_gff; # FILE WITH EXPECTED CONTENTS OF $input_gff
my $differences; # DIFFERENCES BETWEEN $input_gff AND $expected_input_gff
my $length_differences; # LENGTH OF $differences
my $line; #
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_sort_gene_gff: cannot open $input_gff\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker gene 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker mRNA 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1;_AED=0.06;_eAED=0.06;_QI=0|0|0|1|1|1|3|0|276\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:0;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 36179 36341 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 37052 37204 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:2;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 38326 38535 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:3;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 38781 38871 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:4;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 39087 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:5;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 36179 36341 . + 1 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 37052 37204 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 38326 38535 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 38781 38871 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 39087 39253 . + 2 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
close(INPUT_GFF);
($input_gff,$errorcode,$errormsg) = &sort_gene_gff($outputdir,$input_gff);
if ($errorcode != 0) { print STDERR "ERROR: test_sort_gene_gff: failed test1\n"; exit;}
($expected_input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_input_gff") || die "ERROR: test_sort_gene_gff: failed test1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:0;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110\n";
print EXPECTED "BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 36179 36341 . + 1 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 36179 36341 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 37052 37204 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 37052 37204 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:2;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 38326 38535 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 38326 38535 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:3;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 38781 38871 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 38781 38871 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:4;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 39087 39253 . + 2 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 39087 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:5;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker gene 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119\n";
print EXPECTED "BPAG.contig.00154.117101 maker mRNA 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1;_AED=0.06;_eAED=0.06;_QI=0|0|0|1|1|1|3|0|276\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $input_gff $expected_input_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_sort_gene_gff: failed test1 (input_gff $input_gff expected_input_gff $expected_input_gff)\n"; exit;}
system "rm -f $input_gff";
system "rm -f $expected_input_gff";
}
#------------------------------------------------------------------#
# SORT $gene_gff BY CHROMOSOME, THEN START POINT IN THE CHROMOSOME:
# (WE NEED TO DO THIS SO THAT GENES ARE NUMBERED 1,2,3,etc. IN ORDER ALONG A CHROMOSOME)
# NOTE THE GENE GFF WILL JUST HAVE exon,gene,mRNA,CDS,five_prime_UTR,three_prime_UTR FEATURES, AND WON'T HAVE
# COMMENT-LINES OR SEQUENCE.
sub sort_gene_gff
{
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES IN
my $input_gff = $_[1]; # GFF FILE
my $sorted_gff; # SORTED GFF FILE
my $cmd; # COMMAND TO RUN
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
($sorted_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
$cmd = "sort -k1,1 -k4,4n $input_gff > $sorted_gff";
system "$cmd";
system "rm -f $input_gff";
return($sorted_gff,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &count_lines
sub test_count_lines
{
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES IN
my $input_gff; # INPUT GFF FILE
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
my $input_gff_linecnt; # NUMBER OF LINES IN $input_gff
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_run_main_program: cannot open $input_gff\n";
print INPUT_GFF "##gff-version 3\n";
print INPUT_GFF "BPAG.contig.00154.117101 . contig 1 117101 . . . ID=BPAG.contig.00154.117101;Name=BPAG.contig.00154.117101\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:0;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 36179 36341 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 37052 37204 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:2;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 38326 38535 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:3;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 38781 38871 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:4;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 39087 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:5;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 36179 36341 . + 1 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 37052 37204 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 38326 38535 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 38781 38871 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 39087 39253 . + 2 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker gene 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker mRNA 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1;_AED=0.06;_eAED=0.06;_QI=0|0|0|1|1|1|3|0|276\n";
print INPUT_GFF "BPAG.contig.00154.117101 blastx protein_match 1346 1510 223 + . ID=BPAG.contig.00154.117101:hit:88:2.10.0.0;Name=tr|J9EFX0|J9EFX0_WUCBA\n";
print INPUT_GFF "BPAG.contig.00154.117101 blastx match_part 1346 1510 223 + . ID=BPAG.contig.00154.117101:hsp:88:2.10.0.0;Parent=BPAG.contig.00154.117101:hit:88:2.10.0.0;Target=tr|J9EFX0|J9EFX0_WUCBA 1 56;Gap=M24 I1 M31\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">BPAG.contig.10610.1000\n";
print INPUT_GFF "CCAAACGATGATTGAAAATTTATTCATTTCTTTGCTTTTAAAAGGTAATAGTTTAAAAGCAACATTCTCTTTCTTATATTTTCAAAGTTTAGAAGAAAGAAAATTTTAATAATTTTCATTAAAATAAAATTAGGTAAGATATTAGAAAGGTTCAGTTGAAGTATTCCAAAATTATTGCTATCCATTTATTATTTATTTAAAAAAAAAAAAAAAGATTGTTAATTACTTTCATTTCTTAAATTTGATTAATTATATCAAAGATTATGAAGAAAAGAAAAATATGCTTTTTGTATTTTTTAAATTTCTGTTTTCAATTATTTTACAAAAATTTTTGTTTATTTTATTAAATCATTTCTCTTTTTTTTTTTTTGCCATTTGATTTTGCCTATTATTTATACAAATAATCATAATTCACTTTAAATATTAATGGATGAATATTTAAAATTATTATTCCATTTGAATTTGAATGTTAGAAAAAAAAATTCTATATATGAAATGATGATTTCATATTATTAATTAATTTTTTTTTAATTAATTAATACTGAAATTATATTAGACGCAAATGCATTTGAAATATTTACAGGCATGAAGATAGGAATATATTAGGTGGCGTACAAATAATTGACGTTACGTAATACACGTAACTTGTTAAGAACGTCCATTCTGCAATGTTTCTATGCAAAAATATACCAAACGATAAACCGTAAATGTTGTGATAATAGCATAAAATTTGCAATATTAAATTTGGGATAAAAAATACGAACAATATTTAGAAGAAAAATATTTACATAAAAATTAAAAATCAAATTTGAACTTTTGCAAATATTTATTAGACAACAATTTTAAAATTACAACTTAACATAGCAAACATTGATTAAAAGTAAAAGCTCAAAGTAAACAAAAAAAAAAGAATTCATTTCTTCAGTTTAATTTTAAGCCCTGTAAATAGGAAAACAAAATTCTAAAATTATTGCAAATTTTCTAGAAATTTTTTAAAAAA\n";
close(INPUT_GFF);
($input_gff_linecnt,$errorcode,$errormsg) = &count_lines($input_gff);
if ($errorcode != 0) { print STDERR "ERROR: test_count_lines: failed test1\n"; exit;}
if ($input_gff_linecnt != 23) { print STDERR "ERROR: test_count_lines: failed test1b\n"; exit;}
system "rm -f $input_gff";
}
#------------------------------------------------------------------#
# COUNT THE NUMBER OF LINES IN A FILE:
sub count_lines
{
my $input_gff = $_[0]; # INPUT GFF FILE
my $linecnt = 0; # NUMBER OF LINES IN THE GFF FILE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my @temp; #
my $line; #
open(TMP,"wc -l $input_gff |");
while(<TMP>)
{
$line = $_;
chomp $line;
@temp = split(/\s+/,$line);
if ($linecnt != 0)
{
$errormsg = "ERROR: count_lines: linecnt is already $linecnt (line $line) for $input_gff\n";
$errorcode = 56; # ERRORCODE=56 (SHOULDN'T HAPPEN SO CAN'T TEST FOR IT)
return($linecnt,$errorcode,$errormsg);
}
$linecnt = $temp[0];
}
close(TMP);
if ($linecnt == 0)
{
$errormsg = "ERROR: count_lines: linecnt is $linecnt for $input_gff\n";
$errorcode = 57; # ERRORCODE=57 (SHOULDN'T HAPPEN, CAN'T TEST FOR)
return($linecnt,$errorcode,$errormsg);
}
return($linecnt,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &make_gene_gff
sub test_make_gene_gff
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN
my $input_gff; # THE INPUT GFF FILE
my $gene_gff; # THE GFF FILE WITH GENE FEATURES
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
my $expected_gene_gff; # FILE WITH EXPECTED CONTENTS OF $gene_gff
my $differences; # DIFFERENCES BETWEEN $gene_gff AND $expected_gene_gff
my $length_differences; # LENGTH OF $differences
my $line; #
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_make_gene_gff: cannot open $input_gff\n";
print INPUT_GFF "##gff-version 3\n";
print INPUT_GFF "BPAG.contig.00154.117101 . contig 1 117101 . . . ID=BPAG.contig.00154.117101;Name=BPAG.contig.00154.117101\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:0;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 36179 36341 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 37052 37204 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:2;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 38326 38535 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:3;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 38781 38871 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:4;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 39087 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:5;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 36179 36341 . + 1 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 37052 37204 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 38326 38535 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 38781 38871 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 39087 39253 . + 2 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker gene 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker mRNA 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1;_AED=0.06;_eAED=0.06;_QI=0|0|0|1|1|1|3|0|276\n";
print INPUT_GFF "BPAG.contig.00154.117101 blastx protein_match 1346 1510 223 + . ID=BPAG.contig.00154.117101:hit:88:2.10.0.0;Name=tr|J9EFX0|J9EFX0_WUCBA\n";
print INPUT_GFF "BPAG.contig.00154.117101 blastx match_part 1346 1510 223 + . ID=BPAG.contig.00154.117101:hsp:88:2.10.0.0;Parent=BPAG.contig.00154.117101:hit:88:2.10.0.0;Target=tr|J9EFX0|J9EFX0_WUCBA 1 56;Gap=M24 I1 M31\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">BPAG.contig.10610.1000\n";
print INPUT_GFF "CCAAACGATGATTGAAAATTTATTCATTTCTTTGCTTTTAAAAGGTAATAGTTTAAAAGCAACATTCTCTTTCTTATATTTTCAAAGTTTAGAAGAAAGAAAATTTTAATAATTTTCATTAAAATAAAATTAGGTAAGATATTAGAAAGGTTCAGTTGAAGTATTCCAAAATTATTGCTATCCATTTATTATTTATTTAAAAAAAAAAAAAAAGATTGTTAATTACTTTCATTTCTTAAATTTGATTAATTATATCAAAGATTATGAAGAAAAGAAAAATATGCTTTTTGTATTTTTTAAATTTCTGTTTTCAATTATTTTACAAAAATTTTTGTTTATTTTATTAAATCATTTCTCTTTTTTTTTTTTTGCCATTTGATTTTGCCTATTATTTATACAAATAATCATAATTCACTTTAAATATTAATGGATGAATATTTAAAATTATTATTCCATTTGAATTTGAATGTTAGAAAAAAAAATTCTATATATGAAATGATGATTTCATATTATTAATTAATTTTTTTTTAATTAATTAATACTGAAATTATATTAGACGCAAATGCATTTGAAATATTTACAGGCATGAAGATAGGAATATATTAGGTGGCGTACAAATAATTGACGTTACGTAATACACGTAACTTGTTAAGAACGTCCATTCTGCAATGTTTCTATGCAAAAATATACCAAACGATAAACCGTAAATGTTGTGATAATAGCATAAAATTTGCAATATTAAATTTGGGATAAAAAATACGAACAATATTTAGAAGAAAAATATTTACATAAAAATTAAAAATCAAATTTGAACTTTTGCAAATATTTATTAGACAACAATTTTAAAATTACAACTTAACATAGCAAACATTGATTAAAAGTAAAAGCTCAAAGTAAACAAAAAAAAAAGAATTCATTTCTTCAGTTTAATTTTAAGCCCTGTAAATAGGAAAACAAAATTCTAAAATTATTGCAAATTTTCTAGAAATTTTTTAAAAAA\n";
close(INPUT_GFF);
($gene_gff,$errorcode,$errormsg) = &make_gene_gff($input_gff);
if ($errorcode != 0) { print STDERR "ERROR: test_make_gene_gff: failed test1\n"; exit;}
($expected_gene_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_gene_gff") || die "ERROR: test_make_gene_gff: failed test1\n";
print EXPECTED "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110\n";
print EXPECTED "BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:0;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 36179 36341 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 37052 37204 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:2;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 38326 38535 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:3;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 38781 38871 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:4;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker exon 39087 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:5;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 36179 36341 . + 1 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 37052 37204 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 38326 38535 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 38781 38871 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker CDS 39087 39253 . + 2 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print EXPECTED "BPAG.contig.00154.117101 maker gene 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119\n";
print EXPECTED "BPAG.contig.00154.117101 maker mRNA 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1;_AED=0.06;_eAED=0.06;_QI=0|0|0|1|1|1|3|0|276\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $gene_gff $expected_gene_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_make_gene_gff: failed test1 (gene_gff $gene_gff expected_gene_gff $expected_gene_gff)\n"; exit;}
system "rm -f $gene_gff";
system "rm -f $input_gff";
system "rm -f $expected_gene_gff";
# TEST ERRORCODE=58 (UNEXPECETED FEATURE TYPE):
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_make_gene_gff: cannot open $input_gff\n";
print INPUT_GFF "##gff-version 3\n";
print INPUT_GFF "BPAG.contig.00154.117101 . contig 1 117101 . . . ID=BPAG.contig.00154.117101;Name=BPAG.contig.00154.117101\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:0;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker myexon 36179 36341 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 37052 37204 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:2;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 38326 38535 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:3;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 38781 38871 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:4;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 39087 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:5;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 36179 36341 . + 1 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 37052 37204 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 38326 38535 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 38781 38871 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 39087 39253 . + 2 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker gene 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker mRNA 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1;_AED=0.06;_eAED=0.06;_QI=0|0|0|1|1|1|3|0|276\n";
print INPUT_GFF "BPAG.contig.00154.117101 blastx protein_match 1346 1510 223 + . ID=BPAG.contig.00154.117101:hit:88:2.10.0.0;Name=tr|J9EFX0|J9EFX0_WUCBA\n";
print INPUT_GFF "BPAG.contig.00154.117101 blastx match_part 1346 1510 223 + . ID=BPAG.contig.00154.117101:hsp:88:2.10.0.0;Parent=BPAG.contig.00154.117101:hit:88:2.10.0.0;Target=tr|J9EFX0|J9EFX0_WUCBA 1 56;Gap=M24 I1 M31\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">BPAG.contig.10610.1000\n";
print INPUT_GFF "CCAAACGATGATTGAAAATTTATTCATTTCTTTGCTTTTAAAAGGTAATAGTTTAAAAGCAACATTCTCTTTCTTATATTTTCAAAGTTTAGAAGAAAGAAAATTTTAATAATTTTCATTAAAATAAAATTAGGTAAGATATTAGAAAGGTTCAGTTGAAGTATTCCAAAATTATTGCTATCCATTTATTATTTATTTAAAAAAAAAAAAAAAGATTGTTAATTACTTTCATTTCTTAAATTTGATTAATTATATCAAAGATTATGAAGAAAAGAAAAATATGCTTTTTGTATTTTTTAAATTTCTGTTTTCAATTATTTTACAAAAATTTTTGTTTATTTTATTAAATCATTTCTCTTTTTTTTTTTTTGCCATTTGATTTTGCCTATTATTTATACAAATAATCATAATTCACTTTAAATATTAATGGATGAATATTTAAAATTATTATTCCATTTGAATTTGAATGTTAGAAAAAAAAATTCTATATATGAAATGATGATTTCATATTATTAATTAATTTTTTTTTAATTAATTAATACTGAAATTATATTAGACGCAAATGCATTTGAAATATTTACAGGCATGAAGATAGGAATATATTAGGTGGCGTACAAATAATTGACGTTACGTAATACACGTAACTTGTTAAGAACGTCCATTCTGCAATGTTTCTATGCAAAAATATACCAAACGATAAACCGTAAATGTTGTGATAATAGCATAAAATTTGCAATATTAAATTTGGGATAAAAAATACGAACAATATTTAGAAGAAAAATATTTACATAAAAATTAAAAATCAAATTTGAACTTTTGCAAATATTTATTAGACAACAATTTTAAAATTACAACTTAACATAGCAAACATTGATTAAAAGTAAAAGCTCAAAGTAAACAAAAAAAAAAGAATTCATTTCTTCAGTTTAATTTTAAGCCCTGTAAATAGGAAAACAAAATTCTAAAATTATTGCAAATTTTCTAGAAATTTTTTAAAAAA\n";
close(INPUT_GFF);
($gene_gff,$errorcode,$errormsg) = &make_gene_gff($input_gff);
if ($errorcode != 58) { print STDERR "ERROR: test_make_gene_gff: failed test2\n"; exit;}
system "rm -f $gene_gff";
system "rm -f $input_gff";
}
#------------------------------------------------------------------#
# MAKE A TEMPORARY GFF FILE WITH JUST THE FEATURES PERTAINING TO GENES:
sub make_gene_gff
{
my $input_gff = $_[0]; # INPUT GFF FILE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $gene_gff; # GFF FILE WITH FEATURES OF GENES
my $line; #
my @temp; #
my $feature_type; # GFF FEATURE TYPE
my $end_of_gff = 0; # SAYS WHETHER WE HAVE REACHED THE END OF THE GFF FILE
my $start; # FEATURE START
my $end; # FEATURE END
# OPEN THE GENE GFF OUTPUT FILE:
($gene_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GENE_GFF,">$gene_gff") || die "ERROR: make_gene_gff: cannot open $gene_gff\n";
# READ IN THE LINES OF $input_gff THAT OVERLAP WITH THE GENE:
open(GFF,"$input_gff") || die "ERROR: make_gene_gff: cannot open $input_gff\n";
while(<GFF>)
{
$line = $_;
chomp $line;
if ($line eq '##FASTA') { $end_of_gff = 1;}
if (substr($line,0,1) ne '#' && $end_of_gff == 0)
{
@temp = split(/\t+/,$line);
$feature_type = $temp[2];
if ($feature_type eq 'gene' || $feature_type eq 'mRNA' || $feature_type eq 'CDS' || $feature_type eq 'exon' || $feature_type eq 'five_prime_UTR' || $feature_type eq 'three_prime_UTR' || $feature_type eq 'transcript')
{
$start = $temp[3];
$end = $temp[4];
# SOMETIMES MAKER MAKES A PREDICTION WITH START AND/OR END POSITIONS OF -1 OR 0, THIS SHOULD BE 1 APPARENTLY (FROM TALKING TO THE DEVELOPER).
if ($start == 0 || $start == -1) { $start = 1;}
if ($end == 0 || $end == -1) { $end = 1;}
print GENE_GFF "$temp[0]\t$temp[1]\t$temp[2]\t$start\t$end\t$temp[5]\t$temp[6]\t$temp[7]\t$temp[8]\n";
}
else
{
if ($feature_type ne 'protein_match' && $feature_type ne 'match_part' && $feature_type ne 'contig' && $feature_type ne 'match' && $feature_type ne 'expressed_sequence_match')
{
$errormsg = "ERROR: make_gene_gff: feature_type $feature_type line $line\n";
$errorcode = 58; # ERRORCODE=58 (TESTED FOR)
return($gene_gff,$errorcode,$errormsg);
}
}
}
}
close(GFF);
close(GENE_GFF);
return($gene_gff,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &make_gene_gff2
sub test_make_gene_gff2
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN
my $input_gff; # THE INPUT GFF FILE
my $gene_gff; # THE GFF FILE WITH 'GENE' FEATURES
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
my $expected_gene_gff; # FILE WITH EXPECTED CONTENTS OF $gene_gff
my $differences; # DIFFERENCES BETWEEN $gene_gff AND $expected_gene_gff
my $length_differences; # LENGTH OF $differences
my $line; #
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_make_gene_gff2: cannot open $input_gff\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:0;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 36179 36341 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 37052 37204 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:2;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 38326 38535 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:3;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 38781 38871 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:4;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker exon 39087 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:5;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 36179 36341 . + 1 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 37052 37204 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 38326 38535 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 38781 38871 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker CDS 39087 39253 . + 2 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker gene 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119\n";
print INPUT_GFF "BPAG.contig.00154.117101 maker mRNA 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1;_AED=0.06;_eAED=0.06;_QI=0|0|0|1|1|1|3|0|276\n";
close(INPUT_GFF);
($gene_gff,$errorcode,$errormsg) = &make_gene_gff2($input_gff);
if ($errorcode != 0) { print STDERR "ERROR: test_make_gene_gff2: failed test1\n"; exit;}
($expected_gene_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_gene_gff") || die "ERROR: test_make_gene_gff: failed test1\n";
print EXPECTED "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110\n";
print EXPECTED "BPAG.contig.00154.117101 maker gene 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $gene_gff $expected_gene_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_make_gene_gff2: failed test1 (gene_gff $gene_gff expected_gene_gff $expected_gene_gff)\n"; exit;}
system "rm -f $gene_gff";
system "rm -f $input_gff";
system "rm -f $expected_gene_gff";
}
#------------------------------------------------------------------#
# MAKE A TEMPORARY GFF FILE WITH JUST THE 'GENE' FEATURES:
sub make_gene_gff2
{
my $input_gff = $_[0]; # INPUT GFF FILE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $gene_gff; # GFF FILE WITH 'gene' FEATURES
my $line; #
my @temp; #
my $feature_type; # GFF FEATURE TYPE
my $end_of_gff = 0; # SAYS WHETHER WE HAVE REACHED THE END OF THE GFF FILE
# OPEN THE GENE GFF OUTPUT FILE:
($gene_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GENE_GFF,">$gene_gff") || die "ERROR: make_gene_gff: cannot open $gene_gff\n";
# READ IN THE LINES OF $input_gff (FILE WITH GENE FEATURES):
open(GFF,"$input_gff") || die "ERROR: make_gene_gff: cannot open $input_gff\n";
while(<GFF>)
{
$line = $_;
chomp $line;
if ($line eq '##FASTA') { $end_of_gff = 1;}
if (substr($line,0,1) ne '#' && $end_of_gff == 0)
{
@temp = split(/\t+/,$line);
$feature_type = $temp[2];
if ($feature_type eq 'gene')
{
print GENE_GFF "$line\n";
}
}
}
close(GFF);
close(GENE_GFF);
return($gene_gff,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &get_new_genename
sub test_get_new_genename
{
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
my $new_gene_name; # NEW NAME FOR GENE
($new_gene_name,$errorcode,$errormsg) = &get_new_genename(326,"SRAE");
if ($errorcode != 0) { print STDERR "ERROR: test_get_new_genename: failed test1\n"; exit;}
if ($new_gene_name ne 'SRAE_0000000326') { print STDERR "ERROR: test_get_new_genename: failed test1b (new_gene_name $new_gene_name)\n"; exit;}
($new_gene_name,$errorcode,$errormsg) = &get_new_genename(26,"SRAE");
if ($errorcode != 0) { print STDERR "ERROR: test_get_new_genename: failed test2\n"; exit;}
if ($new_gene_name ne 'SRAE_0000000026') { print STDERR "ERROR: failed test2b\n"; exit;}
($new_gene_name,$errorcode,$errormsg) = &get_new_genename(6,"SRAE");
if ($errorcode != 0) { print STDERR "ERROR: test_get_new_genename: failed test3\n"; exit;}
if ($new_gene_name ne 'SRAE_0000000006') { print STDERR "ERROR: failed test3b\n"; exit;}
($new_gene_name,$errorcode,$errormsg) = &get_new_genename(1326,"SRAE");
if ($errorcode != 0) { print STDERR "ERROR: test_get_new_genename: failed test4\n"; exit;}
if ($new_gene_name ne 'SRAE_0000001326') { print STDERR "ERROR: failed test4b\n"; exit;}
($new_gene_name,$errorcode,$errormsg) = &get_new_genename(14326,"SRAE");
if ($errorcode != 0) { print STDERR "ERROR: test_get_new_genename: failed test5\n"; exit;}
if ($new_gene_name ne 'SRAE_0000014326') { print STDERR "ERROR: failed test5b\n"; exit;}
($new_gene_name,$errorcode,$errormsg) = &get_new_genename(514326,"SRAE");
if ($errorcode != 0) { print STDERR "ERROR: test_get_new_genename: failed test6\n"; exit;}
if ($new_gene_name ne 'SRAE_0000514326') { print STDERR "ERROR: failed test6b\n"; exit;}
($new_gene_name,$errorcode,$errormsg) = &get_new_genename(7514326,"SRAE");
if ($errorcode != 0) { print STDERR "ERROR: test_get_new_genename: failed test7\n"; exit;}
if ($new_gene_name ne 'SRAE_0007514326') { print STDERR "ERROR: failed test7b\n"; exit;}
($new_gene_name,$errorcode,$errormsg) = &get_new_genename(17514326,"SRAE");
if ($errorcode != 0) { print STDERR "ERROR: test_get_new_genename: failed test7\n"; exit;}
if ($new_gene_name ne 'SRAE_0017514326') { print STDERR "ERROR: failed test7b\n"; exit;}
($new_gene_name,$errorcode,$errormsg) = &get_new_genename(817514326,"SRAE");
if ($errorcode != 0) { print STDERR "ERROR: test_get_new_genename: failed test8\n"; exit;}
if ($new_gene_name ne 'SRAE_0817514326') { print STDERR "ERROR: failed test8b\n"; exit;}
($new_gene_name,$errorcode,$errormsg) = &get_new_genename(9817514326,"SRAE");
if ($errorcode != 0) { print STDERR "ERROR: test_get_new_genename: failed test9\n"; exit;}
if ($new_gene_name ne 'SRAE_9817514326') { print STDERR "ERROR: failed test9b\n"; exit;}
# TEST ERRORCODE=50: LENGTH OF LAST PART OF NAME IS >10:
($new_gene_name,$errorcode,$errormsg) = &get_new_genename(49817514326,"SRAE");
if ($errorcode != 50) { print STDERR "ERROR: test_get_new_genename: failed test10\n"; exit;}
}
#------------------------------------------------------------------#
# GET THE NEW NAME FOR A GENE:
sub get_new_genename
{
my $gene_count = $_[0]; # COUNT OF GENES SEEN SO FAR
my $species = $_[1]; # ABBREVIATION FOR SPECIES
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $gene_name = ""; # GENE NAME
my $length_last_part; # LENGTH OF LAST PART OF GENE NAME
my $length_first_part; # LENGTH OF FIRST PART OF GENE NAME
my $i; #
# S. RATTI GENES ARE CALLED eg. SRAE_1243200, SRAE_X083500, etc.
$length_last_part = length($gene_count);
if ($length_last_part > 10)
{
$errormsg = "ERROR: get_new_genename: gene count $gene_count\n";
$errorcode = 50; # ERRORCODE=50 (TESTED FOR)
return($gene_name,$errorcode,$errormsg);
}
$length_first_part = 10 - $length_last_part;
for ($i = 1; $i <= $length_first_part; $i++) { $gene_name = $gene_name."0";}
$gene_name = $gene_name.$gene_count;
$gene_name = $species."_".$gene_name;
return($gene_name,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &find_overlapping_features
sub test_find_overlapping_features
{
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR
my $features_gff; # GFF FILE WITH FEATURES OF GENES ('exon', 'CDS', etc.)
my $genes_gff; # GFF FILE WITH 'gene' FEATURES
my $FEATURES; # HASH TABLE OF OVERLAPPING FEATURES FOR GENES
my $features; # OVERLAPPING FEATURES FOR A GENE
my @features; # OVERLAPPING FEATURES FOR A GENE
($features_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(FEATURES_GFF,">$features_gff") || die "ERROR: test_find_overlapping_features: cannot open $features_gff\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:0;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker exon 36179 36341 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker exon 37052 37204 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:2;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker exon 38326 38535 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:3;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker exon 38781 38871 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:4;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker exon 39087 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:5;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker CDS 36179 36341 . + 1 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker CDS 37052 37204 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker CDS 38326 38535 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker CDS 38781 38871 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker CDS 39087 39253 . + 2 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker gene 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker mRNA 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1;_AED=0.06;_eAED=0.06;_QI=0|0|0|1|1|1|3|0|276\n";
close(FEATURES_GFF);
($genes_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GENES_GFF,">$genes_gff") || die "ERROR: test_find_overlapping_features: cannot open $features_gff\n";
print GENES_GFF "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110\n";
print GENES_GFF "BPAG.contig.00154.117101 maker gene 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119\n";
close(GENES_GFF);
($FEATURES,$errorcode,$errormsg) = &find_overlapping_features($features_gff,$genes_gff);
if ($errorcode != 0) { print STDERR "ERROR: test_find_overlapping_features: failed test1\n"; exit;}
if (!($FEATURES->{"BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110"})) { print STDERR "ERROR: test_find_overlapping_features: failed test1b\n"; exit;}
$features = $FEATURES->{"BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110"};
@features = split(/\n+/,$features);
if ($#features != 13) { print STDERR "ERROR: test_find_overlapping_features: failed test1c\n"; exit;}
if ($features[0] ne 'BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110') { print STDERR "ERROR: test_find_overlapping_features: failed test1d\n"; exit;}
if ($features[1] ne 'BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309') { print STDERR "ERROR: test_find_overlapping_features: failed test1e\n"; exit;}
if ($features[2] ne 'BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:0;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test1f\n"; exit;}
if ($features[3] ne 'BPAG.contig.00154.117101 maker exon 36179 36341 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test1g\n"; exit;}
if ($features[4] ne 'BPAG.contig.00154.117101 maker exon 37052 37204 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:2;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test1h\n"; exit;}
if ($features[5] ne 'BPAG.contig.00154.117101 maker exon 38326 38535 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:3;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test1i\n"; exit;}
if ($features[6] ne 'BPAG.contig.00154.117101 maker exon 38781 38871 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:4;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test1j\n"; exit;}
if ($features[7] ne 'BPAG.contig.00154.117101 maker exon 39087 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:5;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test1k\n"; exit;}
if ($features[8] ne 'BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test1l\n"; exit;}
if ($features[9] ne 'BPAG.contig.00154.117101 maker CDS 36179 36341 . + 1 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test1m\n"; exit;}
if ($features[10] ne 'BPAG.contig.00154.117101 maker CDS 37052 37204 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test1n\n"; exit;}
if ($features[11] ne 'BPAG.contig.00154.117101 maker CDS 38326 38535 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test1o\n"; exit;}
if ($features[12] ne 'BPAG.contig.00154.117101 maker CDS 38781 38871 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test1p\n"; exit;}
if ($features[13] ne 'BPAG.contig.00154.117101 maker CDS 39087 39253 . + 2 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test1q\n"; exit;}
if (!($FEATURES->{"BPAG.contig.00154.117101 maker gene 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119"})) { print STDERR "ERROR: test_find_overalpping_features: failed test2\n"; exit;}
$features = $FEATURES->{"BPAG.contig.00154.117101 maker gene 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119"};
@features = split(/\n+/,$features);
if ($#features != 1) { print STDERR "ERROR: test_find_overlapping_features: failed test2b\n"; exit;}
if ($features[0] ne 'BPAG.contig.00154.117101 maker gene 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119') { print STDERR "ERROR: test_find_overlapping_features: failed test2c\n"; exit;}
if ($features[1] ne 'BPAG.contig.00154.117101 maker mRNA 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119-mRNA-1;_AED=0.06;_eAED=0.06;_QI=0|0|0|1|1|1|3|0|276') { print STDERR "ERROR: test_find_overlapping_features: failed test2d\n"; exit;}
system "rm -f $features_gff";
system "rm -f $genes_gff";
# TEST ERRORCODE=100 (OVERLAPPING GENES OF THE SAME NAME ON THE SAME STRAND):
($features_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(FEATURES_GFF,">$features_gff") || die "ERROR: test_find_overlapping_features: cannot open $features_gff\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:0;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker exon 36179 36341 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker exon 37052 37204 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:2;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker exon 38326 38535 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:3;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker exon 38781 38871 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:4;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker exon 39087 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:5;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker CDS 36179 36341 . + 1 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker CDS 37052 37204 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker CDS 38326 38535 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker CDS 38781 38871 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker CDS 39087 39253 . + 2 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker gene 35584 39255 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110\n";
print FEATURES_GFF "BPAG.contig.00154.117101 maker mRNA 35584 39255 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;_AED=0.06;_eAED=0.06;_QI=0|0|0|1|1|1|3|0|276\n";
close(FEATURES_GFF);
($genes_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GENES_GFF,">$genes_gff") || die "ERROR: test_find_overlapping_features: cannot open $features_gff\n";
print GENES_GFF "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110\n";
print GENES_GFF "BPAG.contig.00154.117101 maker gene 103190 104650 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.119;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.119\n";
close(GENES_GFF);
($FEATURES,$errorcode,$errormsg) = &find_overlapping_features($features_gff,$genes_gff);
if ($errorcode != 100) { print STDERR "ERROR: test_find_overlapping_features: failed test3\n"; exit;}
system "rm -f $features_gff";
system "rm -f $genes_gff";
# TEST AN EXAMPLE WITH TWO OVERLAPPING GENES ON THE SAME STRAND, WHERE ONE OF THE GENES HAS BOTH POSITIVE AND NEGATIVE EXONS:
($features_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(FEATURES_GFF,">$features_gff") || die "ERROR: test_find_overlapping_features: cannot open $features_gff\n";
print FEATURES_GFF "AG00002 maker gene 683300 686711 . - . ID=maker-AG00002-pred_gff_protein_coding-gene-7.12;Name=maker-AG00002-pred_gff_protein_coding-gene-7.12\n";
print FEATURES_GFF "AG00002 maker mRNA 683300 686711 . + . ID=F07C4.9;Parent=maker-AG00002-pred_gff_protein_coding-gene-7.12;Name=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1;_AED=1.00;_eAED=1.00;_QI=84|0|0|0|1|1|6|0|282\n";
print FEATURES_GFF "AG00002 model_gff:maker match 683300 686711 . + . ID=AG00002:hit:2251:4.5.0.7;Name=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1\n";
print FEATURES_GFF "AG00002 model_gff:maker match_part 683300 683550 . - . ID=AG00002:hsp:8272:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 1 251 +;Gap=M251\n";
print FEATURES_GFF "AG00002 model_gff:maker match_part 683670 683878 . - . ID=AG00002:hsp:8273:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 252 460 +;Gap=M209\n";
print FEATURES_GFF "AG00002 model_gff:maker match_part 684544 684548 . - . ID=AG00002:hsp:8274:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 461 465 +;Gap=M5\n";
print FEATURES_GFF "AG00002 model_gff:maker match_part 685452 685459 . + . ID=AG00002:hsp:8275:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 466 473 +;Gap=M8\n";
print FEATURES_GFF "AG00002 model_gff:maker match_part 686133 686341 . + . ID=AG00002:hsp:8276:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 474 682 +;Gap=M209\n";
print FEATURES_GFF "AG00002 model_gff:maker match_part 686461 686711 . + . ID=AG00002:hsp:8277:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 683 933 +;Gap=M251\n";
print FEATURES_GFF "AG00002 maker exon 684544 684548 . - . ID=F07C4.9:exon:255;Parent=F07C4.9\n";
print FEATURES_GFF "AG00002 maker exon 683670 683878 . - . ID=F07C4.9:exon:254;Parent=F07C4.9\n";
print FEATURES_GFF "AG00002 maker exon 683300 683550 . - . ID=F07C4.9:exon:253;Parent=F07C4.9\n";
print FEATURES_GFF "AG00002 maker exon 685452 685459 . + . ID=F07C4.9:exon:256;Parent=F07C4.9\n";
print FEATURES_GFF "AG00002 maker exon 686133 686341 . + . ID=F07C4.9:exon:257;Parent=F07C4.9\n";
print FEATURES_GFF "AG00002 maker exon 686461 686711 . + . ID=F07C4.9:exon:258;Parent=F07C4.9\n";
print FEATURES_GFF "AG00002 maker five_prime_UTR 683550 683633 . + . ID=F07C4.9:five_prime_utr;Parent=F07C4.9\n";
print FEATURES_GFF "AG00002 maker CDS 683300 683634 . + 0 ID=F07C4.9:cds;Parent=F07C4.9\n";
print FEATURES_GFF "AG00002 maker CDS 683670 683878 . + 1 ID=F07C4.9:cds;Parent=F07C4.9\n";
print FEATURES_GFF "AG00002 maker CDS 684544 684548 . + 2 ID=F07C4.9:cds;Parent=F07C4.9\n";
print FEATURES_GFF "AG00002 maker CDS 685452 685459 . + 0 ID=F07C4.9:cds;Parent=F07C4.9\n";
print FEATURES_GFF "AG00002 maker CDS 686133 686341 . + 1 ID=F07C4.9:cds;Parent=F07C4.9\n";
print FEATURES_GFF "AG00002 maker CDS 686461 686711 . + 2 ID=F07C4.9:cds;Parent=F07C4.9\n";
print FEATURES_GFF "AG00002 maker gene 683300 684111 . - . ID=augustus_masked-AG00002-processed-gene-6.5;Name=augustus_masked-AG00002-processed-gene-6.5\n";
print FEATURES_GFF "AG00002 maker mRNA 683300 684111 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1;Parent=augustus_masked-AG00002-processed-gene-6.5;Name=augustus_masked-AG00002-processed-gene-6.5-mRNA-1;_AED=0.23;_eAED=0.29;_QI=54|0|0|0.66|1|1|3|0|162\n";
print FEATURES_GFF "AG00002 maker exon 684077 684111 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:exon:210;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1\n";
print FEATURES_GFF "AG00002 maker exon 683646 683902 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:exon:209;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1\n";
print FEATURES_GFF "AG00002 maker exon 683300 683550 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:exon:208;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1\n";
print FEATURES_GFF "AG00002 maker five_prime_UTR 684077 684111 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:five_prime_utr;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1\n";
print FEATURES_GFF "AG00002 maker five_prime_UTR 683884 683902 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:five_prime_utr;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1\n";
print FEATURES_GFF "AG00002 maker CDS 683646 683883 . - 0 ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:cds;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1\n";
print FEATURES_GFF "AG00002 maker CDS 683300 683550 . - 2 ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:cds;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1\n";
close(FEATURES_GFF);
($genes_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GENES_GFF,">$genes_gff") || die "ERROR: test_find_overlapping_features: cannot open $genes_gff\n";
print GENES_GFF "AG00002 maker gene 683300 686711 . - . ID=maker-AG00002-pred_gff_protein_coding-gene-7.12;Name=maker-AG00002-pred_gff_protein_coding-gene-7.12\n";
print GENES_GFF "AG00002 maker gene 683300 684111 . - . ID=augustus_masked-AG00002-processed-gene-6.5;Name=augustus_masked-AG00002-processed-gene-6.5\n";
close(GENES_GFF);
($FEATURES,$errorcode,$errormsg) = &find_overlapping_features($features_gff,$genes_gff);
if ($errorcode != 0) { print STDERR "ERROR: test_find_overlapping_features: failed test4\n"; exit;}
if (!($FEATURES->{"AG00002 maker gene 683300 686711 . - . ID=maker-AG00002-pred_gff_protein_coding-gene-7.12;Name=maker-AG00002-pred_gff_protein_coding-gene-7.12"})) { print STDERR "ERROR: test_find_overlapping_features: failed test4b\n"; exit;}
$features = $FEATURES->{"AG00002 maker gene 683300 686711 . - . ID=maker-AG00002-pred_gff_protein_coding-gene-7.12;Name=maker-AG00002-pred_gff_protein_coding-gene-7.12"};
@features = split(/\n+/,$features);
if ($#features != 15) { print STDERR "ERROR: test_find_overlapping_features: failed test4c ($#features):\n$features"; exit;}
if ($features[0] ne 'AG00002 maker gene 683300 686711 . - . ID=maker-AG00002-pred_gff_protein_coding-gene-7.12;Name=maker-AG00002-pred_gff_protein_coding-gene-7.12') { print STDERR "ERROR: test_find_overlapping_features: failed test4d\n"; exit;}
if ($features[1] ne 'AG00002 model_gff:maker match_part 683300 683550 . - . ID=AG00002:hsp:8272:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 1 251 +;Gap=M251') { print STDERR "ERROR: test_find_overlapping_features: failed test4e ($features[1])\n"; exit;}
if ($features[2] ne 'AG00002 model_gff:maker match_part 683670 683878 . - . ID=AG00002:hsp:8273:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 252 460 +;Gap=M209') { print STDERR "ERROR: test_find_overlapping_features: failed test4f\n"; exit;}
if ($features[3] ne 'AG00002 model_gff:maker match_part 684544 684548 . - . ID=AG00002:hsp:8274:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 461 465 +;Gap=M5') { print STDERR "ERROR: test_find_overlapping_features: failed test4g ($features[3])\n"; exit;}
if ($features[4] ne 'AG00002 maker exon 684544 684548 . - . ID=F07C4.9:exon:255;Parent=F07C4.9') { print STDERR "ERROR: test_find_overlapping_features: failed test4h\n"; exit;}
if ($features[5] ne 'AG00002 maker exon 683670 683878 . - . ID=F07C4.9:exon:254;Parent=F07C4.9') { print STDERR "ERROR: test_find_overlapping_features: failed test4i\n"; exit;}
if ($features[6] ne 'AG00002 maker exon 683300 683550 . - . ID=F07C4.9:exon:253;Parent=F07C4.9') { print STDERR "ERROR: test_find_overlapping_features: failed test4j\n"; exit;}
if ($features[7] ne 'AG00002 maker gene 683300 684111 . - . ID=augustus_masked-AG00002-processed-gene-6.5;Name=augustus_masked-AG00002-processed-gene-6.5') { print STDERR "ERROR: test_find_overlapping_features: failed test4k\n"; exit;}
if ($features[8] ne 'AG00002 maker mRNA 683300 684111 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1;Parent=augustus_masked-AG00002-processed-gene-6.5;Name=augustus_masked-AG00002-processed-gene-6.5-mRNA-1;_AED=0.23;_eAED=0.29;_QI=54|0|0|0.66|1|1|3|0|162') { print STDERR "ERROR: test_find_overlapping_features: failed test4l\n"; exit;}
if ($features[9] ne 'AG00002 maker exon 684077 684111 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:exon:210;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test4m\n"; exit;}
if ($features[10] ne 'AG00002 maker exon 683646 683902 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:exon:209;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test4n\n"; exit;}
if ($features[11] ne 'AG00002 maker exon 683300 683550 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:exon:208;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test4o\n"; exit;}
if ($features[12] ne 'AG00002 maker five_prime_UTR 684077 684111 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:five_prime_utr;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test4p\n"; exit;}
if ($features[13] ne 'AG00002 maker five_prime_UTR 683884 683902 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:five_prime_utr;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test4q\n"; exit;}
if ($features[14] ne 'AG00002 maker CDS 683646 683883 . - 0 ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:cds;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test4r\n"; exit;}
if ($features[15] ne 'AG00002 maker CDS 683300 683550 . - 2 ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:cds;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test4s\n"; exit;}
if (!($FEATURES->{"AG00002 maker gene 683300 684111 . - . ID=augustus_masked-AG00002-processed-gene-6.5;Name=augustus_masked-AG00002-processed-gene-6.5"})) { print STDERR "ERROR: test_find_overlapping_features: failed test4aa\n"; exit;}
$features = $FEATURES->{"AG00002 maker gene 683300 684111 . - . ID=augustus_masked-AG00002-processed-gene-6.5;Name=augustus_masked-AG00002-processed-gene-6.5"};
@features = split(/\n+/,$features);
if ($#features != 13) { print STDERR "ERROR: test_find_overlapping_features: failed test4ab (($#features)\n"; exit;}
if ($features[0] ne 'AG00002 maker gene 683300 686711 . - . ID=maker-AG00002-pred_gff_protein_coding-gene-7.12;Name=maker-AG00002-pred_gff_protein_coding-gene-7.12') { print STDERR "ERROR: test_find_overlapping_features: failed test4ac\n"; exit;}
if ($features[1] ne 'AG00002 model_gff:maker match_part 683300 683550 . - . ID=AG00002:hsp:8272:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 1 251 +;Gap=M251') { print STDERR "ERROR: test_find_overlapping_features: failed test4ad ($features[3])\n"; exit;}
if ($features[2] ne 'AG00002 model_gff:maker match_part 683670 683878 . - . ID=AG00002:hsp:8273:4.5.0.7;Parent=AG00002:hit:2251:4.5.0.7;Target=maker-AG00002-pred_gff_protein_coding-gene-7.12-mRNA-1 252 460 +;Gap=M209') { print STDERR "ERROR: test_find_overlapping_features: failed testae\n"; exit;}
if ($features[3] ne 'AG00002 maker exon 683670 683878 . - . ID=F07C4.9:exon:254;Parent=F07C4.9') { print STDERR "ERROR: test_find_overlapping_features: failed test4af\n"; exit;}
if ($features[4] ne 'AG00002 maker exon 683300 683550 . - . ID=F07C4.9:exon:253;Parent=F07C4.9') { print STDERR "ERROR: test_find_overlapping_features: failed test4ag\n"; exit;}
if ($features[5] ne 'AG00002 maker gene 683300 684111 . - . ID=augustus_masked-AG00002-processed-gene-6.5;Name=augustus_masked-AG00002-processed-gene-6.5') { print STDERR "ERROR: test_find_overlapping_features: failed test4ah\n"; exit;}
if ($features[6] ne 'AG00002 maker mRNA 683300 684111 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1;Parent=augustus_masked-AG00002-processed-gene-6.5;Name=augustus_masked-AG00002-processed-gene-6.5-mRNA-1;_AED=0.23;_eAED=0.29;_QI=54|0|0|0.66|1|1|3|0|162') { print STDERR "ERROR: test_find_overlapping_features: failed test4ai\n"; exit;}
if ($features[7] ne 'AG00002 maker exon 684077 684111 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:exon:210;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test4aj\n"; exit;}
if ($features[8] ne 'AG00002 maker exon 683646 683902 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:exon:209;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test4ak\n"; exit;}
if ($features[9] ne 'AG00002 maker exon 683300 683550 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:exon:208;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test4al\n"; exit;}
if ($features[10] ne 'AG00002 maker five_prime_UTR 684077 684111 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:five_prime_utr;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test4am\n"; exit;}
if ($features[11] ne 'AG00002 maker five_prime_UTR 683884 683902 . - . ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:five_prime_utr;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test4an\n"; exit;}
if ($features[12] ne 'AG00002 maker CDS 683646 683883 . - 0 ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:cds;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test4ao\n"; exit;}
if ($features[13] ne 'AG00002 maker CDS 683300 683550 . - 2 ID=augustus_masked-AG00002-processed-gene-6.5-mRNA-1:cds;Parent=augustus_masked-AG00002-processed-gene-6.5-mRNA-1') { print STDERR "ERROR: test_find_overlapping_features: failed test4ap\n"; exit;}
system "rm -f $features_gff";
system "rm -f $genes_gff";
}
#------------------------------------------------------------------#
# FIND FEATURES THAT OVERLAP BETWEEN EACH GENE IN $gene_gff2, AND ALL THE GENE FEATURES ('exon', 'CDS', etc.)
# IN $gene_gff:
sub find_overlapping_features
{
my $features_gff = $_[0]; # GFF FILE WITH FEATURES IN A GENE
my $genes_gff = $_[1]; # GFF FILE WITH 'gene' FEATURES ONLY
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my %FEATURES = (); # HASH TABLE WITH OVERLAPPING FEATURES FOR EACH GENE
my $overlaps; # FILE WITH OVERLAPS
my $line; #
my @temp; #
my $gene; # NAME OF GENE
my $cmd; # COMMAND TO RUN
my $gene1; # NAME OF THE FIRST GENE
my $start1; # START POSITION OF $gene1
my $end1; # END POSITION OF $gene1
my $feature_type2; # TYPE OF THE SECOND FEATURE
my $start2; # START OF THE SECOND FEATURE
my $end2; # END OF THE SECOND FEATURE
my $feature2; # NAME OF THE SECOND FEATURE
my $overlap; # GFF LINE FOR OVERLAPPING FEATURE
my $gene1_line; # GFF LINE FOR $gene1
($overlaps,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
$cmd = "bedtools intersect -loj -s -a $genes_gff -b $features_gff > $overlaps";
system "$cmd";
# The -loj option means that for each feature in gff1, BEDTools will report each overlapping feature in gff2.
# -s MEANS THAT WE ONLY REPORT FEATURES ON THE SAME STRAND. THIS IS BECAUSE IN SOME CASES THERE ARE TWO
# OVERLAPPING GENES WITH THE SAME NAME IN AN INPUT GFF, BUT THEY ARE ON OPPOSITE STRANDS.
# READ IN THE OVERLAPPING FEATURES FOR EACH GENE:
open(OVERLAPS,"$overlaps") || die "ERROR: find_overlapping_features: cannot open $overlaps\n";
while(<OVERLAPS>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
# eg. BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110 BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110
$gene1 = $temp[8];
$start1 = $temp[3];
$end1 = $temp[4];
$feature_type2 = $temp[11];
$start2 = $temp[12];
$end2 = $temp[13];
$feature2 = $temp[17];
if ($feature_type2 eq 'gene')
{
if ($gene1 eq $feature2 && !($start1 == $start2 && $end1 == $end2))
{
$errorcode = 100; # ERRORCODE=100 (TESTED FOR)
$errormsg = "ERROR: find_overlapping_features: gene $gene1 overlaps gene2 $feature2: line:\n$line\n";
system "rm -f $overlaps";
return(\%FEATURES,$errorcode,$errormsg);
}
}
# GET THE GFF LINE FOR THE OVERLAPPING FEATURE:
$overlap = $temp[9]."\t".$temp[10]."\t".$temp[11]."\t".$temp[12]."\t".$temp[13]."\t".$temp[14]."\t".$temp[15]."\t".$temp[16]."\t".$temp[17]."\n";
# RECORD THE OVERLAPS FOR $gene1:
$gene1_line = $temp[0]."\t".$temp[1]."\t".$temp[2]."\t".$temp[3]."\t".$temp[4]."\t".$temp[5]."\t".$temp[6]."\t".$temp[7]."\t".$temp[8];
if (!($FEATURES{$gene1_line})) { $FEATURES{$gene1_line} = $overlap; }
else { $FEATURES{$gene1_line} = $FEATURES{$gene1_line}.$overlap;}
}
close(OVERLAPS);
# DELETE TEMPORARY FILES:
system "rm -f $overlaps";
return(\%FEATURES,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &fix_gff_lines_for_gene
sub test_fix_gff_lines_for_gene
{
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES IN
my $gff_lines; # GFF LINES
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
my $new_lines; # NEW LINES
my @temp; #
$gff_lines = "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:0;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 36179 36341 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 37052 37204 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:2;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 38326 38535 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:3;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 38781 38871 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:4;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 39087 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:5;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 36179 36341 . + 1 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 37052 37204 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 38326 38535 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 38781 38871 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 39087 39253 . + 2 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
($new_lines,$errorcode,$errormsg) = &fix_gff_lines_for_gene($gff_lines,'SRAE_00003232','maker-BPAG.contig.00154.117101-augustus-gene-0.110');
if ($errorcode != 0) { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test1 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
@temp = split(/\n/,$new_lines);
if ($#temp != 13) { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test1b ($#temp)\n"; exit;}
if ($temp[0] ne 'BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=SRAE_00003232;Name=SRAE_00003232') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test1c\n"; exit;}
if ($temp[1] ne 'BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=SRAE_00003232-mRNA-1;Parent=SRAE_00003232;Name=SRAE_00003232-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test1d (temp1 $temp[1])\n"; exit;}
if ($temp[2] ne 'BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=SRAE_00003232-mRNA-1:exon:1;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test1e (temp2 $temp[2])\n"; exit;}
if ($temp[3] ne 'BPAG.contig.00154.117101 maker exon 36179 36341 . + . ID=SRAE_00003232-mRNA-1:exon:2;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test1f\n"; exit;}
if ($temp[4] ne 'BPAG.contig.00154.117101 maker exon 37052 37204 . + . ID=SRAE_00003232-mRNA-1:exon:3;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test1g\n"; exit;}
if ($temp[5] ne 'BPAG.contig.00154.117101 maker exon 38326 38535 . + . ID=SRAE_00003232-mRNA-1:exon:4;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test1h\n"; exit;}
if ($temp[6] ne 'BPAG.contig.00154.117101 maker exon 38781 38871 . + . ID=SRAE_00003232-mRNA-1:exon:5;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test1i\n"; exit;}
if ($temp[7] ne 'BPAG.contig.00154.117101 maker exon 39087 39253 . + . ID=SRAE_00003232-mRNA-1:exon:6;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test1j\n"; exit;}
if ($temp[8] ne 'BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test1k (temp8 $temp[8])\n"; exit;}
if ($temp[9] ne 'BPAG.contig.00154.117101 maker CDS 36179 36341 . + 1 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test1l\n"; exit;}
if ($temp[10] ne 'BPAG.contig.00154.117101 maker CDS 37052 37204 . + 0 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test1m\n"; exit;}
if ($temp[11] ne 'BPAG.contig.00154.117101 maker CDS 38326 38535 . + 0 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test1n\n"; exit;}
if ($temp[12] ne 'BPAG.contig.00154.117101 maker CDS 38781 38871 . + 0 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test1o\n"; exit;}
if ($temp[13] ne 'BPAG.contig.00154.117101 maker CDS 39087 39253 . + 2 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test1p\n"; exit;}
# EXAMPLE OF A GENE ON THE NEGATIVE STRAND:
$gff_lines = "BPAG.contig.00154.117101 maker gene 35582 39253 . - . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker mRNA 35582 39253 . - . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 35582 35727 . - . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:0;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 36179 36341 . - . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 37052 37204 . - . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:2;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 38326 38535 . - . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:3;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 38781 38871 . - . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:4;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 39087 39253 . - . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:5;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 35582 35727 . - 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 36179 36341 . - 1 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 37052 37204 . - 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 38326 38535 . - 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 38781 38871 . - 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 39087 39253 . - 2 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
($new_lines,$errorcode,$errormsg) = &fix_gff_lines_for_gene($gff_lines,'SRAE_00003232','maker-BPAG.contig.00154.117101-augustus-gene-0.110');
if ($errorcode != 0) { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test2\n"; exit;}
@temp = split(/\n/,$new_lines);
if ($#temp != 13) { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test2b\n"; exit;}
if ($temp[0] ne 'BPAG.contig.00154.117101 maker gene 35582 39253 . - . ID=SRAE_00003232;Name=SRAE_00003232') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test2c\n"; exit;}
if ($temp[1] ne 'BPAG.contig.00154.117101 maker mRNA 35582 39253 . - . ID=SRAE_00003232-mRNA-1;Parent=SRAE_00003232;Name=SRAE_00003232-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test2d (temp1 $temp[1])\n"; exit;}
if ($temp[2] ne 'BPAG.contig.00154.117101 maker exon 35582 35727 . - . ID=SRAE_00003232-mRNA-1:exon:6;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test2e (temp2 $temp[2])\n"; exit;}
if ($temp[3] ne 'BPAG.contig.00154.117101 maker exon 36179 36341 . - . ID=SRAE_00003232-mRNA-1:exon:5;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test2f\n"; exit;}
if ($temp[4] ne 'BPAG.contig.00154.117101 maker exon 37052 37204 . - . ID=SRAE_00003232-mRNA-1:exon:4;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test2g\n"; exit;}
if ($temp[5] ne 'BPAG.contig.00154.117101 maker exon 38326 38535 . - . ID=SRAE_00003232-mRNA-1:exon:3;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test2h\n"; exit;}
if ($temp[6] ne 'BPAG.contig.00154.117101 maker exon 38781 38871 . - . ID=SRAE_00003232-mRNA-1:exon:2;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test2i\n"; exit;}
if ($temp[7] ne 'BPAG.contig.00154.117101 maker exon 39087 39253 . - . ID=SRAE_00003232-mRNA-1:exon:1;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test2j\n"; exit;}
if ($temp[8] ne 'BPAG.contig.00154.117101 maker CDS 35582 35727 . - 0 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test2k (temp8 $temp[8])\n"; exit;}
if ($temp[9] ne 'BPAG.contig.00154.117101 maker CDS 36179 36341 . - 1 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test2l\n"; exit;}
if ($temp[10] ne 'BPAG.contig.00154.117101 maker CDS 37052 37204 . - 0 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test2m\n"; exit;}
if ($temp[11] ne 'BPAG.contig.00154.117101 maker CDS 38326 38535 . - 0 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test2n\n"; exit;}
if ($temp[12] ne 'BPAG.contig.00154.117101 maker CDS 38781 38871 . - 0 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test2o\n"; exit;}
if ($temp[13] ne 'BPAG.contig.00154.117101 maker CDS 39087 39253 . - 2 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test2p\n"; exit;}
# TEST ERRORCODE=60 (UNEXPECTED FEATURE TYPE):
$gff_lines = "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:0;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 36179 36341 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 37052 37204 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:2;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 38326 38535 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:3;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 38781 38871 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:4;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 39087 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:5;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 36179 36341 . + 1 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 37052 37204 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 38326 38535 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 38781 38871 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker contig 39087 39253 . + 2 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
($new_lines,$errorcode,$errormsg) = &fix_gff_lines_for_gene($gff_lines,'SRAE_00003232','maker-BPAG.contig.00154.117101-augustus-gene-0.110');
if ($errorcode != 60) { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test3\n"; exit;}
# EXAMPLE OF A GENE THAT DOES NOT HAVE -mRNA- IN TRANSCRIPT NAMES:
$gff_lines = "BPAG.contig.00154.117101 maker gene 53633 55479 . + . ID=maker-BPAG.contig.00154.117101-pred_gff_protein_coding-gene-0.33;Name=maker-BPAG.contig.00154.117101-pred_gff_protein_coding-gene-0.33\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker mRNA 53633 55479 . + . ID=1:ZK795.3;Parent=maker-BPAG.contig.00154.117101-pred_gff_protein_coding-gene-0.33;Name=maker-BPAG.contig.00154.117101-pred_gff_protein_coding-gene-0.33-mRNA-1;_AED=0.13;_eAED=0.13;_QI=0|0|0|1|1|1|6|0|288\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 53633 53816 . + . ID=1:ZK795.3:exon:18;Parent=1:ZK795.3\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 54152 54249 . + . ID=1:ZK795.3:exon:19;Parent=1:ZK795.3\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 54398 54487 . + . ID=1:ZK795.3:exon:20;Parent=1:ZK795.3\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 54707 54904 . + . ID=1:ZK795.3:exon:21;Parent=1:ZK795.3\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 55077 55245 . + . ID=1:ZK795.3:exon:22;Parent=1:ZK795.3\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 55352 55479 . + . ID=1:ZK795.3:exon:23;Parent=1:ZK795.3\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 53633 53816 . + 0 ID=1:ZK795.3:cds;Parent=1:ZK795.3\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 54152 54249 . + 2 ID=1:ZK795.3:cds;Parent=1:ZK795.3\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 54398 54487 . + 0 ID=1:ZK795.3:cds;Parent=1:ZK795.3\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 54707 54904 . + 0 ID=1:ZK795.3:cds;Parent=1:ZK795.3\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 55077 55245 . + 0 ID=1:ZK795.3:cds;Parent=1:ZK795.3\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 55352 55479 . + 2 ID=1:ZK795.3:cds;Parent=1:ZK795.3\n";
($new_lines,$errorcode,$errormsg) = &fix_gff_lines_for_gene($gff_lines,'SRAE_00003232','maker-BPAG.contig.00154.117101-pred_gff_protein_coding-gene-0.33');
if ($errorcode != 0) { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test4\n"; exit;}
@temp = split(/\n/,$new_lines);
if ($#temp != 13) { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test4b ($#temp)\n"; exit;}
if ($temp[0] ne 'BPAG.contig.00154.117101 maker gene 53633 55479 . + . ID=SRAE_00003232;Name=SRAE_00003232') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test4c\n"; exit;}
if ($temp[1] ne 'BPAG.contig.00154.117101 maker mRNA 53633 55479 . + . ID=SRAE_00003232-mRNA-1;Parent=SRAE_00003232;Name=SRAE_00003232-mRNA-1;_AED=0.13;_eAED=0.13;_QI=0|0|0|1|1|1|6|0|288') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test4d\n"; exit;}
if ($temp[2] ne 'BPAG.contig.00154.117101 maker exon 53633 53816 . + . ID=SRAE_00003232-mRNA-1:exon:1;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test4e (temp2 $temp[2])\n"; exit;}
if ($temp[3] ne 'BPAG.contig.00154.117101 maker exon 54152 54249 . + . ID=SRAE_00003232-mRNA-1:exon:2;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test4f\n"; exit;}
if ($temp[4] ne 'BPAG.contig.00154.117101 maker exon 54398 54487 . + . ID=SRAE_00003232-mRNA-1:exon:3;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test4g\n"; exit;}
if ($temp[5] ne 'BPAG.contig.00154.117101 maker exon 54707 54904 . + . ID=SRAE_00003232-mRNA-1:exon:4;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test4h\n"; exit;}
if ($temp[6] ne 'BPAG.contig.00154.117101 maker exon 55077 55245 . + . ID=SRAE_00003232-mRNA-1:exon:5;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test4i\n"; exit;}
if ($temp[7] ne 'BPAG.contig.00154.117101 maker exon 55352 55479 . + . ID=SRAE_00003232-mRNA-1:exon:6;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test4j\n"; exit;}
if ($temp[8] ne 'BPAG.contig.00154.117101 maker CDS 53633 53816 . + 0 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test4k\n"; exit;}
if ($temp[9] ne 'BPAG.contig.00154.117101 maker CDS 54152 54249 . + 2 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test4l\n"; exit;}
if ($temp[10] ne 'BPAG.contig.00154.117101 maker CDS 54398 54487 . + 0 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test4m\n"; exit;}
if ($temp[11] ne 'BPAG.contig.00154.117101 maker CDS 54707 54904 . + 0 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test4n\n"; exit;}
if ($temp[12] ne 'BPAG.contig.00154.117101 maker CDS 55077 55245 . + 0 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test4o\n"; exit;}
if ($temp[13] ne 'BPAG.contig.00154.117101 maker CDS 55352 55479 . + 2 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test4p\n"; exit;}
# EXAMPLE OF DUPLICATE LINES IN THE INPUT:
$gff_lines = "BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:0;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 36179 36341 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 37052 37204 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:2;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 38326 38535 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:3;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 38781 38871 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:4;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker exon 39087 39253 . + . ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:5;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 36179 36341 . + 1 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 37052 37204 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 37052 37204 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 38326 38535 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 38781 38871 . + 0 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
$gff_lines = $gff_lines."BPAG.contig.00154.117101 maker CDS 39087 39253 . + 2 ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1\n";
($new_lines,$errorcode,$errormsg) = &fix_gff_lines_for_gene($gff_lines,'SRAE_00003232','maker-BPAG.contig.00154.117101-augustus-gene-0.110');
if ($errorcode != 0) { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test5 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
@temp = split(/\n/,$new_lines);
if ($#temp != 13) { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test5b ($#temp)\n"; exit;}
if ($temp[0] ne 'BPAG.contig.00154.117101 maker gene 35582 39253 . + . ID=SRAE_00003232;Name=SRAE_00003232') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test5c\n"; exit;}
if ($temp[1] ne 'BPAG.contig.00154.117101 maker mRNA 35582 39253 . + . ID=SRAE_00003232-mRNA-1;Parent=SRAE_00003232;Name=SRAE_00003232-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test5d (temp1 $temp[1])\n"; exit;}
if ($temp[2] ne 'BPAG.contig.00154.117101 maker exon 35582 35727 . + . ID=SRAE_00003232-mRNA-1:exon:1;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test5e (temp2 $temp[2])\n"; exit;}
if ($temp[3] ne 'BPAG.contig.00154.117101 maker exon 36179 36341 . + . ID=SRAE_00003232-mRNA-1:exon:2;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test5f\n"; exit;}
if ($temp[4] ne 'BPAG.contig.00154.117101 maker exon 37052 37204 . + . ID=SRAE_00003232-mRNA-1:exon:3;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test5g\n"; exit;}
if ($temp[5] ne 'BPAG.contig.00154.117101 maker exon 38326 38535 . + . ID=SRAE_00003232-mRNA-1:exon:4;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test5h\n"; exit;}
if ($temp[6] ne 'BPAG.contig.00154.117101 maker exon 38781 38871 . + . ID=SRAE_00003232-mRNA-1:exon:5;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test5i\n"; exit;}
if ($temp[7] ne 'BPAG.contig.00154.117101 maker exon 39087 39253 . + . ID=SRAE_00003232-mRNA-1:exon:6;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test5j\n"; exit;}
if ($temp[8] ne 'BPAG.contig.00154.117101 maker CDS 35582 35727 . + 0 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test5k (temp8 $temp[8])\n"; exit;}
if ($temp[9] ne 'BPAG.contig.00154.117101 maker CDS 36179 36341 . + 1 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test5l\n"; exit;}
if ($temp[10] ne 'BPAG.contig.00154.117101 maker CDS 37052 37204 . + 0 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test5m\n"; exit;}
if ($temp[11] ne 'BPAG.contig.00154.117101 maker CDS 38326 38535 . + 0 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test5n\n"; exit;}
if ($temp[12] ne 'BPAG.contig.00154.117101 maker CDS 38781 38871 . + 0 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test5o\n"; exit;}
if ($temp[13] ne 'BPAG.contig.00154.117101 maker CDS 39087 39253 . + 2 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test5p\n"; exit;}
# EXAMPLE WITH MULTIPLE ALTERNATIVE TRANSCRIPTS:
$gff_lines = "Sratt_Chr00_000002 maker gene 371193 372952 . + . ID=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1;Name=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1\n";
$gff_lines = $gff_lines."Sratt_Chr00_000002 maker mRNA 371193 372952 . + . ID=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-1;Parent=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1;Name=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-1;_AED=0.02;_eAED=0.02;_QI=550|1|1|1|0|0|3|60|350;score=3922\n";
$gff_lines = $gff_lines."Sratt_Chr00_000002 maker mRNA 371415 372952 . + . ID=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-2;Parent=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1;Name=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-2;_AED=0.07;_eAED=0.07;_QI=378|1|1|1|0|0|2|60|350;score=7307\n";
$gff_lines = $gff_lines."Sratt_Chr00_000002 maker exon 371193 371384 . + . ID=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-1:exon:229;Parent=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-1\n";
$gff_lines = $gff_lines."Sratt_Chr00_000002 maker exon 371435 371821 . + . ID=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-1:exon:230;Parent=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-1\n";
$gff_lines = $gff_lines."Sratt_Chr00_000002 maker exon 371869 372952 . + . ID=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-1:exon:231;Parent=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-1,maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-2\n";
$gff_lines = $gff_lines."Sratt_Chr00_000002 maker exon 371415 371821 . + . ID=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-2:exon:232;Parent=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-2\n";
$gff_lines = $gff_lines."Sratt_Chr00_000002 maker five_prime_UTR 371193 371384 . + . ID=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-1:five_prime_utr;Parent=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-1\n";
$gff_lines = $gff_lines."Sratt_Chr00_000002 maker five_prime_UTR 371435 371792 . + . ID=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-1:five_prime_utr;Parent=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-1\n";
$gff_lines = $gff_lines."Sratt_Chr00_000002 maker CDS 371793 371821 . + 0 ID=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-1:cds;Parent=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-1\n";
$gff_lines = $gff_lines."Sratt_Chr00_000002 maker CDS 371869 372892 . + 1 ID=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-1:cds;Parent=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-1\n";
$gff_lines = $gff_lines."Sratt_Chr00_000002 maker three_prime_UTR 372893 372952 . + . ID=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-1:three_prime_utr;Parent=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-1\n";
$gff_lines = $gff_lines."Sratt_Chr00_000002 maker five_prime_UTR 371415 371792 . + . ID=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-2:five_prime_utr;Parent=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-2\n";
$gff_lines = $gff_lines."Sratt_Chr00_000002 maker CDS 371793 371821 . + 0 ID=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-2:cds;Parent=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-2\n";
$gff_lines = $gff_lines."Sratt_Chr00_000002 maker CDS 371869 372892 . + 1 ID=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-2:cds;Parent=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-2\n";
$gff_lines = $gff_lines."Sratt_Chr00_000002 maker three_prime_UTR 372893 372952 . + . ID=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-2:three_prime_utr;Parent=maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1-mRNA-2\n";
($new_lines,$errorcode,$errormsg) = &fix_gff_lines_for_gene($gff_lines,'SRAE_00003232','maker-Sratt_Chr00_000002-exonerate_est2genome-gene-3.1');
if ($errorcode != 0) { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test6 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
@temp = split(/\n/,$new_lines);
if ($#temp != 16) { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test6b ($#temp)\n"; exit;}
if ($temp[0] ne 'Sratt_Chr00_000002 maker gene 371193 372952 . + . ID=SRAE_00003232;Name=SRAE_00003232') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test6c\n"; exit;}
if ($temp[1] ne 'Sratt_Chr00_000002 maker mRNA 371193 372952 . + . ID=SRAE_00003232-mRNA-1;Parent=SRAE_00003232;Name=SRAE_00003232-mRNA-1;_AED=0.02;_eAED=0.02;_QI=550|1|1|1|0|0|3|60|350;score=3922') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test6d\n"; exit;}
if ($temp[2] ne 'Sratt_Chr00_000002 maker mRNA 371415 372952 . + . ID=SRAE_00003232-mRNA-2;Parent=SRAE_00003232;Name=SRAE_00003232-mRNA-2;_AED=0.07;_eAED=0.07;_QI=378|1|1|1|0|0|2|60|350;score=7307') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test6e\n"; exit;}
if ($temp[3] ne 'Sratt_Chr00_000002 maker exon 371193 371384 . + . ID=SRAE_00003232-mRNA-1:exon:1;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test6f\n"; exit;}
if ($temp[4] ne 'Sratt_Chr00_000002 maker exon 371435 371821 . + . ID=SRAE_00003232-mRNA-1:exon:2;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test6g\n"; exit;}
if ($temp[5] ne 'Sratt_Chr00_000002 maker exon 371869 372952 . + . ID=SRAE_00003232-mRNA-1:exon:3;Parent=SRAE_00003232-mRNA-1') { print STDER "ERROR: test_fix_gff_lines_for_gene: failed test6h\n"; exit;}
if ($temp[6] ne 'Sratt_Chr00_000002 maker exon 371869 372952 . + . ID=SRAE_00003232-mRNA-2:exon:2;Parent=SRAE_00003232-mRNA-2') { print STDER "ERROR: test_fix_gff_lines_for_gene: failed test6i\n"; exit;}
if ($temp[7] ne 'Sratt_Chr00_000002 maker exon 371415 371821 . + . ID=SRAE_00003232-mRNA-2:exon:1;Parent=SRAE_00003232-mRNA-2') { print STDERR "ERROR: test_fix_gff_lines_for_gene: failed test6j\n"; exit;}
if ($temp[8] ne 'Sratt_Chr00_000002 maker five_prime_UTR 371193 371384 . + . ID=SRAE_00003232-mRNA-1:five_prime_utr;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_line_for_gene: failed test6k\n"; exit;}
if ($temp[9] ne 'Sratt_Chr00_000002 maker five_prime_UTR 371435 371792 . + . ID=SRAE_00003232-mRNA-1:five_prime_utr;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_line_for_gene: failed test6l\n"; exit;}
if ($temp[10] ne 'Sratt_Chr00_000002 maker CDS 371793 371821 . + 0 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_line_for_gene: failed test6m\n"; exit;}
if ($temp[11] ne 'Sratt_Chr00_000002 maker CDS 371869 372892 . + 1 ID=SRAE_00003232-mRNA-1:cds;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_line_for_gene: failed test6n\n"; exit;}
if ($temp[12] ne 'Sratt_Chr00_000002 maker three_prime_UTR 372893 372952 . + . ID=SRAE_00003232-mRNA-1:three_prime_utr;Parent=SRAE_00003232-mRNA-1') { print STDERR "ERROR: test_fix_gff_line_for_gene: failed test6o\n"; exit;}
if ($temp[13] ne 'Sratt_Chr00_000002 maker five_prime_UTR 371415 371792 . + . ID=SRAE_00003232-mRNA-2:five_prime_utr;Parent=SRAE_00003232-mRNA-2') { print STDERR "ERROR: test_fix_gff_line_for_gene: failed test6p\n"; exit;}
if ($temp[14] ne 'Sratt_Chr00_000002 maker CDS 371793 371821 . + 0 ID=SRAE_00003232-mRNA-2:cds;Parent=SRAE_00003232-mRNA-2') { print STDERR "ERROR: test_fix_gff_line_for_gene: failed test6q\n"; exit;}
if ($temp[15] ne 'Sratt_Chr00_000002 maker CDS 371869 372892 . + 1 ID=SRAE_00003232-mRNA-2:cds;Parent=SRAE_00003232-mRNA-2') { print STDERR "ERROR: test_fix_gff_line_for_gene: failed test6r\n"; exit;}
if ($temp[16] ne 'Sratt_Chr00_000002 maker three_prime_UTR 372893 372952 . + . ID=SRAE_00003232-mRNA-2:three_prime_utr;Parent=SRAE_00003232-mRNA-2') { print STDERR "ERROR: test_fix_gff_line_for_gene: failed test6s\n"; exit;}
}
#------------------------------------------------------------------#
# RENAME A GENE AND ALL ITS EXONS etc.:
sub fix_gff_lines_for_gene
{
my $gff_lines = $_[0]; # GFF LINES THAT OVERLAP WITH THE GENE
my $new_gene_name = $_[1]; # NEW NAME FOR A GENE
my $old_gene_name = $_[2]; # OLD NAME FOR THE GENE
my $new_lines = ""; # REVISED GFF LINES FOR GENE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $GENE; # HASH TABLE OF GENE NAMES FOR TRANSCRIPTS
my $line; #
my @temp; #
my @temp2; #
my $new_line; # NEW GFF LINE
my $feature_type; # FEATURE TYPE
my $gene; # GENE NAME
my $transcript; # TRANSCRIPT NAME
my $new_transcript_name; # NEW NAME FOR TRANSCRIPT
my $aed_score; # AED SCORE INFORMATION
my $exon_start; # EXON START
my $exon_end; # EXON END
my $strand; # EXON STRAND
my %EXONS = (); # EXONS IN A TRANSCRIPT
my $EXONNUM; # HASH TABLE FOR STORING NUMBER OF AN EXON ALONG A TRANSCRIPT
my $exon_num; # EXON NUMBER FOR EXON
my $transcript_num; # TRANSCRIPT NUMBER IN THE GENE
my @new_lines; # THE NEW LINES
my $gene_gff_linecnt; # NUMBER OF LINES IN $gene_gff
my @gff_lines; # GFF LINES
my $i; #
my $scaffold; # SCAFFOLD
my %SEENLINE = (); # SAYS WHETHER WE HAVE ALREADY SEEN A NEW GFF LINE
my $j; #
my @temp3; #
my %SEEN = (); # HASH TABLE TO RECORD WHICH EXONS WE HAVE SEEN IN A TRANSCRIPT
# READ IN THE GENE NAME FOR EACH TRANSCRIPT IN $gff_lines (WHICH JUST CONTAINS FEATURES THAT OVERLAP GENE $old_gene_name):
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($gff_lines);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# READ IN THE GFF LINES THAT OVERLAP WITH THE GENE, AND ASSIGN NUMBERS TO THE EXONS:
# THE EXONS SHOULD BE NUMBERED 1-... FROM 5' TO 3' OF THE TRANSCRIPT.
@gff_lines = split(/\n+/,$gff_lines);
for ($i = 0; $i <= $#gff_lines; $i++)
{
$line = $gff_lines[$i];
chomp $line;
@temp = split(/\t+/,$line);
$scaffold = $temp[0];
$feature_type = $temp[2];
if ($feature_type eq 'gene' || $feature_type eq 'mRNA' || $feature_type eq 'CDS' || $feature_type eq 'exon' || $feature_type eq 'five_prime_UTR' || $feature_type eq 'three_prime_UTR')
{
if ($feature_type eq 'exon')
{
# ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:0;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1
$exon_start = $temp[3];
$exon_end = $temp[4];
$strand = $temp[6];
$transcript = $temp[8];
@temp2 = split(/Parent=/,$transcript);
$transcript = $temp2[1];
@temp2 = split(/\;/,$transcript);
$transcript = $temp2[0]; # eg. maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1
if ($transcript =~ /\,/)
{
@temp2 = split(/\,/,$transcript);
for ($j = 0; $j <= $#temp2; $j++)
{
$transcript = $temp2[$j];
# RECORD THE EXONS IN THE TRANSCRIPT:
# WE NEED TO RECORD WHETHER WE'VE SEEN A CERTAIN EXON IN THE TRANSCRIPT BEFORE, AS SOME MAKER LINES HAVE
# THE SAME TRANSCRIPT NAME TWICE:
# eg. N_americanus-1.0_Cont202 maker exon 178995 179125 . + . ID=1:ZK809.5b:exon:143;Parent=1:ZK809.5b,1:ZK809.5b
if (!($SEEN{$transcript."=".$exon_start."_".$exon_end."_".$strand}))
{
$SEEN{$transcript."=".$exon_start."_".$exon_end."_".$strand} = 1;
if ($EXONS{$transcript}) { $EXONS{$transcript} = $EXONS{$transcript}.",".$transcript."=".$exon_start."=".$exon_end."=".$strand;}
else { $EXONS{$transcript} = $transcript."=".$exon_start."=".$exon_end."=".$strand; }
}
}
}
else
{
# RECORD THE EXONS IN THE TRANSCRIPT:
if (!($SEEN{$transcript."=".$exon_start."_".$exon_end."_".$strand}))
{
$SEEN{$transcript."=".$exon_start."_".$exon_end."_".$strand} = 1;
if ($EXONS{$transcript}) { $EXONS{$transcript} = $EXONS{$transcript}.",".$transcript."=".$exon_start."=".$exon_end."=".$strand;}
else { $EXONS{$transcript} = $transcript."=".$exon_start."=".$exon_end."=".$strand; }
}
}
}
}
else
{
$errormsg = "ERROR: fix_gff_lines_for_gene: feature_type $feature_type line $line\n";
$errorcode = 60; # ERRORCODE=60 (TESTED FOR)
return($new_lines,$errorcode,$errormsg);
}
}
# ASSIGN NUMBERS TO THE EXONS IN THE TRANSCRIPTS:
($EXONNUM,$errorcode,$errormsg) = &assign_exon_numbers(\%EXONS);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# READ IN THE GFF LINES THAT OVERLAP WITH THE GENE:
for ($i = 0; $i <= $#gff_lines; $i++)
{
$line = $gff_lines[$i];
chomp $line;
@temp = split(/\t+/,$line);
$feature_type = $temp[2];
if ($feature_type eq 'gene' || $feature_type eq 'mRNA' || $feature_type eq 'CDS' || $feature_type eq 'exon' || $feature_type eq 'five_prime_UTR' || $feature_type eq 'three_prime_UTR')
{
if ($feature_type eq 'gene')
{
$gene = $temp[8]; # ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=m...
@temp2 = split(/ID=/,$gene);
$gene = $temp2[1]; # maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=m...
@temp2 = split(/\;/,$gene);
$gene = $temp2[0];
if ($gene eq $old_gene_name)
{
$new_line = $temp[0]."\t".$temp[1]."\t".$temp[2]."\t".$temp[3]."\t".$temp[4]."\t".$temp[5]."\t".$temp[6]."\t".$temp[7]."\tID=$new_gene_name\;Name=$new_gene_name\n";
if (!($SEENLINE{$new_line}))
{
$SEENLINE{$new_line} = 1;
$new_lines = $new_lines.$new_line;
}
}
}
elsif ($feature_type eq 'mRNA')
{
# eg. ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110;Name=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1;_AED=0.05;_eAED=0.05;_QI=0|0|0|1|1|1|6|0|309
$transcript = $temp[8];
@temp2 = split(/ID=/,$transcript);
$transcript = $temp2[1];
@temp2 = split(/\;/,$transcript);
$transcript = $temp2[0]; # eg. maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1
if ($transcript =~ /-mRNA-/)
{
@temp2 = split(/-mRNA-/,$transcript);
$transcript_num= $temp2[1]; # eg. 1
}
else
{
$transcript_num= 1;
}
$transcript = $scaffold."=".$transcript;
if ($GENE->{$transcript}) # WE IGNORE TRANSCRIPTS FOR WHICH WE DON'T KNOW THE GENE NAME, AS IT MEANS THEY ARE NOT WITHIN THE GENE OF INTEREST
{
$gene = $GENE->{$transcript};
if ($gene eq $old_gene_name)
{
$new_transcript_name = $new_gene_name."-mRNA-".$transcript_num;
# GET THE AED SCORE INFORMATION FOR THE TRANSCRIPT:
$transcript = $temp[8];
if ($transcript =~ /AED/) # FOR MAKER
{
@temp2 = split(/\;\_AED/,$transcript);
$aed_score = $temp2[1];
$aed_score = "_AED".$aed_score;
$new_line = $temp[0]."\t".$temp[1]."\t".$temp[2]."\t".$temp[3]."\t".$temp[4]."\t".$temp[5]."\t".$temp[6]."\t".$temp[7]."\tID=$new_transcript_name\;Parent=$new_gene_name\;Name=$new_transcript_name\;$aed_score\n";
}
else
{
$new_line = $temp[0]."\t".$temp[1]."\t".$temp[2]."\t".$temp[3]."\t".$temp[4]."\t".$temp[5]."\t".$temp[6]."\t".$temp[7]."\tID=$new_transcript_name\;Parent=$new_gene_name\;Name=$new_transcript_name\n";
}
if (!($SEENLINE{$new_line}))
{
$SEENLINE{$new_line} = 1;
$new_lines = $new_lines.$new_line;
}
}
}
}
elsif ($feature_type eq 'CDS')
{
# eg. ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:cds;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1
# ID=1:ZK795.3:cds;Parent=1:ZK795.3
$transcript = $temp[8];
@temp2 = split(/Parent=/,$transcript);
$transcript = $temp2[1];
@temp2 = split(/\;/,$transcript);
$transcript = $temp2[0]; # eg. maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1
if ($transcript =~ /-mRNA-/)
{
@temp2 = split(/-mRNA-/,$transcript);
$transcript_num= $temp2[1]; # eg. 1
}
else
{
$transcript_num = 1;
}
$transcript = $scaffold."=".$transcript;
if ($GENE->{$transcript}) # WE IGNORE TRANSCRIPTS FOR WHICH WE DON'T KNOW THE GENE NAME, AS IT MEANS THEY ARE NOT WITHIN THE GENE OF INTEREST
{
$gene = $GENE->{$transcript};
if ($gene eq $old_gene_name)
{
$new_transcript_name = $new_gene_name."-mRNA-".$transcript_num;
$new_line = $temp[0]."\t".$temp[1]."\t".$temp[2]."\t".$temp[3]."\t".$temp[4]."\t".$temp[5]."\t".$temp[6]."\t".$temp[7]."\tID=$new_transcript_name:cds\;Parent=$new_transcript_name\n";
if (!($SEENLINE{$new_line}))
{
$SEENLINE{$new_line} = 1;
$new_lines = $new_lines.$new_line;
}
}
}
}
elsif ($feature_type eq 'exon')
{
# ID=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1:exon:0;Parent=maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1
$exon_start = $temp[3];
$exon_end = $temp[4];
$strand = $temp[6];
$transcript = $temp[8];
@temp2 = split(/Parent=/,$transcript);
$transcript = $temp2[1];
@temp2 = split(/\;/,$transcript);
$transcript = $temp2[0]; # eg. maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1
if ($transcript =~ /\,/) # HAPPENS OCCASIONALLY
{
@temp2 = split(/\,/,$transcript);
for ($j = 0; $j <= $#temp2; $j++)
{
$transcript = $temp2[$j];
if ($transcript =~ /-mRNA-/)
{
@temp3= split(/-mRNA-/,$transcript);
$transcript_num= $temp3[1]; # eg. 1
}
else
{
$transcript_num = 1;
}
# GET THE EXON NUMBER:
if (!($EXONNUM->{$transcript."=".$exon_start."=".$exon_end."=".$strand}))
{
$errorcode = 63; # ERRORCODE=63 (CAN'T TEST FOR, SHOULDN'T HAPPEN)
$errormsg = "ERROR: fix_gff_lines_for_gene: do not know exon number for $transcript $exon_start $exon_end $strand\n";
return($new_lines,$errorcode,$errormsg);
}
$exon_num= $EXONNUM->{$transcript."=".$exon_start."=".$exon_end."=".$strand};
$transcript = $scaffold."=".$transcript;
if ($GENE->{$transcript}) # WE IGNORE TRANSCRIPTS FOR WHICH WE DON'T KNOW THE GENE NAME, AS IT MEANS THEY ARE NOT WITHIN THE GENE OF INTEREST
{
$gene = $GENE->{$transcript};
if ($gene eq $old_gene_name)
{
$new_transcript_name = $new_gene_name."-mRNA-".$transcript_num;
$new_line = $temp[0]."\t".$temp[1]."\t".$temp[2]."\t".$temp[3]."\t".$temp[4]."\t".$temp[5]."\t".$temp[6]."\t".$temp[7]."\tID=$new_transcript_name:exon:$exon_num\;Parent=$new_transcript_name\n";
if (!($SEENLINE{$new_line}))
{
$SEENLINE{$new_line} = 1;
$new_lines = $new_lines.$new_line;
}
}
}
}
}
else
{
if ($transcript =~ /-mRNA-/)
{
@temp3= split(/-mRNA-/,$transcript);
$transcript_num= $temp3[1]; # eg. 1
}
else
{
$transcript_num = 1;
}
# GET THE EXON NUMBER:
if (!($EXONNUM->{$transcript."=".$exon_start."=".$exon_end."=".$strand}))
{
$errorcode = 63; # ERRORCODE=63 (CAN'T TEST FOR, SHOULDN'T HAPPEN)
$errormsg = "ERROR: fix_gff_lines_for_gene: do not know exon number for $transcript $exon_start $exon_end $strand\n";
return($new_lines,$errorcode,$errormsg);
}
$exon_num= $EXONNUM->{$transcript."=".$exon_start."=".$exon_end."=".$strand};
$transcript = $scaffold."=".$transcript;
if ($GENE->{$transcript}) # WE IGNORE TRANSCRIPTS FOR WHICH WE DON'T KNOW THE GENE NAME, AS IT MEANS THEY ARE NOT WITHIN THE GENE OF INTEREST
{
$gene = $GENE->{$transcript};
if ($gene eq $old_gene_name)
{
$new_transcript_name = $new_gene_name."-mRNA-".$transcript_num;
$new_line = $temp[0]."\t".$temp[1]."\t".$temp[2]."\t".$temp[3]."\t".$temp[4]."\t".$temp[5]."\t".$temp[6]."\t".$temp[7]."\tID=$new_transcript_name:exon:$exon_num\;Parent=$new_transcript_name\n";
if (!($SEENLINE{$new_line}))
{
$SEENLINE{$new_line} = 1;
$new_lines = $new_lines.$new_line;
}
}
}
}
}
elsif ($feature_type eq 'five_prime_UTR')
{
# eg. ID=maker-BPAG.scaffold.00151.117925-snap-gene-0.187-mRNA-1:five_prime_utr;Parent=maker-BPAG.scaffold.00151.117925-snap-gene-0.187-mRNA-1
$transcript = $temp[8];
@temp2 = split(/Parent=/,$transcript);
$transcript = $temp2[1];
@temp2 = split(/\;/,$transcript);
$transcript = $temp2[0]; # eg. maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1
if ($transcript =~ /-mRNA-/)
{
@temp2 = split(/-mRNA-/,$transcript);
$transcript_num= $temp2[1]; # eg. 1
}
else
{
$transcript_num = 1;
}
$transcript = $scaffold."=".$transcript;
if ($GENE->{$transcript}) # WE IGNORE TRANSCRIPTS FOR WHICH WE DON'T KNOW THE GENE NAME, AS IT MEANS THEY ARE NOT WITHIN THE GENE OF INTEREST
{
$gene = $GENE->{$transcript};
if ($gene eq $old_gene_name)
{
$new_transcript_name = $new_gene_name."-mRNA-".$transcript_num;
$new_line = $temp[0]."\t".$temp[1]."\t".$temp[2]."\t".$temp[3]."\t".$temp[4]."\t".$temp[5]."\t".$temp[6]."\t".$temp[7]."\tID=$new_transcript_name:five_prime_utr\;Parent=$new_transcript_name\n";
if (!($SEENLINE{$new_line}))
{
$SEENLINE{$new_line} = 1;
$new_lines = $new_lines.$new_line;
}
}
}
}
elsif ($feature_type eq 'three_prime_UTR')
{
# ID=maker-BPAG.contig.05972.1748-augustus-gene-0.1-mRNA-1:three_prime_utr;Parent=maker-BPAG.contig.05972.1748-augustus-gene-0.1-mRNA-1
$transcript = $temp[8];
@temp2 = split(/Parent=/,$transcript);
$transcript = $temp2[1];
@temp2 = split(/\;/,$transcript);
$transcript = $temp2[0]; # eg. maker-BPAG.contig.00154.117101-augustus-gene-0.110-mRNA-1
if ($transcript =~ /-mRNA-/)
{
@temp2 = split(/-mRNA-/,$transcript);
$transcript_num= $temp2[1]; # eg. 1
}
else
{
$transcript_num = 1;
}
$transcript = $scaffold."=".$transcript;
if ($GENE->{$transcript}) # WE IGNORE TRANSCRIPTS FOR WHICH WE DON'T KNOW THE GENE NAME, AS IT MEANS THEY ARE NOT WITHIN THE GENE OF INTEREST
{
$gene = $GENE->{$transcript};
if ($gene eq $old_gene_name)
{
$new_transcript_name = $new_gene_name."-mRNA-".$transcript_num;
$new_line = $temp[0]."\t".$temp[1]."\t".$temp[2]."\t".$temp[3]."\t".$temp[4]."\t".$temp[5]."\t".$temp[6]."\t".$temp[7]."\tID=$new_transcript_name:three_prime_utr\;Parent=$new_transcript_name\n";
if (!($SEENLINE{$new_line}))
{
$SEENLINE{$new_line} = 1;
$new_lines = $new_lines.$new_line;
}
}
}
}
}
else
{
$errormsg = "ERROR: fix_gff_lines_for_gene: feature_type $feature_type line $line\n";
$errorcode = 62; # ERRORCODE=62 (CAN'T TEST FOR, SHOULD BE CAUGHT BY ERRORCODE=60)
return($new_lines,$errorcode,$errormsg);
}
}
# NOTE: THERE MAY NOT BE THE SAME NUMBER OF LINES IN $new_lines AS IN $gff_lines, AS $gff_lines MAY NOT ALL
# CORRESPOND TO THE SAME GENE, WHILE $new_lines WILL.
return($new_lines,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &assign_exon_numbers
sub test_assign_exon_numbers
{
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my %EXONS = (); # HASH TABLE OF EXONS IN TRANSCRIPTS
my $EXONNUM = (); # HASH TABLE OF EXON NUMBERS IN TRANSCRIPTS
$EXONS{"transcript1"} = "transcript1=20=40=+,transcript1=5=10=+,transcript1=60=70=+";
$EXONS{"transcript2"} = "transcript2=20=40=-,transcript2=5=10=-,transcript2=60=70=-";
($EXONNUM,$errorcode,$errormsg) = &assign_exon_numbers(\%EXONS);
if ($errorcode != 0) { print STDERR "ERROR: test_assign_exon_numbers: failed test1 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
if ($EXONNUM->{"transcript1=5=10=+"} != 1) { print STDERR "ERROR: test_assign_exon_numbers: failed test1b\n"; exit;}
if ($EXONNUM->{"transcript1=20=40=+"} != 2) { print STDERR "ERROR: test_assign_exon_numbers: failed test1c\n"; exit;}
if ($EXONNUM->{"transcript1=60=70=+"} != 3) { print STDERR "ERROR: test_assign_exon_numbers: failed test1d\n"; exit;}
if ($EXONNUM->{"transcript2=5=10=-"} != 3) { print STDERR "ERROR: test_assign_exon_numbers: failed test1e\n"; exit;}
if ($EXONNUM->{"transcript2=20=40=-"} != 2) { print STDERR "ERROR: test_assign_exon_numbers: failed test1f\n"; exit;}
if ($EXONNUM->{"transcript2=60=70=-"} != 1) { print STDERR "ERROR: test_assign_exon_numbers: failed test1g\n"; exit;}
# TEST ERRORCODE=51 (REPEATED EXON IN SAME TRANSCRIPT):
%EXONS = ();
$EXONS{"transcript1"} = "transcript1=20=40=+,transcript1=5=10=+,transcript1=20=40=+";
$EXONS{"transcript2"} = "transcript2=20=40=-,transcript2=5=10=-,transcript2=60=70=-";
($EXONNUM,$errorcode,$errormsg) = &assign_exon_numbers(\%EXONS);
if ($errorcode != 51) { print STDERR "ERROR: test_assign_exon_numbers: failed test2 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
# TEST ERRORCODE=52 (EXONS IN SAME TRANSCRIPT WITH DIFFERENT STRANDS):
%EXONS = ();
$EXONS{"transcript1"} = "transcript1=20=40=+,transcript1=5=10=-";
$EXONS{"transcript2"} = "transcript2=20=40=-,transcript2=5=10=-,transcript2=60=70=-";
($EXONNUM,$errorcode,$errormsg) = &assign_exon_numbers(\%EXONS);
if ($errorcode != 52) { print STDERR "ERROR: test_assign_exon_numbers: failed test3 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
# TEST ERRORCODE=55 (STRAND OF AN EXON IS NOT +/-):
%EXONS = ();
$EXONS{"transcript1"} = "transcript1=20=40=A,transcript1=5=10=A,transcript1=50=60=A";
$EXONS{"transcript2"} = "transcript2=20=40=-,transcript2=5=10=-,transcript2=60=70=-";
($EXONNUM,$errorcode,$errormsg) = &assign_exon_numbers(\%EXONS);
if ($errorcode != 55) { print STDERR "ERROR: test_assign_exon_numbers: failed test4 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
}
#------------------------------------------------------------------#
# ASSIGN NUMBERS TO THE EXONS IN THE TRANSCRIPTS:
sub assign_exon_numbers
{
my $EXONS = $_[0]; # HASH TABLE OF EXONS IN TRANSCRIPTS
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my %EXONNUM = (); # HASH TABLE OF EXON NUMBERS IN TRANSCRIPTS
my $transcript; # TRANSCRIPT NAME
my $exons; # EXONS IN A TRANSCRIPT
my @exons; # EXONS IN A TRANSCRIPT
my $exon; # EXON IN A TRANSCRIPT
my $exon_start; # START POSITION OF AN EXON
my @temp; #
my $i; #
my $strand; # STRAND OF THE EXON
my %START = (); # HASH TABLE TO RECORD START POSITIONS OF EXONS
my %STRAND = (); # HASH TABLE TO RECORD STRAND OF A TRANSCRIPT
my $exon_num; # EXON NUMBER ALONG A TRANSCRIPT
my %SEEN = (); # HASH TABLE TO RECORD WHICH EXONS WE HAVE SEEN ALREADY
# FOR EACH TRANSCRIPT, NUMBER THE EXONS ALONG THE TRANSCRIPT FROM 5' TO 3':
foreach $transcript (keys %{$EXONS})
{
$exons = $EXONS->{$transcript};
@exons = split(/\,/,$exons);
# RECORD THE START POSITION AND STRAND:
%START = ();
for ($i = 0; $i <= $#exons; $i++)
{
$exon = $exons[$i];
@temp = split(/=/,$exon);
$exon_start = $temp[1];
$strand = $temp[3];
if ($START{$exon})
{
$errormsg = "ERROR: assign_exon_numbers: already have start for exon $exon in transcript $transcript\n";
$errorcode = 51; # ERRORCODE=51 (TESTED FOR)
return(\%EXONNUM,$errorcode,$errormsg);
}
$START{$exon} = $exon_start;
if ($STRAND{$transcript})
{
if ($STRAND{$transcript} ne $strand)
{
$errormsg = "ERROR: assign_exon_numbers: stored $STRAND{$transcript} as strand of $transcript, not $strand\n";
$errorcode = 52; # ERRORCODE=52 (TESTED FOR)
return(\%EXONNUM,$errorcode,$errormsg);
}
}
else
{
$STRAND{$transcript} = $strand;
}
}
# SORT THE EXONS ALONG THE CHROMOSOME:
@exons = sort { $START{$b} <=> $START{$a} } keys %START;
# FIND THE STRAND OF THE TRANSCRIPT:
if (!($STRAND{$transcript}))
{
$errormsg = "ERROR: assign_exon_numbers: do not know strand of $transcript\n";
$errorcode = 53; # ERRORCODE=53 (SHOULDN'T HAPPEN, CAN'T TEST FOR)
return(\%EXONNUM,$errorcode,$errormsg);
}
$strand = $STRAND{$transcript};
# NUMBER THE EXONS FROM 5' TO 3' ALONG THE TRANSCRIPT:
if ($strand eq '-')
{
$exon_num = 0;
for ($i = 0; $i <= $#exons; $i++)
{
$exon = $exons[$i];
$exon_num++;
if ($EXONNUM{$exon})
{
$errormsg = "ERROR: assign_exon_numbers: already have exon number $EXONNUM{$exon} for exon $exon\n";
$errorcode = 64; # ERRORCODE=64 (SHOULDN'T HAPPEN, CAN'T TEST FOR)
return(\%EXONNUM,$errorcode,$errormsg);
}
$EXONNUM{$exon} = $exon_num;
}
}
elsif ($strand eq '+')
{
$exon_num = 0;
for ($i = $#exons; $i >= 0; $i--)
{
$exon = $exons[$i];
$exon_num++;
if ($EXONNUM{$exon})
{
$errormsg = "ERROR: assign_exon_numbers: already have exon number $EXONNUM{$exon} for exon $exon\n";
$errorcode = 64; # ERRORCODE=64 (SHOULDN'T HAPPEN, CAN'T TEST FOR)
return(\%EXONNUM,$errorcode,$errormsg);
}
$EXONNUM{$exon} = $exon_num;
}
}
else
{
$errormsg = "ERROR: assign_exon_numbers: strand $strand\n";
$errorcode = 55; # ERRORCODE=55 (TESTED FOR)
return(\%EXONNUM,$errorcode,$errormsg);
}
}
return(\%EXONNUM,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &read_genes_for_transcripts
sub test_read_genes_for_transcripts
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN
my $GENE; # HASH TABLE OF GENE NAMES FOR TRANSCRIPTS
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
my $gff_lines; # INPUT GFF LINES
$gff_lines = "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
$gff_lines = $gff_lines."scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
$gff_lines = $gff_lines."scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
$gff_lines = $gff_lines."scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
$gff_lines = $gff_lines."scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
$gff_lines = $gff_lines."scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
$gff_lines = $gff_lines."scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
$gff_lines = $gff_lines."scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
$gff_lines = $gff_lines."scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
$gff_lines = $gff_lines."scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
$gff_lines = $gff_lines."##FASTA\n";
$gff_lines = $gff_lines.">seq1\n";
$gff_lines = $gff_lines."ACAACACAGATAGATATATAGAGCA\n";
close(GFF);
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($gff_lines);
if ($errorcode != 0) { print STDERR "ERROR: test_read_genes_for_transcripst: failed test1\n"; exit;}
if ($GENE->{"scaff_000010=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1"} ne "augustus_masked-scaff_000010-processed-gene-0.3") { print STDERR "ERROR: test_read_genes_for_transcripts: failed test1b\n"; exit;}
if ($GENE->{"scaff_000010=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1"} ne "augustus_masked-scaff_000010-processed-gene-0.4") { print STDERR "ERROR: test_read_genes_for_transcripts: failed test1c\n"; exit;}
# TEST ERRORCODE=9 (mRNA APPEARS TWICE IN GFF):
$gff_lines = "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
$gff_lines = $gff_lines."scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
$gff_lines = $gff_lines."scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
$gff_lines = $gff_lines."scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
$gff_lines = $gff_lines."scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
$gff_lines = $gff_lines."scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
$gff_lines = $gff_lines."scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
$gff_lines = $gff_lines."scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
$gff_lines = $gff_lines."scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
$gff_lines = $gff_lines."scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
$gff_lines = $gff_lines."##FASTA\n";
$gff_lines = $gff_lines.">seq1\n";
$gff_lines = $gff_lines."ACAACACAGATAGATATATAGAGCA\n";
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($gff_lines);
if ($errorcode != 9) { print STDERR "ERROR: test_read_genes_for_transcripts: failed test2\n"; exit;}
# TEST ERRORCODE=3 (LINE DOES NOT HAVE 9 COLUMNS):
$gff_lines = "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
$gff_lines = $gff_lines."scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
$gff_lines = $gff_lines."scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
$gff_lines = $gff_lines."scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
$gff_lines = $gff_lines."scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
$gff_lines = $gff_lines."scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
$gff_lines = $gff_lines."scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
$gff_lines = $gff_lines."scaff_000010\tmaker\tCDS\t29000\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
$gff_lines = $gff_lines."scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
$gff_lines = $gff_lines."scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
$gff_lines = $gff_lines."##FASTA\n";
$gff_lines = $gff_lines.">seq1\n";
$gff_lines = $gff_lines."ACAACACAGATAGATATATAGAGCA\n";
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($gff_lines);
if ($errorcode != 3) { print STDERR "ERROR: test_read_genes_for_transcripts: failed test3\n"; exit;}
}
#------------------------------------------------------------------#
# READ IN THE GENE NAME FOR EACH TRANSCRIPT IN THE INPUT GFF FILE:
sub read_genes_for_transcripts
{
my $gff_lines = $_[0]; # INPUT GFF LINES
my $errorcode = 0; # THIS IS RETURNED AS 0 IF NO ERROR OCCURRED.
my $errormsg = "none";# THIS IS RETURNED AS 'none' IF NO ERROR OCCURRED.
my %GENE = (); # HASH TABLE OF GENE NAMES FOR TRANSCRIPT NAMES.
my $line; #
my @temp; #
my $end_of_gff = 0; # SAYS WHETHER THE END OF THE GFF FILE HAS BEEN REACHED
my $name; # NAME OF mRNA
my $gene; # NAME OF GENE
my @gff_lines; # INPUT GFF LINES
my $i; #
my $scaffold; #
@gff_lines = split(/\n+/,$gff_lines);
for ($i = 0; $i <= $#gff_lines; $i++)
{
$line = $gff_lines[$i];
chomp $line;
if ($line eq '##FASTA') { $end_of_gff = 1;}
if (substr($line,0,1) ne '#' && $end_of_gff == 0)
{
@temp = split(/\t+/,$line);
if ($#temp == 8)
{
if ($temp[2] eq 'mRNA')
{
# eg. scaff_000010 maker mRNA 88793 89828 . + . ID=maker-scaff_000010-augustus-gene-0.21-mRNA-1;Parent=maker-scaff_000010-augustus-gene-0.21;...
$scaffold = $temp[0];
$name = $temp[8];
$gene = $temp[8];
@temp = split(/ID=/,$name);
$name = $temp[1]; # eg. maker-scaff_000010-augustus-gene-0.21-mRNA-1;...
@temp = split(/\;/,$name);
$name = $temp[0]; # eg. maker-scaff_000010-augustus-gene-0.21-mRNA-1
# GET THE NAME OF THE GENE:
@temp = split(/Parent=/,$gene);
$gene = $temp[1]; # eg. maker-scaff_000010-augustus-gene-0.21;...
@temp = split(/\;/,$gene);
$gene = $temp[0]; # eg. maker-scaff_000010-augustus-gene-0.21
$name = $scaffold."=".$name;
# RECORD THE GENE NAME FOR THE mRNA:
if ($GENE{$name})
{
if ($GENE{$name} ne $gene)
{
$errormsg= "ERROR: read_genes_for_transcripts: already have gene name $GENE{$name} for $name (scaffold $scaffold)\n";
$errorcode= 9; # ERRORCODE=9 (TESTED FOR)
return(\%GENE,$errorcode,$errormsg);
}
}
$GENE{$name} = $gene;
}
}
else
{
$errormsg = "ERROR: read_genes_for_transcripts: line does not have 9 columns: $line\n";
$errorcode = 3; # ERRORCODE=3 (TESTED FOR)
return(\%GENE,$errorcode,$errormsg);
}
}
}
return(\%GENE,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# SUBROUTINE TO MAKE A FILE NAME FOR A TEMPORARY FILE:
sub make_filename
{
my $outputdir = $_[0]; # OUTPUT DIRECTORY TO WRITE TEMPORARY FILE NAME TO
my $found_name = 0; # SAYS WHETHER WE HAVE FOUND A FILE NAME YET
my $filename = "none";# NEW FILE NAME TO USE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $poss_filename; # POSSIBLE FILE NAME TO USE
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAME
while($found_name == 0)
{
$random_number = rand();
$poss_filename = $outputdir."/tmp".$random_number;
if (!(-e $poss_filename))
{
$filename = $poss_filename;
$found_name = 1;
}
}
if ($found_name == 0 || $filename eq 'none')
{
$errormsg = "ERROR: make_filename: found_name $found_name filename $filename\n";
$errorcode = 6; # ERRORCODE=6
return($filename,$errorcode,$errormsg);
}
return($filename,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &print_error
sub test_print_error
{
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR
($errormsg,$errorcode) = &print_error(45,45,1);
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message','My error message',1);
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;}
($errormsg,$errorcode) = &print_error('none',45,1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message', 0, 1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;}
}
#------------------------------------------------------------------#
# PRINT OUT AN ERROR MESSAGE AND EXIT.
sub print_error
{
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED.
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED.
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/)
{
if ($called_from_test == 1)
{
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n";
exit;
}
}
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/))
{
if ($called_from_test == 1)
{
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n";
exit;
}
}
if ($errormsg eq 'none' || $errorcode == 0)
{
if ($called_from_test == 1)
{
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n";
exit;
}
}
else
{
print STDERR "$errormsg";
exit;
}
return($errormsg,$errorcode);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment