Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created August 27, 2013 14:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/6354055 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/6354055 to your computer and use it in GitHub Desktop.
Perl script that removes genes that are only supported by 'model_gff' features from a maker gff file.
#!/usr/bin/env perl
=head1 NAME
remove_modelgff_genes_from_gff.pl
=head1 SYNOPSIS
remove_modelgff_genes_from_gff.pl input_gff output_gff outputdir
where input_gff is the input gff file,
output_gff is the output gff file,
outputdir is the output directory for writing output files.
=head1 DESCRIPTION
This script takes an input gff file (<input_gff>), and removes genes that
are only supported by 'model_gff' entries (from maker), and writes the output
gff file (<output_gff>) in directory <outputdir>.
=head1 VERSION
Perl script last edited 1-May-2013.
=head1 CONTACT
alc@sanger.ac.uk (Avril Coghlan)
=cut
#
# Perl script remove_modelgff_genes_from_gff.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 1-May-13.
# Last edited 1-May-2013.
# SCRIPT SYNOPSIS: remove_modelgff_genes_from_gff.pl: removes genes that are only supported by 'model_gff' features from a maker gff file.
#
# NOTE: ELEANOR SAID THAT WE WANT TO KEEP THE GENE MODEL IF IT HAS ANY ONE OR MORE OF THE FOLLOWING AS SUPPORT:
# (a) Genemark, (b) Augustus, (c) SNAP, (d) pred_gff
# HOW TO RECOGNISE THESE:
# (i) IF THERE IS genemark/snap/pred_gff IN THE GENE NAME, KEEP THE GENE
# (ii) IF THERE IS blastx/exonerate IN THE GENE NAME, AND THE GENE OVERLAPS AN augustus*
# OR pred_gff:embl OR pred_gff:protein_coding FEATURE, KEEP THE GENE
# (iii) IF THERE IS augustus/blastx/exonerate IN THE GENE NAME, AND THE GENE DOESN'T OVERLAP AN
# augustus* OR pred_gff:embl OR pred_gff:protein_coding FEATURE, DISCARD THE GENE
#
# THIS IS WHAT THE SCRIPT DOES:
# 1) IT CHECKS IF A GENE OVERLAPS A pred_gff:embl OR pred_gff:protein_coding OR augustus* (augustus OR augustus_masked) FEATURE
# 2) IF IT DOESN'T, IT CHECKS WHETHER THE GENE NAME CONTAINS snap/genemark/pred_gff
# - IF THE GENE NAME DOES CONTAIN snap/genemark/pred_gff, KEEP THE GENE
# - IF THE GENE NAME DOESN'T CONTAIN snap/genemark/pred_gff, CHECK IF THE GENE NAME CONTAINS blastx/exonerate/augustus
# 3) IF THE GENE NAME DOESN'T CONTAIN blastx/exonerate/augustus, PRINT AN ERROR MESSAGE AND DIE (AS WE THINK THIS SHOULDN'T HAPPEN)
# 4) IF THE GENE NAME DOES CONTAIN blastx/exonerate/augustus, DELETE THE GENE FROM THE OUTPUT GFF
#
# NOTE: pred_gff COMES FROM genblastg OR RATT.
# NOTE: IT'S NECESSARY TO RUN rename_genes_in_maker_gff.pl AFTER RUNNING THIS SCRIPT.
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
use strict;
use warnings;
my $num_args = $#ARGV + 1;
if ($num_args != 3)
{
print "Usage of remove_modelgff_genes_from_gff.pl\n\n";
print "perl remove_modelgff_genes_from_gff.pl <input_gff> <output_gff> <outputdir>\n";
print "where <input_gff> is the input gff file,\n";
print " <output_gff> is the output gff file,\n";
print " <outputdir> is the output directory for writing output files.\n";
print "For example, >perl remove_modelgff_genes_from_gff.pl round3.v2.gff new_round3.v2.gff\n";
print "/lustre/scratch108/parasites/alc/\n";
print "NOTE: Do *NOT* run rename_genes_in_maker_gff.pl before running this script!\n";
exit;
}
# FIND THE PATH TO THE INPUT GFF FILE:
my $input_gff = $ARGV[0];
# FIND THE PATH TO THE OUTPUT GFF FILE:
my $output_gff = $ARGV[1];
# FIND THE DIRECTORY TO USE FOR OUTPUT FILES:
my $outputdir = $ARGV[2];
#------------------------------------------------------------------#
# TEST SUBROUTINES:
my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING.
&test_print_error;
&test_make_gene_gff($outputdir);
&test_make_features_gff($outputdir);
&test_find_genes_to_discard($outputdir);
&test_read_genes_for_transcripts($outputdir);
&test_write_output_gff($outputdir);
&test_check_whether_to_discard;
&test_run_main_program($outputdir);
print STDERR "Finished tests, running main code now...\n";
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
&run_main_program($outputdir,$input_gff,$output_gff);
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
sub run_main_program
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN.
my $input_gff = $_[1]; # THE INPUT GFF FILE
my $output_gff = $_[2]; # THE OUTPUT GFF FILE
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR
my $gene_gff; # GFF FILE FOR GENES
my $features_gff; # GFF FILE WITH NON-GENE FEATURES
my $DISCARD; # HASH TABLE OF GENES TO DISCARD
my $GENE; # HASH TABLE OF GENE NAMES FOR TRANSCRIPTS
# READ IN THE GENE NAME FOR EACH TRANSCRIPT IN THE INPUT GFF FILE:
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# MAKE A GFF FILE WITH JUST THE GENES:
($gene_gff,$errorcode,$errormsg) = &make_gene_gff($input_gff,$outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# MAKE A GFF FILE WITH NON-GENE FEATURES: (SPECIFICALLY pred_gff:embl, pred_gff:protein_coding & augustus*)
($features_gff,$errorcode,$errormsg) = &make_features_gff($input_gff,$outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# FIND GENES THAT DO NOT OVERLAP ANY OF THE NON-GENE FEATURES:
($DISCARD,$errorcode,$errormsg) = &find_genes_to_discard($outputdir,$gene_gff,$features_gff,0);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# READ IN THE INPUT GFF FILE, AND MAKE THE OUTPUT GFF FILE WITH GENES THAT ARE FROM snap/pred_gff/genemark, OR SUPPORTED BY augustus*/pred_gff:
($errorcode,$errormsg) = &write_output_gff($outputdir,$input_gff,$output_gff,$DISCARD,$GENE);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# DELETE TEMPORARY FILES:
system "rm -f $gene_gff";
system "rm -f $features_gff";
}
#------------------------------------------------------------------#
# TEST &run_main_program
sub test_run_main_program
{
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES INTO
my $input_gff; # INPUT GFF FILE
my $output_gff; # OUTPUT GFF FILE
my $errorcode; # RETURNED FROM A FUNCTION AS 0 IF THERE IS NO ERROR
my $errormsg; # RETURNED FROM A FUNCTION AS 'none' IF THERE IS NO ERROR
my $expected_output_gff; # FILE CONTAINING EXPECTED CONTENTS OF $output_gff
my $differences; # DIFFERENCES BETWEEN $output_gff AND $expected_output_gff
my $length_differences; # LENGTH OF $differences
my $line; #
# EXAMPLE WHERE WE SHOULD DELETE A GENE, ALSO WHERE START POSITION IS 0 IN MAKER FILE:
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_run_main_program: cannot open $input_gff\n";
print INPUT_GFF "BPAG.contig.04971.2025 maker gene 0 1613 . - . ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0;Name=maker-BPAG.contig.04971.2025-augustus-gene-0.0\n";
print INPUT_GFF "BPAG.contig.04971.2025 maker mRNA 0 1613 . - . ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0;Name=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1;_AED=0.30;_eAED=0.30;_QI=0|0|0|1|0|0|5|1|157\n";
print INPUT_GFF "BPAG.contig.04971.2025 maker exon 0 120 . - . ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1:exon:137;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.04971.2025 maker exon 348 480 . - . ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1:exon:136;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.04971.2025 maker exon 608 653 . - . ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1:exon:135;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.04971.2025 maker exon 778 855 . - . ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1:exon:134;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.04971.2025 maker exon 1516 1613 . - . ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1:exon:133;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.04971.2025 maker three_prime_UTR 0 1 . - . ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1:three_prime_utr;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.04971.2025 maker CDS 1516 1613 . - 2 ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1:cds;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.04971.2025 maker CDS 778 855 . - 2 ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1:cds;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.04971.2025 maker CDS 608 653 . - 0 ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1:cds;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.04971.2025 maker CDS 348 480 . - 1 ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1:cds;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.04971.2025 maker CDS 2 120 . - 0 ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1:cds;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1\n";
close(INPUT_GFF);
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
&run_main_program($outputdir,$input_gff,$output_gff);
($expected_output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_output_gff") || die "ERROR: test_run_main_program: cannot open $expected_output_gff\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $output_gff $expected_output_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_run_main_program: failed test1 (output_gff $output_gff expected_output_gff $expected_output_gff)\n"; exit;}
system "rm -f $output_gff";
system "rm -f $expected_output_gff";
system "rm -f $input_gff";
# EXAMPLE WHERE WE SHOULD NOT DELETE A GENE, ALSO WHERE START POSITION IS 0 OR -1 IN MAKER FILE:
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_run_main_program: cannot open $input_gff\n";
print INPUT_GFF "BPAG.contig.07548.1438 maker gene -1 1267 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0;Name=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0\n";
print INPUT_GFF "BPAG.contig.07548.1438 maker mRNA -1 1267 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0;Name=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1;_AED=1.00;_eAED=1.00;_QI=0|0|0|0|1|1|4|0|197\n";
print INPUT_GFF "BPAG.contig.07548.1438 maker exon -1 170 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:exon:117;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.07548.1438 maker exon 441 605 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:exon:116;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.07548.1438 maker exon 951 1167 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:exon:115;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.07548.1438 maker exon 1226 1267 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:exon:114;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.07548.1438 maker three_prime_UTR -1 0 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:three_prime_utr;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.07548.1438 maker CDS 1226 1267 . - 0 ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:cds;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.07548.1438 maker CDS 951 1167 . - 1 ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:cds;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.07548.1438 maker CDS 441 605 . - 1 ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:cds;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.07548.1438 maker CDS 1 170 . - 0 ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:cds;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.07548.1438 augustus_masked match 19 1267 0.51 - . ID=BPAG.contig.07548.1438:hit:573:3.5.0.0;Name=augustus_masked-BPAG.contig.07548.1438-abinit-gene-0.0-mRNA-1\n";
print INPUT_GFF "BPAG.contig.07548.1438 augustus_masked match_part 1226 1267 0.51 - . ID=BPAG.contig.07548.1438:hsp:749:3.5.0.0;Parent=BPAG.contig.07548.1438:hit:573:3.5.0.0;Target=augustus_masked-BPAG.contig.07548.1438-abinit-gene-0.0-mRNA-1 535 576 +;Gap=M42\n";
print INPUT_GFF "BPAG.contig.07548.1438 augustus_masked match_part 951 1167 0.51 - . ID=BPAG.contig.07548.1438:hsp:750:3.5.0.0;Parent=BPAG.contig.07548.1438:hit:573:3.5.0.0;Target=augustus_masked-BPAG.contig.07548.1438-abinit-gene-0.0-mRNA-1 318 534 +;Gap=M217\n";
print INPUT_GFF "BPAG.contig.07548.1438 augustus_masked match_part 441 605 0.51 - . ID=BPAG.contig.07548.1438:hsp:751:3.5.0.0;Parent=BPAG.contig.07548.1438:hit:573:3.5.0.0;Target=augustus_masked-BPAG.contig.07548.1438-abinit-gene-0.0-mRNA-1 153 317 +;Gap=M165\n";
print INPUT_GFF "BPAG.contig.07548.1438 augustus_masked match_part 19 170 0.51 - . ID=BPAG.contig.07548.1438:hsp:752:3.5.0.0;Parent=BPAG.contig.07548.1438:hit:573:3.5.0.0;Target=augustus_masked-BPAG.contig.07548.1438-abinit-gene-0.0-mRNA-1 1 152 +;Gap=M152\n";
close(INPUT_GFF);
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
&run_main_program($outputdir,$input_gff,$output_gff);
($expected_output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_output_gff") || die "ERROR: test_run_main_program: cannot open $expected_output_gff\n";
print EXPECTED "BPAG.contig.07548.1438 maker gene 1 1267 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0;Name=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0\n";
print EXPECTED "BPAG.contig.07548.1438 maker mRNA 1 1267 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0;Name=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1;_AED=1.00;_eAED=1.00;_QI=0|0|0|0|1|1|4|0|197\n";
print EXPECTED "BPAG.contig.07548.1438 maker exon 1 170 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:exon:117;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n";
print EXPECTED "BPAG.contig.07548.1438 maker exon 441 605 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:exon:116;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n";
print EXPECTED "BPAG.contig.07548.1438 maker exon 951 1167 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:exon:115;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n";
print EXPECTED "BPAG.contig.07548.1438 maker exon 1226 1267 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:exon:114;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n";
print EXPECTED "BPAG.contig.07548.1438 maker three_prime_UTR 1 0 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:three_prime_utr;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n";
print EXPECTED "BPAG.contig.07548.1438 maker CDS 1226 1267 . - 0 ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:cds;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n";
print EXPECTED "BPAG.contig.07548.1438 maker CDS 951 1167 . - 1 ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:cds;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n";
print EXPECTED "BPAG.contig.07548.1438 maker CDS 441 605 . - 1 ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:cds;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n";
print EXPECTED "BPAG.contig.07548.1438 maker CDS 1 170 . - 0 ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:cds;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n";
print EXPECTED "BPAG.contig.07548.1438 augustus_masked match 19 1267 0.51 - . ID=BPAG.contig.07548.1438:hit:573:3.5.0.0;Name=augustus_masked-BPAG.contig.07548.1438-abinit-gene-0.0-mRNA-1\n";
print EXPECTED "BPAG.contig.07548.1438 augustus_masked match_part 1226 1267 0.51 - . ID=BPAG.contig.07548.1438:hsp:749:3.5.0.0;Parent=BPAG.contig.07548.1438:hit:573:3.5.0.0;Target=augustus_masked-BPAG.contig.07548.1438-abinit-gene-0.0-mRNA-1 535 576 +;Gap=M42\n";
print EXPECTED "BPAG.contig.07548.1438 augustus_masked match_part 951 1167 0.51 - . ID=BPAG.contig.07548.1438:hsp:750:3.5.0.0;Parent=BPAG.contig.07548.1438:hit:573:3.5.0.0;Target=augustus_masked-BPAG.contig.07548.1438-abinit-gene-0.0-mRNA-1 318 534 +;Gap=M217\n";
print EXPECTED "BPAG.contig.07548.1438 augustus_masked match_part 441 605 0.51 - . ID=BPAG.contig.07548.1438:hsp:751:3.5.0.0;Parent=BPAG.contig.07548.1438:hit:573:3.5.0.0;Target=augustus_masked-BPAG.contig.07548.1438-abinit-gene-0.0-mRNA-1 153 317 +;Gap=M165\n";
print EXPECTED "BPAG.contig.07548.1438 augustus_masked match_part 19 170 0.51 - . ID=BPAG.contig.07548.1438:hsp:752:3.5.0.0;Parent=BPAG.contig.07548.1438:hit:573:3.5.0.0;Target=augustus_masked-BPAG.contig.07548.1438-abinit-gene-0.0-mRNA-1 1 152 +;Gap=M152\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $output_gff $expected_output_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_run_main_program: failed test2 (output_gff $output_gff expected_output_gff $expected_output_gff)\n"; exit;}
system "rm -f $output_gff";
system "rm -f $expected_output_gff";
system "rm -f $input_gff";
}
#------------------------------------------------------------------#
# TEST &find_genes_to_discard
sub test_find_genes_to_discard
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
my $gff; # INPUT GFF FILE
my $gene_gff; # GFF FILE WITH JUST THE GENES
my $features_gff; # GFF FILE WITH JUST THE FEATURES
my $DISCARD; # HASH TABLE OF GENES TO DISCARD #
# EXAMPLE WITH TWO GENES TO KEEP, THAT HAVE 'snap' IN THEIR NAMES:
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n";
print GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=snap-scaff_000010-processed-gene-0.3;Name=snap-scaff_000010-processed-gene-0.3\n";
print GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=snap-scaff_000010-processed-gene-0.3-mRNA-1;Parent=snap-scaff_000010-processed-gene-0.3;Name=snap-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=snap-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=snap-scaff_000010-processed-gene-0.3-mRNA-1\n";
print GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=snap-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=snap-scaff_000010-processed-gene-0.3-mRNA-1\n";
print GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=snap-scaff_000010-processed-gene-0.4;Name=snap-scaff_000010-processed-gene-0.4\n";
print GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=snap-scaff_000010-processed-gene-0.4-mRNA-1;Parent=snap-scaff_000010-processed-gene-0.4;Name=snap-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=snap-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=snap-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=snap-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=snap-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print GFF "scaff_000010\tprotein2genome\tprotein_match\t339754\t340801\t478\t+\t.\tID=scaff_000010:hit:2470:2.10.0.3;Name=tr|G0MIG4|G0MIG4_CAEBE\n";
print GFF "scaff_000001\taugustus_masked\tmatch\t3010038\t3012310\t0.64\t+\t.\tID=scaff_000001:hit:48921:3.5.0.30;Name=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1\n";
print GFF "scaff_000001\taugustus_masked\tmatch_part\t3010038\t3010098\t0.64\t+\t.\tID=scaff_000001:hsp:87184:3.5.0.30;Parent=scaff_000001:hit:48921:3.5.0.30;Target=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1 1 61 +;Gap=M61\n";
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch\t1590\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1;score=1000\n";
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch_part\t2498\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38715:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1 859 936 +;Gap=M78\n";
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch\t1590\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1\n";
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch_part\t2494\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38793:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1 1 82 +;Gap=M82\n";
print GFF "##FASTA\n";
print GFF ">seq1\n";
print GFF "ACAACACAGATAGATATATAGAGCA\n";
close(GFF);
# MAKE A GFF FILE WITH JUST THE GENES:
($gene_gff,$errorcode,$errormsg) = &make_gene_gff($gff,$outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# MAKE A GFF FILE WITH NON-GENE FEATURES: (SPECIFICALLY pred_gff:embl, pred_gff:protein_coding & augustus*)
($features_gff,$errorcode,$errormsg) = &make_features_gff($gff,$outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# FIND GENES THAT DO NOT OVERLAP ANY OF THE NON-GENE FEATURES:
($DISCARD,$errorcode,$errormsg) = &find_genes_to_discard($outputdir,$gene_gff,$features_gff,1);
if ($errorcode != 0) { print STDERR "ERROR: test_find_genes_to_discard: failed test1 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
if (keys %{$DISCARD} > 0) { print STDERR "ERROR: test_find_genes_to_discard: failed test1b\n"; exit;}
system "rm -f $gff";
system "rm -f $gene_gff";
system "rm -f $features_gff";
# EXAMPLE WITH TWO GENES TO DISCARD, THAT DO NOT HAVE snap/genemark/pred_gff IN THEIR NAMES AND DO NOT OVERLAP ANY pred_gff/augustus* FEATURE:
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n";
print GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=exonerate-scaff_000010-processed-gene-0.3;Name=exonerate-scaff_000010-processed-gene-0.3\n";
print GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=exonerate-scaff_000010-processed-gene-0.3-mRNA-1;Parent=exonerate-scaff_000010-processed-gene-0.3;Name=exonerate-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=exonerate-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=exonerate-scaff_000010-processed-gene-0.3-mRNA-1\n";
print GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=exonerate-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=exonerate-scaff_000010-processed-gene-0.3-mRNA-1\n";
print GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=exonerate-scaff_000010-processed-gene-0.4;Name=exonerate-scaff_000010-processed-gene-0.4\n";
print GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=exonerate-scaff_000010-processed-gene-0.4-mRNA-1;Parent=exonerate-scaff_000010-processed-gene-0.4;Name=exonerate-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=exonerate-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=exonerate-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=exonerate-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=exonerate-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=exonerate-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=exonerate-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print GFF "scaff_000010\tprotein2genome\tprotein_match\t339754\t340801\t478\t+\t.\tID=scaff_000010:hit:2470:2.10.0.3;Name=tr|G0MIG4|G0MIG4_CAEBE\n";
print GFF "##FASTA\n";
print GFF ">seq1\n";
print GFF "ACAACACAGATAGATATATAGAGCA\n";
close(GFF);
# MAKE A GFF FILE WITH JUST THE GENES:
($gene_gff,$errorcode,$errormsg) = &make_gene_gff($gff,$outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# MAKE A GFF FILE WITH NON-GENE FEATURES: (SPECIFICALLY pred_gff:embl, pred_gff:protein_coding & augustus*)
($features_gff,$errorcode,$errormsg) = &make_features_gff($gff,$outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# FIND GENES THAT DO NOT OVERLAP ANY OF THE NON-GENE FEATURES:
($DISCARD,$errorcode,$errormsg) = &find_genes_to_discard($outputdir,$gene_gff,$features_gff,1);
if ($errorcode != 0) { print STDERR "ERROR: test_find_genes_to_discard: failed test2 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
if (!($DISCARD->{"exonerate-scaff_000010-processed-gene-0.3"})) { print STDERR "ERROR: test_find_genes_to_discard: failed test2b\n"; exit;}
if (!($DISCARD->{"exonerate-scaff_000010-processed-gene-0.4"})) { print STDERR "ERROR: test_find_genes_to_discard: failed test2c\n"; exit;}
system "rm -f $gff";
system "rm -f $gene_gff";
system "rm -f $features_gff";
# TEST ERRORCODE=100: (GENE TO DISCARD DOES NOT HAVE exonerate/blastx IN ITS NAME):
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n";
print GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=exonerate-scaff_000010-processed-gene-0.3;Name=exonerate-scaff_000010-processed-gene-0.3\n";
print GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=exonerate-scaff_000010-processed-gene-0.3-mRNA-1;Parent=exonerate-scaff_000010-processed-gene-0.3;Name=exonerate-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=exonerate-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=exonerate-scaff_000010-processed-gene-0.3-mRNA-1\n";
print GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=exonerate-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=exonerate-scaff_000010-processed-gene-0.3-mRNA-1\n";
print GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=auugustus_masked-scaff_000010-processed-gene-0.4;Name=auugustus_masked-scaff_000010-processed-gene-0.4\n";
print GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=auugustus_masked-scaff_000010-processed-gene-0.4;Name=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print GFF "scaff_000010\tprotein2genome\tprotein_match\t339754\t340801\t478\t+\t.\tID=scaff_000010:hit:2470:2.10.0.3;Name=tr|G0MIG4|G0MIG4_CAEBE\n";
print GFF "##FASTA\n";
print GFF ">seq1\n";
print GFF "ACAACACAGATAGATATATAGAGCA\n";
close(GFF);
# MAKE A GFF FILE WITH JUST THE GENES:
($gene_gff,$errorcode,$errormsg) = &make_gene_gff($gff,$outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# MAKE A GFF FILE WITH NON-GENE FEATURES: (SPECIFICALLY pred_gff:embl, pred_gff:protein_coding & augustus*)
($features_gff,$errorcode,$errormsg) = &make_features_gff($gff,$outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# FIND GENES THAT DO NOT OVERLAP ANY OF THE NON-GENE FEATURES:
($DISCARD,$errorcode,$errormsg) = &find_genes_to_discard($outputdir,$gene_gff,$features_gff,1);
if ($errorcode != 100) { print STDERR "ERROR: test_find_genes_to_discard: failed test2d (errorcode $errorcode errormsg $errormsg)\n"; exit;}
system "rm -f $gff";
system "rm -f $gene_gff";
system "rm -f $features_gff";
# EXAMPLE WITH TWO GENES TO KEEP, THAT DO NOT HAVE snap/genemark/pred_gff IN THEIR NAMES BUT OVERLAP ANY pred_gff/augustus* FEATURE:
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n";
print GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=auugustus_masked-scaff_000010-processed-gene-0.3;Name=auugustus_masked-scaff_000010-processed-gene-0.3\n";
print GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=auugustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=auugustus_masked-scaff_000010-processed-gene-0.3;Name=auugustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=auugustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=auugustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=auugustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=auugustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=auugustus_masked-scaff_000010-processed-gene-0.4;Name=auugustus_masked-scaff_000010-processed-gene-0.4\n";
print GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=auugustus_masked-scaff_000010-processed-gene-0.4;Name=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print GFF "scaff_000010\tprotein2genome\tprotein_match\t339754\t340801\t478\t+\t.\tID=scaff_000010:hit:2470:2.10.0.3;Name=tr|G0MIG4|G0MIG4_CAEBE\n";
print GFF "scaff_000010\taugustus_masked\tmatch\t20477\t21300\t0.64\t+\t.\tID=scaff_000001:hit:48921:3.5.0.30;Name=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1\n";
print GFF "scaff_000010\taugustus_masked\tmatch\t29010\t29020\t0.64\t+\t.\tID=scaff_000001:hit:48921:3.5.0.30;Name=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1\n";
print GFF "##FASTA\n";
print GFF ">seq1\n";
print GFF "ACAACACAGATAGATATATAGAGCA\n";
close(GFF);
# MAKE A GFF FILE WITH JUST THE GENES:
($gene_gff,$errorcode,$errormsg) = &make_gene_gff($gff,$outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# MAKE A GFF FILE WITH NON-GENE FEATURES: (SPECIFICALLY pred_gff:embl, pred_gff:protein_coding & augustus*)
($features_gff,$errorcode,$errormsg) = &make_features_gff($gff,$outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# FIND GENES THAT DO NOT OVERLAP ANY OF THE NON-GENE FEATURES:
($DISCARD,$errorcode,$errormsg) = &find_genes_to_discard($outputdir,$gene_gff,$features_gff,1);
if ($errorcode != 0) { print STDERR "ERROR: test_find_genes_to_discard: failed test3 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
if ($DISCARD->{"auugustus_masked-scaff_000010-processed-gene-0.3"}) { print STDERR "ERROR: test_find_genes_to_discard: failed test3b\n"; exit;}
if ($DISCARD->{"auugustus_masked-scaff_000010-processed-gene-0.4"}) { print STDERR "ERROR: test_find_genes_to_discard: failed test3c\n"; exit;}
system "rm -f $gff";
system "rm -f $gene_gff";
system "rm -f $features_gff";
# TEST ERRORCODE=101 (BEDTOOLS COULD NOT RUN PROPERLY):
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n";
print GFF "BPAG.contig.04966.2027 maker gene 73 1002 . - . ID=maker-BPAG.contig.04966.2027-snap-gene-0.6;Name=maker-BPAG.contig.04966.2027-snap-gene-0.6\n";
print GFF "BPAG.contig.04967.2027 maker gene 1129 1876 . - . ID=genemark-BPAG.contig.04967.2027-processed-gene-0.0;Name=genemark-BPAG.contig.04967.2027-processed-gene-0.0\n";
print GFF "BPAG.contig.04971.2025 maker gene 1675 1797 . + . ID=maker-BPAG.contig.04971.2025-exonerate_protein2genome-gene-0.2;Name=maker-BPAG.contig.04971.2025-exonerate_protein2genome-gene-0.2\n";
print GFF "BPAG.contig.04971.2025 maker gene 0 1613 . - . ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0;Name=maker-BPAG.contig.04971.2025-augustus-gene-0.0\n";
print GFF "##FASTA\n";
print GFF ">seq1\n";
print GFF "ACAACACAGATAGATATATAGAGCA\n";
close(GFF);
# RUN &find_genes_to_discard:
($DISCARD,$errorcode,$errormsg) = &find_genes_to_discard($outputdir,$gff,$gff,1);
if ($errorcode != 101) { print STDERR "ERROR: test_find_genes_to_discard: failed test4 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
system "rm -f $gff";
system "rm -f $gene_gff";
system "rm -f $features_gff";
}
#------------------------------------------------------------------#
# FIND GENES THAT DO NOT OVERLAP ANY OF THE NON-GENE FEATURES:
sub find_genes_to_discard
{
my $outputdir = $_[0]; ## DIRECTORY TO PUT OUTPUT FILES IN
my $gene_gff = $_[1]; ## GENE GFF FILE
my $features_gff = $_[2]; ## FEATURES GFF FILE
my $testing = $_[3]; ## SAYS WHETHER THIS IS CALLED FROM A TESTING SUBROUTINE
my $cmd; ## COMMAND TO RUN
my $output; ## BEDTOOLS OUTPUT FILE
my $line; ##
my @temp; ##
my $name; ##
my %DISCARD = (); ## HASH TABLE OF GENES TO DISCARD
my $errorcode = 0; ## RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";## RETURNED AS 'none' IF THERE IS NO ERROR
my %KEEP = (); ## HASH TABLE OF GENES TO KEEP
my $features_gff_len = 0; ## NUMBER OF LINES IN THE $features_gff FILE
my $scaffold; ## SCAFFOLD THAT A GENE LIES ON
my $gene_start; ## START OF A GENE
my $gene_end; ## END OF A GENE
my $gene_name; ## GENE NAME
my $systemcall; ## RETURN VALUE FROM SYSTEM CALL
# CHECK IF THE $features_gff FILE HAS SOME LINES IN IT:
open(TEMP,"wc -l $features_gff |");
while(<TEMP>)
{
$line = $_;
chomp $line;
@temp = split(/\s+/,$line);
$features_gff_len = $temp[0];
}
close(TEMP);
if ($features_gff_len > 0)
{
# RUN BEDTOOLS TO FIND OVERLAPS BETWEEN THE GENE GFF AND FEATURES GFF:
($output,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# NOTE: I SEEM TO NEED -loj IN BEDTOOLS, OR ELSE THE COORDINATES ARE WITH RESPECT TO THE OVERLAP REGION, NOT THE GENE REGION:
if ($testing == 0) { $cmd = "bedtools intersect -loj -a $gene_gff -b $features_gff > $output";}
else { $cmd = "bedtools intersect -loj -a $gene_gff -b $features_gff >& $output";}
$systemcall = system "$cmd";
if ($systemcall != 0)
{
system "rm -f $output";
$errormsg = "ERROR: find_genes_to_discard: return value $systemcall when tried to run: $cmd\n";
$errorcode = 101; # ERRORCODE=101 (TESTED FOR)
return(\%DISCARD,$errorcode,$errormsg);
}
system "sleep 1"; # SLEEP FOR 1 SECOND TO MAKE SURE THE OUTPUT IS COMPLETED
# READ IN THE BEDTOOLS OUTPUT FILE:
open(OUTPUT,"$output") || die "ERROR: find_genes_to_discard: cannot open $output\n";
while(<OUTPUT>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
# GET THE NAME OF THE GENE:
$scaffold = $temp[0];
$gene_start = $temp[3];
$gene_end = $temp[4];
$name = $temp[8]; # ID=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.45;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.45;score=1000
if ($temp[9] ne '.') # THIS IS NOT A PROPER OVERLAP
{
@temp = split(/ID=/,$name);
$name = $temp[1]; # maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.45;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.45;score=1000
@temp = split(/\;/,$name);
$name = $temp[0]; # maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.45
# THE GENE OVERLAPS AN augustus* OR pred_gff FEATURE, SO WE WANT TO KEEP IT:
$gene_name = $scaffold."=".$gene_start."=".$gene_end."=".$name;
$KEEP{$gene_name} = 1;
}
}
close(OUTPUT);
system "rm -f $output";
}
# NOW READ IN THE INPUT GFF FILE OF GENES, AND FIGURE OUT WHICH GENES WE WANT TO DISCARD:
open(GENE_GFF,"$gene_gff") || die "ERROR: find_genes_to_discard: cannot open $gene_gff\n";
while(<GENE_GFF>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
$scaffold = $temp[0];
$gene_start = $temp[3];
$gene_end = $temp[4];
$name = $temp[8]; # ID=augustus_masked-scaff_000001-processed-gene-0.142;Name=augustus_masked-scaff_000001-processed-gene-0.142
@temp = split(/ID=/,$name);
$name = $temp[1]; # augustus_masked-scaff_000001-processed-gene-0.142;Name=augustus_masked-scaff_000001-processed-gene-0.142
@temp = split(/\;/,$name);
$name = $temp[0]; # augustus_masked-scaff_000001-processed-gene-0.142;
$gene_name = $scaffold."=".$gene_start."=".$gene_end."=".$name;
# IF THE GENE DOES NOT OVERLAP AN augustus*/pred_gff FEATURE, AND DOES NOT HAVE
# snap/genemark/pred_gff IN ITS NAME, THEN WE WANT TO DISCARD IT:
if (!($KEEP{$gene_name}))
{
# eg. maker-scaff_000010-snap-gene-0.227,snap_masked-New_000054-processed-gene-0.2-mRNA-1
# eg. genemark-scaff_000010-processed-gene-1.31, genemark-New_000047-processed-gene-0.2-mRNA-1
# eg. maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.64, maker-scaff_000023-pred_gff_protein_coding-gene-0.25-mRNA-1
if (!($name =~ /snap/ || $name =~ /genemark/ || $name =~ /pred_gff/ || $name =~ /est_gff_source/))
# est_gff_source MEANS THE GENE IS SUPPORTED BY RNA-SEQ DATA. WE WANT TO KEEP IT EVEN IF IT IS NOT SUPPORTED BY PROTEIN EVIDENCE.
{
if ($testing == 0) { print STDERR "Discarding gene $name (scaffold $scaffold $gene_start-$gene_end) ...\n";}
$DISCARD{$gene_name} = 1;
$DISCARD{$name} = 1;
if (!($name =~ /exonerate/ || $name =~ /BLASTX/ || $name =~ /blastx/ || $name =~ /augustus/ || $name =~ /protein2genome/ || $name =~ /est2genome/ || $name =~ /blastn/))
{
$errormsg = "ERROR: find_genes_to_discard: gene name $name does not contain exonerate or BLASTX or blastx or blastn or or augustus or protein2genome or est2genome\n";
$errorcode = 100; # ERRORCODE=100 (TESTED FOR)
return(\%DISCARD,$errorcode,$errormsg);
}
}
}
}
close(GENE_GFF);
return(\%DISCARD,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &make_features_gff
sub test_make_features_gff
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES INTO
my $gff; # GFF FILE
my $features_gff; # FEATURES GFF FILE
my $expected_features_gff; # FILE WITH EXPECTED CONTENTS OF $features_gff
my $differences; # DIFFERENES BETWEEN $features_gff AND $expected_features_gff
my $length_differences; # LENGTH OF $differences
my $line; #
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n";
print GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print GFF "scaff_000010\tprotein2genome\tprotein_match\t339754\t340801\t478\t+\t.\tID=scaff_000010:hit:2470:2.10.0.3;Name=tr|G0MIG4|G0MIG4_CAEBE\n";
print GFF "scaff_000001\taugustus_masked\tmatch\t3010038\t3012310\t0.64\t+\t.\tID=scaff_000001:hit:48921:3.5.0.30;Name=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1\n";
print GFF "scaff_000001\taugustus_masked\tmatch_part\t3010038\t3010098\t0.64\t+\t.\tID=scaff_000001:hsp:87184:3.5.0.30;Parent=scaff_000001:hit:48921:3.5.0.30;Target=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1 1 61 +;Gap=M61\n";
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch\t1590\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1;score=1000\n";
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch_part\t2498\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38715:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1 859 936 +;Gap=M78\n";
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch\t1590\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1\n";
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch_part\t2494\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38793:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1 1 82 +;Gap=M82\n";
print GFF "##FASTA\n";
print GFF ">seq1\n";
print GFF "ACAACACAGATAGATATATAGAGCA\n";
close(GFF);
($features_gff,$errorcode,$errormsg) = &make_features_gff($gff,$outputdir);
if ($errorcode != 0) { print STDERR "ERROR: test_make_features_gff: failed test1\n"; exit;}
($expected_features_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_features_gff") || die "ERROR: test_make_features_gff: cannot open $expected_features_gff\n";
print EXPECTED "scaff_000001\taugustus_masked\tmatch\t3010038\t3012310\t0.64\t+\t.\tID=scaff_000001:hit:48921:3.5.0.30;Name=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1\n";
print EXPECTED "scaff_000001\taugustus_masked\tmatch_part\t3010038\t3010098\t0.64\t+\t.\tID=scaff_000001:hsp:87184:3.5.0.30;Parent=scaff_000001:hit:48921:3.5.0.30;Target=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1 1 61 +;Gap=M61\n";
print EXPECTED "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch\t1590\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1;score=1000\n";
print EXPECTED "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch_part\t2498\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38715:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1 859 936 +;Gap=M78\n";
print EXPECTED "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch\t1590\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1\n";
print EXPECTED "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch_part\t2494\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38793:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1 1 82 +;Gap=M82\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $features_gff $expected_features_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_make_features_gff: failed test1 (features_gff $features_gff expected_features_gff $expected_features_gff)\n"; exit;}
system "rm -f $features_gff";
system "rm -f $expected_features_gff";
system "rm -f $gff";
# TEST ERRORCODE=16 ('match' FEATURE OF AN USUSUAL TYPE):
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n";
print GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmodel_gff:maker2\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print GFF "scaff_000010\tprotein2genome\tprotein_match\t339754\t340801\t478\t+\t.\tID=scaff_000010:hit:2470:2.10.0.3;Name=tr|G0MIG4|G0MIG4_CAEBE\n";
print GFF "scaff_000001\taugustus_masked\tmatch\t3010038\t3012310\t0.64\t+\t.\tID=scaff_000001:hit:48921:3.5.0.30;Name=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1\n";
print GFF "scaff_000001\taugustus_masked\tmatch_part\t3010038\t3010098\t0.64\t+\t.\tID=scaff_000001:hsp:87184:3.5.0.30;Parent=scaff_000001:hit:48921:3.5.0.30;Target=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1 1 61 +;Gap=M61\n";
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch\t1590\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1;score=1000\n";
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch_part\t2498\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38715:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1 859 936 +;Gap=M78\n";
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch\t1590\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1\n";
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch_part\t2494\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38793:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1 1 82 +;Gap=M82\n";
print GFF "##FASTA\n";
print GFF ">seq1\n";
print GFF "ACAACACAGATAGATATATAGAGCA\n";
close(GFF);
($features_gff,$errorcode,$errormsg) = &make_features_gff($gff,$outputdir);
if ($errorcode != 16) { print STDERR "ERROR: test_make_features_gff: failed test1\n"; exit;}
system "rm -f $features_gff";
system "rm -f $gff";
# TEST ERRORCODE=17 (UNUSUAL FEATURE TYPE IN GFF):
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n";
print GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print GFF "scaff_000010\tmaker\tCDS3\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print GFF "scaff_000010\tprotein2genome\tprotein_match\t339754\t340801\t478\t+\t.\tID=scaff_000010:hit:2470:2.10.0.3;Name=tr|G0MIG4|G0MIG4_CAEBE\n";
print GFF "scaff_000001\taugustus_masked\tmatch\t3010038\t3012310\t0.64\t+\t.\tID=scaff_000001:hit:48921:3.5.0.30;Name=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1\n";
print GFF "scaff_000001\taugustus_masked\tmatch_part\t3010038\t3010098\t0.64\t+\t.\tID=scaff_000001:hsp:87184:3.5.0.30;Parent=scaff_000001:hit:48921:3.5.0.30;Target=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1 1 61 +;Gap=M61\n";
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch\t1590\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1;score=1000\n";
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch_part\t2498\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38715:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1 859 936 +;Gap=M78\n";
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch\t1590\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1\n";
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch_part\t2494\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38793:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1 1 82 +;Gap=M82\n";
print GFF "##FASTA\n";
print GFF ">seq1\n";
print GFF "ACAACACAGATAGATATATAGAGCA\n";
close(GFF);
($features_gff,$errorcode,$errormsg) = &make_features_gff($gff,$outputdir);
if ($errorcode != 17) { print STDERR "ERROR: test_make_features_gff: failed test3\n"; exit;}
system "rm -f $features_gff";
system "rm -f $gff";
# TEST ERRORCODE=18 (NOT 9 COLUMNS IN GFF FILE):
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n";
print GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print GFF "scaff_000010\tprotein2genome\tprotein_match\t339754\t340801\t478\t+\t.\tID=scaff_000010:hit:2470:2.10.0.3;Name=tr|G0MIG4|G0MIG4_CAEBE\n";
print GFF "scaff_000001\taugustus_masked\tmatch\t3010038\t3012310\t0.64\t+\t.\tID=scaff_000001:hit:48921:3.5.0.30;Name=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1\n";
print GFF "scaff_000001\taugustus_masked\tmatch_part\t3010038\t3010098\t0.64\t+\t.\tID=scaff_000001:hsp:87184:3.5.0.30;Parent=scaff_000001:hit:48921:3.5.0.30;Target=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1 1 61 +;Gap=M61\n";
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch\t1590\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1;score=1000\n";
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch_part\t2498\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38715:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1 859 936 +;Gap=M78\n";
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch\t1590\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1\n";
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch_part\t2494\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38793:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1 1 82 +;Gap=M82\n";
print GFF "##FASTA\n";
print GFF ">seq1\n";
print GFF "ACAACACAGATAGATATATAGAGCA\n";
close(GFF);
($features_gff,$errorcode,$errormsg) = &make_features_gff($gff,$outputdir);
if ($errorcode != 18) { print STDERR "ERROR: test_make_features_gff: failed test4\n"; exit;}
system "rm -f $features_gff";
system "rm -f $gff";
}
#------------------------------------------------------------------#
# MAKE A GFF FILE WITH NON-GENE FEATURES: (SPECIFICALLY pred_gff:embl, pred_gff:protein_coding & augustus_masked)
sub make_features_gff
{
my $gff = $_[0]; # GFF FILE FOR ALL SCAFFOLDS
my $outputdir = $_[1]; # DIRECTORY TO PUT THE OUTPUT FILES INTO
my $features_gff; # GFF FILE FOR NON-GENE FEATURES
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $line; #
my @temp; #
my $end_of_gff = 0; # SAYS WHETHER WE HAVE REACHED THE END OF THE GFF FILE
my $name; # NAME OF THE FEATURE
($features_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(FEATURES_GFF,">$features_gff") || die "ERROR: make_features_gff: cannot open $features_gff\n";
open(GFF,"$gff") || die "ERROR: make_features_gff: cannot open $gff\n";
while(<GFF>)
{
$line = $_;
chomp $line;
if ($line eq '##FASTA') { $end_of_gff = 1;}
if (substr($line,0,1) ne '#' && $end_of_gff == 0)
{
@temp = split(/\t+/,$line);
# ELEANOR SAID THAT WE WANT TO KEEP THE GENE MODEL IF IT HAS ANY ONE OR MORE OF THE FOLLOWING AS SUPPORT:
# (a) Genemark, (b) SNAP, (c) pred_gff
# HOW TO RECOGNISE THESE:
# (i) IF THERE IS genemark/snap/pred_gff IN THE GENE NAME, KEEP THE GENE
# (ii) IF THERE IS blastx/exonerate IN THE GENE NAME, AND THE GENE OVERLAPS AN augustus*
# OR pred_gff:embl OR pred_gff:protein_coding FEATURE, KEEP THE GENE
# (iii) IF THERE IS blastx/exonerate/augustus IN THE GENE NAME, AND THE GENE DOESN'T OVERLAP AN
# augustus* OR pred_gff:embl OR pred_gff:protein_coding FEATURE, DISCARD THE GENE
if ($temp[2] eq 'match' || $temp[2] eq 'match_part')
{
# pred_gff CAN BE EITHER FROM genblastg OR RATT, ELEANOR SAID.
if ($temp[1] eq 'pred_gff:embl' || $temp[1] eq 'pred_gff:protein_coding' || $temp[1] =~ /augustus/)
{
print FEATURES_GFF "$line\n";
}
else
{
if ($temp[1] ne 'blastn' && $temp[1] ne 'blastx' && $temp[1] ne 'est2genome' && $temp[1] ne 'model_gff:maker' &&
$temp[1] ne 'repeatmasker' && $temp[1] ne 'repeatrunner' && $temp[1] ne 'protein2genome' && $temp[1] ne 'genemark' && $temp[1] ne 'snap_masked' &&
$temp[1] ne 'est_gff:source')
{
$errormsg = "ERROR: make_features_gff: match/match_part feature source $temp[1] on line $line\n";
$errorcode = 16; # ERRORCODE=16 (TESTED FOR)
system "rm -f $features_gff";
return($features_gff,$errorcode,$errormsg);
}
}
}
else
{
if ($temp[2] ne 'contig' && $temp[2] ne 'exon' && $temp[2] ne 'CDS' && $temp[2] ne 'mRNA' && $temp[2] ne 'gene' &&
$temp[2] ne 'five_prime_UTR' && $temp[2] ne 'three_prime_UTR' && $temp[2] ne 'expressed_sequence_match' && $temp[2] ne 'protein_match')
{
$errormsg = "ERROR: make_features_gff: feature $temp[2] on line $line\n";
$errorcode = 17; # ERRORCODE=17 (TESTED FOR)
system "rm -f $features_gff";
return($features_gff,$errorcode,$errormsg);
}
}
if ($#temp != 8)
{
$errormsg = "ERROR: make_features_gff: do not have 9 columns in $gff\n";
$errorcode = 18; # ERRORCODE=18 (TESTED FOR)
system "rm -f $features_gff";
return($features_gff,$errorcode,$errormsg);
}
}
}
close(GFF);
close(FEATURES_GFF);
return($features_gff,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &make_gene_gff
sub test_make_gene_gff
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES INTO
my $gff; # GFF FILE
my $gene_gff; # GENE GFF FILE
my $expected_gene_gff; # FILE WITH EXPECTED CONTENTS OF $gene_gff
my $differences; # DIFFERENES BETWEEN $gene_gff AND $expected_gene_gff
my $length_differences; # LENGTH OF $differences
my $line; #
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n";
print GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print GFF "##FASTA\n";
print GFF ">seq1\n";
print GFF "ACAACACAGATAGATATATAGAGCA\n";
close(GFF);
($gene_gff,$errorcode,$errormsg) = &make_gene_gff($gff,$outputdir);
if ($errorcode != 0) { print STDERR "ERROR: test_make_gene_gff: failed test1\n"; exit;}
($expected_gene_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_gene_gff") || die "ERROR: test_make_gene_gff: cannot open $expected_gene_gff\n";
print EXPECTED "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print EXPECTED "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $gene_gff $expected_gene_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_make_gene_gff: failed test1 (gene_gff $gene_gff expected_gene_gff $expected_gene_gff)\n"; exit;}
system "rm -f $gene_gff";
system "rm -f $expected_gene_gff";
system "rm -f $gff";
# TEST ERRORCODE=15 (DO NOT HAVE 9 COLUMNS IN THE GFF FILE):
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n";
print GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print GFF "##FASTA\n";
print GFF ">seq1\n";
print GFF "ACAACACAGATAGATATATAGAGCA\n";
close(GFF);
($gene_gff,$errorcode,$errormsg) = &make_gene_gff($gff,$outputdir);
if ($errorcode != 15) { print STDERR "ERROR: test_make_gene_gff: failed test2\n"; exit;}
system "rm -f $gene_gff";
system "rm -f $gff";
# ADD EXAMPLE WITH MAKER GFF WHERE THE START POINT IS 0:
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n";
print GFF "BPAG.contig.04967.2027 maker gene 1129 1876 . - . ID=genemark-BPAG.contig.04967.2027-processed-gene-0.0;Name=genemark-BPAG.contig.04967.2027-processed-gene-0.0\n";
print GFF "BPAG.contig.04971.2025 maker gene 1675 1797 . + . ID=maker-BPAG.contig.04971.2025-exonerate_protein2genome-gene-0.2;Name=maker-BPAG.contig.04971.2025-exonerate_protein2genome-gene-0.2\n";
print GFF "BPAG.contig.04971.2025 maker gene 0 1613 . - . ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0;Name=maker-BPAG.contig.04971.2025-augustus-gene-0.0\n";
print GFF "##FASTA\n";
print GFF ">seq1\n";
print GFF "ACAACACAGATAGATATATAGAGCA\n";
close(GFF);
($gene_gff,$errorcode,$errormsg) = &make_gene_gff($gff,$outputdir);
if ($errorcode != 0) { print STDERR "ERROR: test_make_gene_gff: failed test1\n"; exit;}
($expected_gene_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_gene_gff") || die "ERROR: test_make_gene_gff: cannot open $expected_gene_gff\n";
print EXPECTED "BPAG.contig.04967.2027 maker gene 1129 1876 . - . ID=genemark-BPAG.contig.04967.2027-processed-gene-0.0;Name=genemark-BPAG.contig.04967.2027-processed-gene-0.0\n";
print EXPECTED "BPAG.contig.04971.2025 maker gene 1675 1797 . + . ID=maker-BPAG.contig.04971.2025-exonerate_protein2genome-gene-0.2;Name=maker-BPAG.contig.04971.2025-exonerate_protein2genome-gene-0.2\n";
print EXPECTED "BPAG.contig.04971.2025 maker gene 1 1613 . - . ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0;Name=maker-BPAG.contig.04971.2025-augustus-gene-0.0\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $gene_gff $expected_gene_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_make_gene_gff: failed test3 (gene_gff $gene_gff expected_gene_gff $expected_gene_gff)\n"; exit;}
system "rm -f $gene_gff";
system "rm -f $expected_gene_gff";
system "rm -f $gff";
}
#------------------------------------------------------------------#
# MAKE A GFF FILE WITH JUST THE GENES:
sub make_gene_gff
{
my $gff = $_[0]; # GFF FILE FOR ALL SCAFFOLDS
my $outputdir = $_[1]; # DIRECTORY TO PUT THE OUTPUT FILES INTO
my $gene_gff; # GFF FILE FOR GENES
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $line; #
my @temp; #
my $end_of_gff = 0; # SAYS WHETHER WE HAVE REACHED THE END OF THE GFF FILE
my $start; # START OF THE FEATURE IN THE GFF
($gene_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GENE_GFF,">$gene_gff") || die "ERROR: make_gene_gff: cannot open $gene_gff\n";
open(GFF,"$gff") || die "ERROR: make_gene_gff: cannot open $gff\n";
while(<GFF>)
{
$line = $_;
chomp $line;
if ($line eq '##FASTA') { $end_of_gff = 1;}
if (substr($line,0,1) ne '#' && $end_of_gff == 0)
{
@temp = split(/\t+/,$line);
if ($temp[2] eq 'gene')
{
$start = $temp[3];
if ($start <= 0) { $start = 1;} # THIS SOMETIMES HAPPENS WITH MAKER GFF FILES.
print GENE_GFF "$temp[0]\t$temp[1]\t$temp[2]\t$start\t$temp[4]\t$temp[5]\t$temp[6]\t$temp[7]\t$temp[8]\n";
}
if ($#temp != 8)
{
$errormsg = "ERROR: make_gene_gff: do not have 9 columns in $gff\n";
$errorcode = 15; # ERRORCODE=15 (TESTED FOR)
system "rm -f $gene_gff";
return($gene_gff,$errorcode,$errormsg);
}
}
}
close(GFF);
close(GENE_GFF);
return($gene_gff,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &write_output_gff
sub test_write_output_gff
{
my $outputdir = $_[0]; ## DIRECTORY TO WRITE OUTPUT FILES IN
my $input_gff; ## INPUT GFF FILE
my $errorcode; ## RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; ## RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
my %DISCARD = (); ## HASH TABLE OF mRNAs/GENES TO DISCARD
my $output_gff; ## OUTPUT GFF FILE
my @temp; ##
my $expected_output_gff; ## FILE WITH EXPECTED CONTENTS OF $output_gff
my $differences; ## DIFFERENCES BETWEEN $output_gff AND $expected_output_gff
my $length_differences; ## LENGTH OF $differences
my $line; ##
my $GENE; ## HASH TABLE WITH GENE NAMES FOR TRANSCRIPTS
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_write_output_gff: cannot open $input_gff\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">seq1\n";
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n";
close(INPUT_GFF);
# READ IN THE GENE NAME FOR EACH TRANSCRIPT IN THE INPUT GFF FILE:
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
$DISCARD{"augustus_masked-scaff_000010-processed-gene-0.4"} = 1;
$DISCARD{"scaff_000010=29000=29497=augustus_masked-scaff_000010-processed-gene-0.4"} = 1;
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
@temp = split(/\//,$output_gff);
$output_gff = $temp[$#temp];
($errorcode,$errormsg) = &write_output_gff($outputdir,$input_gff,$output_gff,\%DISCARD,$GENE);
if ($errorcode != 0) { print STDERR "ERROR: test_write_output_gff: failed test1\n"; exit;}
($expected_output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_output_gff") || die "ERROR: test_write_output_gff: cannot open $expected_output_gff\n";
print EXPECTED "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print EXPECTED "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print EXPECTED "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print EXPECTED "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print EXPECTED "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print EXPECTED "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print EXPECTED "##FASTA\n";
print EXPECTED ">seq1\n";
print EXPECTED "ACAACACAGATAGATATATAGAGCA\n";
close(EXPECTED);
$output_gff = $outputdir."/".$output_gff;
$differences = "";
open(TEMP,"diff $output_gff $expected_output_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_write_output_gff: failed test1 (output_gff $output_gff expected_output_gff $expected_output_gff)\n"; exit;}
system "rm -f $input_gff";
system "rm -f $output_gff";
system "rm -f $expected_output_gff";
# TEST ERRORCODE=1 (DO NOT KNOW GENE NAMES FOR TRANSCRIPTS):
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_write_output_gff: cannot open $input_gff\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">seq1\n";
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n";
close(INPUT_GFF);
%{$GENE} = ();
$DISCARD{"augustus_masked-scaff_000010-processed-gene-0.4"} = 1;
$DISCARD{"scaff_000010=29000=29497=augustus_masked-scaff_000010-processed-gene-0.4"} = 1;
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
@temp = split(/\//,$output_gff);
$output_gff = $temp[$#temp];
($errorcode,$errormsg) = &write_output_gff($outputdir,$input_gff,$output_gff,\%DISCARD,$GENE);
if ($errorcode != 1) { print STDERR "ERROR: test_write_output_gff: failed test2\n"; exit;}
system "rm -f $input_gff";
system "rm -f $output_gff";
# TEST ERRORCODE=14 (STRANGE FEATURE IN GFF FILE):
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_write_output_gff: cannot open $input_gff\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print INPUT_GFF "scaff_000010\tmaker\texonz\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">seq1\n";
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n";
close(INPUT_GFF);
# READ IN THE GENE NAME FOR EACH TRANSCRIPT IN THE INPUT GFF FILE:
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
$DISCARD{"augustus_masked-scaff_000010-processed-gene-0.4"} = 1;
$DISCARD{"scaff_000010=29000=29497=augustus_masked-scaff_000010-processed-gene-0.4"} = 1;
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
@temp = split(/\//,$output_gff);
$output_gff = $temp[$#temp];
($errorcode,$errormsg) = &write_output_gff($outputdir,$input_gff,$output_gff,\%DISCARD,$GENE);
if ($errorcode != 14) { print STDERR "ERROR: test_write_output_gff: failed test3\n"; exit;}
system "rm -f $input_gff";
system "rm -f $output_gff";
# TEST ERRORCODE=5 (GFF FILE DOES NOT HAVE 9 COLUMNS):
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_write_output_gff: cannot open $input_gff\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">seq1\n";
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n";
close(INPUT_GFF);
$DISCARD{"augustus_masked-scaff_000010-processed-gene-0.4"} = 1;
$DISCARD{"scaff_000010=29000=29497=augustus_masked-scaff_000010-processed-gene-0.4"} = 1;
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
@temp = split(/\//,$output_gff);
$output_gff = $temp[$#temp];
($errorcode,$errormsg) = &write_output_gff($outputdir,$input_gff,$output_gff,\%DISCARD,$GENE);
if ($errorcode != 5) { print STDERR "ERROR: test_write_output_gff: failed test4\n"; exit;}
system "rm -f $input_gff";
system "rm -f $output_gff";
}
#------------------------------------------------------------------#
# TEST &check_whether_to_discard
sub test_check_whether_to_discard
{
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
my $discard; # SAYS WHETHER TO DISCARD A PARTICULAR GENE
my %DISCARD = (); # HASH TABLE OF GENES TO DISCARD
# CHECK A CASE WHERE WE SHOULD DISCARD THE GENE:
$DISCARD{"scaff1=100=200=gene1"} = 1;
$DISCARD{"gene1"} = 1;
($discard,$errorcode,$errormsg) = &check_whether_to_discard("gene1","scaff1","100","200",\%DISCARD);
if ($errorcode != 0 || $discard != 1) { print STDERR "ERROR: test_check_whether_to_discard: failed test1 (errorcode $errorcode errormsg $errormsg discard $discard)\n"; exit;}
# CHECK A CASE WHERE WE SHOULD DISCARD THE GENE:
$DISCARD{"scaff1=100=200=gene1"} = 1;
($discard,$errorcode,$errormsg) = &check_whether_to_discard("gene1","scaff1","150","175",\%DISCARD);
if ($errorcode != 0 || $discard != 1) { print STDERR "ERROR: test_check_whether_to_discard: failed test2\n"; exit;}
# CHECK A CASE WHERE WE SHOULD *NOT* DISCARD THE GENE:
$DISCARD{"scaff1=100=200=gene1"} = 1;
($discard,$errorcode,$errormsg) = &check_whether_to_discard("gene1","scaff1","150","250",\%DISCARD);
if ($errorcode != 0 || $discard != 0) { print STDERR "ERROR: test_check_whether_to_discard: failed test3\n"; exit;}
# CHECK A CASE WHERE WE SHOULD *NOT* DISCARD THE GENE:
$DISCARD{"scaff1=100=200=gene1"} = 1;
($discard,$errorcode,$errormsg) = &check_whether_to_discard("gene1","scaff2","100","200",\%DISCARD);
if ($errorcode != 0 || $discard != 0) { print STDERR "ERROR: test_check_whether_to_discard: failed test4\n"; exit;}
# CHECK A CASE WHERE WE SHOULD *NOT* DISCARD THE GENE:
$DISCARD{"scaff1=100=200=gene1"} = 1;
($discard,$errorcode,$errormsg) = &check_whether_to_discard("gene2","scaff1","150","175",\%DISCARD);
if ($errorcode != 0 || $discard != 0) { print STDERR "ERROR: test_check_whether_to_discard: failed test5\n"; exit;}
}
#------------------------------------------------------------------#
# CHECK WHETHER TO DISCARD A FEATURE
sub check_whether_to_discard
{
my $gene = $_[0]; # GENE NAME
my $scaffold = $_[1]; # SCAFFOLD
my $start = $_[2]; # FEATURE START
my $end = $_[3]; # FEATURE END
my $DISCARD = $_[4]; # HASH TABLE OF GENES TO DISCARD
my $discard = 0; # ASSUME WE SHOULD NOT DISCARD THE FEATURE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $gene2; # GENE WE WANT TO DISCARD
my @temp; #
my $scaffold2; # SCAFFOLD FOR $gene2
my $start2; # START FOR $gene2
my $end2; # END FOR $gene2
if ($DISCARD->{$gene}) # WE DO WANT TO DISCARD A GENE OF THIS NAME:
{
# CHECK THAT THE FEATURE OVERLAPS A GENE WE WANT TO DISCARD OF THIS NAME:
foreach $gene2 (keys %{$DISCARD})
{
@temp = split(/=/,$gene2);
if ($#temp == 3)
{
$scaffold2 = $temp[0];
$start2 = $temp[1];
$end2 = $temp[2];
$gene2 = $temp[3];
if ($scaffold eq $scaffold2 && ($start >= $start2 && $end <= $end2) && $gene eq $gene2)
{
$discard = 1;
return($discard,$errorcode,$errormsg);
}
}
}
}
else # WE DO NOT WANT TO DISCARD A GENE OF THIS NAME:
{
$discard = 0;
}
return($discard,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# READ IN THE INPUT GFF FILE, AND MAKE THE OUTPUT GFF FILE WITH GENES THAT ARE FROM snap/pred_gff/genemark, OR SUPPORTED BY augustus*/pred_gff:
sub write_output_gff
{
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES IN
my $input_gff = $_[1]; # INPUT GFF FILE
my $output_gff = $_[2]; # OUTPUT GFF FILE
my $DISCARD = $_[3]; # HASH TABLE OF GENES TO DISCARD
my $GENE = $_[4]; # HASH TABLE OF GENE NAMES FOR TRANSCRIPTS
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $end_of_gff = 0; # SAYS WHETHER WE HAVE REACHED THE END OF THE INPUT GFF
my $line; #
my @temp; #
my $name; # GENE NAME
my $scaffold; # NAME OF SCAFFOLD
my $start; # START OF A FEATURE
my $end; # END OF A FEATURE
my $discard; # SAYS WHETHER TO DISCARD A FEATURE OR NOT
# OPEN THE OUTPUT GFF FILE:
$output_gff = $outputdir."/".$output_gff;
open(OUTPUT,">$output_gff") || die "ERROR: write_output_gff: cannot open $output_gff\n";
# READ IN THE INPUT GFF FILE:
open(INPUT_GFF,"$input_gff") || die "ERROR: write_output_gff: cannot open $input_gff\n";
while(<INPUT_GFF>)
{
$line = $_;
chomp $line;
if ($line eq '##FASTA') { $end_of_gff = 1;}
if (substr($line,0,1) ne '#' && $end_of_gff == 0)
{
@temp = split(/\t+/,$line);
if ($#temp == 8)
{
$scaffold = $temp[0];
$start = $temp[3];
$end = $temp[4];
if ($start <= 0) { $start = 1;} # THIS SOMETIMES HAPPENS WITH MAKER GFF FILES.
if ($temp[2] eq 'contig')
{
print OUTPUT "$line\n";
}
elsif ($temp[2] eq 'gene')
{
$name = $temp[8]; # eg. ID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3
@temp = split(/ID=/,$name);
$name = $temp[1]; # eg. augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3
@temp = split(/\;/,$name);
$name = $temp[0]; # eg. augustus_masked-scaff_000010-processed-gene-0.3
# CHECK IF THIS OVERLAPS A GENE OF THE SAME NAME THAT WE WANT TO DELETE:
($discard,$errorcode,$errormsg) = &check_whether_to_discard($name,$scaffold,$start,$end,$DISCARD);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
@temp = split(/\t+/,$line);
if ($discard == 0) { print OUTPUT "$temp[0]\t$temp[1]\t$temp[2]\t$start\t$temp[4]\t$temp[5]\t$temp[6]\t$temp[7]\t$temp[8]\n";}
}
elsif ($temp[2] eq 'mRNA')
{
$name = $temp[8]; # eg. ID=augustus_masked-scaff_000010-processed-gene-0.5-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.5;N...
@temp = split(/ID=/,$name);
$name = $temp[1]; # eg. augustus_masked-scaff_000010-processed-gene-0.5-mRNA-1;Parent=augustus_...
@temp = split(/\;/,$name);
$name = $temp[0]; # eg. augustus_masked-scaff_000010-processed-gene-0.5-mRNA-1 : NAME OF THE mRNA
$name = $scaffold."=".$name;
if (!($GENE->{$name}))
{
$errormsg = "ERROR: write_output_gff: do not know gene name for transcript $name (line $line, mRNA)\n";
$errorcode = 1; # ERRORCODE=1 (TESTED FOR)
return($errorcode,$errormsg);
}
$name = $GENE->{$name};
# CHECK IF THIS OVERLAPS A GENE OF THE SAME NAME THAT WE WANT TO DELETE:
($discard,$errorcode,$errormsg) = &check_whether_to_discard($name,$scaffold,$start,$end,$DISCARD);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
@temp = split(/\t+/,$line);
if ($discard == 0) { print OUTPUT "$temp[0]\t$temp[1]\t$temp[2]\t$start\t$temp[4]\t$temp[5]\t$temp[6]\t$temp[7]\t$temp[8]\n"; }
}
elsif ($temp[2] eq 'exon' || $temp[2] eq 'CDS' || $temp[2] eq 'five_prime_UTR' || $temp[2] eq 'three_prime_UTR')
{
$name = $temp[8]; # eg. ID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1
@temp = split(/Parent=/,$name);
$name = $temp[1]; # eg. augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1 : NAME OF THE mRNA
@temp = split(/\;/,$name);
$name = $temp[0];
@temp = split(/\,/,$name); # IT SOMETIMES HAPPENS THAT MAKER GIVES THE TRANSCRIPT NAME TWICE
$name = $temp[0];
$name = $scaffold."=".$name;
if (!($GENE->{$name}))
{
$errormsg = "ERROR: write_output_gff: do not know gene name for transcript $name (line $line, exon/CDS/UTR)\n";
$errorcode = 1; # ERRORCODE=1 (TESTED FOR)
return($errorcode,$errormsg);
}
$name = $GENE->{$name};
# CHECK IF THIS OVERLAPS A GENE OF THE SAME NAME THAT WE WANT TO DELETE:
($discard,$errorcode,$errormsg) = &check_whether_to_discard($name,$scaffold,$start,$end,$DISCARD);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
@temp = split(/\t+/,$line);
if ($discard == 0) { print OUTPUT "$temp[0]\t$temp[1]\t$temp[2]\t$start\t$temp[4]\t$temp[5]\t$temp[6]\t$temp[7]\t$temp[8]\n";}
}
elsif ($temp[2] eq 'match' || $temp[2] eq 'match_part' || $temp[2] eq 'protein_match' || $temp[2] eq 'expressed_sequence_match')
# NOTE: WE CAN HAVE GFF LINES WITH:
# blastx IN COLUMN 2 AND protein_match/match_part IN COLUMN 3 -> ELEANOR SAYS THESE ARE DIRECT BLAST HITS.
# protein2genome IN COLUMN 2 AND match/match_part IN COLUMN 3 -> ELEANOR SAYS THESE ARE FROM EXONERATE ALIGNING
# THE PROTEIN TO THE GENOME.
{
print OUTPUT "$line\n";
}
else
{
$errormsg = "ERROR: write_output_gff: feature $temp[2] in gff line $line\n";
$errorcode = 14; # ERRORCODE=14 (TESTED FOR)
return($errorcode,$errormsg);
}
}
else
{
$errormsg = "ERROR: write_output_gff: line does not have 9 columns: $line\n";
$errorcode = 5; # ERRORCODE=5 (TESTED FOR)
return($errorcode,$errormsg);
}
}
else
{
print OUTPUT "$line\n";
}
}
close(INPUT_GFF);
close(OUTPUT);
return($errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &read_genes_for_transcripts
sub test_read_genes_for_transcripts
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN
my $GENE; # HASH TABLE OF GENE NAMES FOR TRANSCRIPTS
my $input_gff; # INPUT GFF FILE
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_genes_for_transcripts: cannot open $input_gff\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">seq1\n";
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n";
close(INPUT_GFF);
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff);
if ($errorcode != 0) { print STDERR "ERROR: test_read_genes_for_transcripst: failed test1\n"; exit;}
if ($GENE->{"scaff_000010=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1"} ne "augustus_masked-scaff_000010-processed-gene-0.3") { print STDERR "ERROR: test_read_genes_for_transcripts: failed test1b\n"; exit;}
if ($GENE->{"scaff_000010=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1"} ne "augustus_masked-scaff_000010-processed-gene-0.4") { print STDERR "ERROR: test_read_genes_for_transcripts: failed test1c\n"; exit;}
system "rm -f $input_gff";
# TEST ERRORCODE=9 (mRNA APPEARS TWICE IN GFF):
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_genes_for_transcripts: cannot open $input_gff\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">seq1\n";
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n";
close(INPUT_GFF);
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff);
if ($errorcode != 9) { print STDERR "ERROR: test_read_genes_for_transcripts: failed test2\n"; exit;}
system "rm -f $input_gff";
# TEST ERRORCODE=3 (LINE DOES NOT HAVE 9 COLUMNS):
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_genes_for_transcripts: cannot open $input_gff\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">seq1\n";
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n";
close(INPUT_GFF);
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff);
if ($errorcode != 3) { print STDERR "ERROR: test_read_genes_for_transcripts: failed test3\n"; exit;}
system "rm -f $input_gff";
}
#------------------------------------------------------------------#
# READ IN THE GENE NAME FOR EACH TRANSCRIPT IN THE INPUT GFF FILE:
sub read_genes_for_transcripts
{
my $input_gff = $_[0]; # INPUT GFF FILE
my $errorcode = 0; # THIS IS RETURNED AS 0 IF NO ERROR OCCURRED.
my $errormsg = "none";# THIS IS RETURNED AS 'none' IF NO ERROR OCCURRED.
my %GENE = (); # HASH TABLE OF GENE NAMES FOR TRANSCRIPT NAMES.
my $line; #
my @temp; #
my $end_of_gff = 0; # SAYS WHETHER THE END OF THE GFF FILE HAS BEEN REACHED
my $name; # NAME OF mRNA
my $gene; # NAME OF GENE
my $scaffold; # NAME OF SCAFFOLD
open(INPUT_GFF,"$input_gff") || die "ERROR: read_genes_for_transcripts: cannot open $input_gff\n";
while(<INPUT_GFF>)
{
$line = $_;
chomp $line;
if ($line eq '##FASTA') { $end_of_gff = 1;}
if (substr($line,0,1) ne '#' && $end_of_gff == 0)
{
@temp = split(/\t+/,$line);
if ($#temp == 8)
{
if ($temp[2] eq 'mRNA')
{
# eg. scaff_000010 maker mRNA 88793 89828 . + . ID=maker-scaff_000010-augustus-gene-0.21-mRNA-1;Parent=maker-scaff_000010-augustus-gene-0.21;...
$scaffold = $temp[0];
$name = $temp[8];
$gene = $temp[8];
@temp = split(/ID=/,$name);
$name = $temp[1]; # eg. maker-scaff_000010-augustus-gene-0.21-mRNA-1;...
@temp = split(/\;/,$name);
$name = $temp[0]; # eg. maker-scaff_000010-augustus-gene-0.21-mRNA-1
# GET THE NAME OF THE GENE:
@temp = split(/Parent=/,$gene);
$gene = $temp[1]; # eg. maker-scaff_000010-augustus-gene-0.21;...
@temp = split(/\;/,$gene);
$gene = $temp[0]; # eg. maker-scaff_000010-augustus-gene-0.21
$name = $scaffold."=".$name;
# RECORD THE GENE NAME FOR THE mRNA:
if ($GENE{$name})
{
# IT IS POSSIBLE FOR A MAKER GFF TO HAVE >=2 GENES OF THE SAME NAME ON THE SAME SCAFFOLD:
if ($GENE{$name} ne $gene)
{
$errormsg= "ERROR: read_genes_for_transcripts: already have gene name $GENE{$name} for $name (not $gene)\n";
$errorcode= 9; # ERRORCODE=9 (TESTED FOR)
return(\%GENE,$errorcode,$errormsg);
}
}
$GENE{$name} = $gene;
}
}
else
{
$errormsg = "ERROR: read_genes_for_transcripts: line does not have 9 columns: $line\n";
$errorcode = 3; # ERRORCODE=3 (TESTED FOR)
return(\%GENE,$errorcode,$errormsg);
}
}
}
close(INPUT_GFF);
return(\%GENE,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# SUBROUTINE TO MAKE A FILE NAME FOR A TEMPORARY FILE:
sub make_filename
{
my $outputdir = $_[0]; # OUTPUT DIRECTORY TO WRITE TEMPORARY FILE NAME TO
my $found_name = 0; # SAYS WHETHER WE HAVE FOUND A FILE NAME YET
my $filename = "none";# NEW FILE NAME TO USE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $poss_filename; # POSSIBLE FILE NAME TO USE
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAME
while($found_name == 0)
{
$random_number = rand();
$poss_filename = $outputdir."/tmp".$random_number;
if (!(-e $poss_filename))
{
$filename = $poss_filename;
$found_name = 1;
}
}
if ($found_name == 0 || $filename eq 'none')
{
$errormsg = "ERROR: make_filename: found_name $found_name filename $filename\n";
$errorcode = 6; # ERRORCODE=6
return($filename,$errorcode,$errormsg);
}#
return($filename,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &print_error
sub test_print_error
{
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR
($errormsg,$errorcode) = &print_error(45,45,1);
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message','My error message',1);
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;}
($errormsg,$errorcode) = &print_error('none',45,1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message', 0, 1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;}
}
#------------------------------------------------------------------#
# PRINT OUT AN ERROR MESSAGE AND EXIT.
sub print_error
{
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED.
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED.
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/)
{
if ($called_from_test == 1)
{
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n";
exit;
}
}
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/))
{
if ($called_from_test == 1)
{
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n";
exit;
}
}
if ($errormsg eq 'none' || $errorcode == 0)
{
if ($called_from_test == 1)
{
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n";
exit;
}
}
else
{
print STDERR "$errormsg";
exit;
}
return($errormsg,$errorcode);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment