Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Last active December 11, 2015 10:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/4586572 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/4586572 to your computer and use it in GitHub Desktop.
Perl script to take a gff file of gene predictions from GeneWise (or from some other program), and to make an output file with just the set of highest-scoring non-overlapping gene predictions.
#!/usr/local/bin/perl
=head1 NAME
find_best_genewise_genes.pl
=head1 SYNOPSIS
find_best_genewise_genes.pl input_gff path_to_bedtools output_gff outputdir
where input_gff is the input GeneWise gff file,
path_to_bedtools is the path to the bedtools installation,
output_gff is the output gff file,
outputdir is the output directory for writing output files.
=head1 DESCRIPTION
This script takes a gff file of GeneWise predictions, and for each set
of overlapping predictions, it takes just the highest scoring gene prediction.
The highest scoring gene predictions are put in the output gff (<output_gff>).
=head1 VERSION
Perl script last edited 9-Jan-2013.
=head1 CONTACT
alc@sanger.ac.uk (Avril Coghlan)
=cut
#
# Perl script find_best_genewise_genes.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 9-Jan-13.
# Last edited 9-Jan-2013.
# SCRIPT SYNOPSIS: find_best_genewise_genes.pl: finds set of highest-scoring non-overlapping genes in a gff file
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
use strict;
use warnings;
my $num_args = $#ARGV + 1;
if ($num_args != 4)
{
print "Usage of find_best_genewise_genes.pl\n\n";
print "perl find_best_genewise_genes.pl <input_gff> <path_to_bedtools> <output_gff> <outputdir>\n";
print "where <input_gff> is the input GeneWise gff file,\n";
print " <path_to_bedtools> is the path to the bedtools installation,\n";
print " <output_gff> is the output gff file,\n";
print " <outputdir> is the output directory for writing output files\n";
print "For example, >perl find_best_genewise_genes.pl TF.gff\n";
print "/nfs/users/nfs_a/alc/Documents/bin/bedtools-2.17.0\n";
print "TF_output.gff /lustre/scratch108/parasites/alc/Genewise50Helminths/gff_file\n";
exit;
}
# FIND THE PATH TO THE INPUT GFF FILE:
my $input_gff = $ARGV[0];
# FIND THE PATH TO THE BEDTOOLS INSTALLATION:
my $path_to_bedtools = $ARGV[1];
# FIND THE OUTPUT GFF FILE:
my $output_gff = $ARGV[2];
# FIND THE DIRECTORY TO USE FOR OUTPUT FILES:
my $outputdir = $ARGV[3];
#------------------------------------------------------------------#
# TEST SUBROUTINES:
my $PRINT_TEST_DATA = 1; # SAYS WHETHER TO PRINT DATA USED DURING TESTING.
&test_add_features_to_scaffold_gff($outputdir);
&test_find_best_predictions_for_scaffold($outputdir,$path_to_bedtools);
&test_run_main_program($outputdir,$path_to_bedtools);
&test_make_gff_for_scaffold($outputdir);
&test_make_genescores_file($outputdir);
&test_remove_lower_scoring_overlapping_genes($outputdir,$path_to_bedtools);
&test_make_gene_gff($outputdir);
&test_print_error;
print STDERR "Tests done, running main code...\n";
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
(my $errorcode,my $errormsg) = &run_main_program($outputdir,$input_gff,$output_gff,$path_to_bedtools);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# TEST &run_main_program
sub test_run_main_program
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES INTO
my $path_to_bedtools = $_[1]; # PATH TO THE BEDTOOLS INSTALLATION
my $gff; # GFF FILE FOR THE SCAFFOLD
my $new_gff; # NEW GFF FILE FOR THIS SCAFFOLD
my $expected_new_gff; # FILE WITH EXPECTED CONTENTS OF $new_gff
my $differences; # DIFFERENES BETWEEN $new_gff AND $expected_new_gff
my $length_differences; # LENGTH OF $differences
my $line; #
my @temp; #
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_run_main_program: cannot open gff $gff\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tintron\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
close(GFF);
($new_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
@temp = split(/\//,$new_gff);
$new_gff = $temp[$#temp];
($errorcode,$errormsg) = &run_main_program($outputdir,$gff,$new_gff,$path_to_bedtools);
if ($errorcode != 0) { print STDERR "ERROR: test_run_main_program: failed test1\n"; exit;}
($expected_new_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_new_gff") || die "ERROR: test_run_main_program: cannot open expected_new_gff $expected_new_gff\n";
print EXPECTED "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print EXPECTED "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print EXPECTED "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print EXPECTED "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print EXPECTED "ACAC.scaffold.00050.383204\tGeneWise\tintron\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print EXPECTED "ACAC.scaffold.00050.383205\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print EXPECTED "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print EXPECTED "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $new_gff $expected_new_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_run_main_program: failed test1 (new_gff $new_gff expected_new_gff $expected_new_gff)\n"; exit;}
system "rm -f $new_gff";
system "rm -f $expected_new_gff";
system "rm -f $gff";
}
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
# TO FIND THE HIGHEST-SCORING NON-OVERLAPPING GENES, THIS DOES THE FOLLOWING:
# (i) SORTS THE GENES IN ORDER OF DESCENDING SCORES
# (ii) REMOVE ALL GENES FROM THE GFF THAT OVERLAP THE HIGHEST-SCORING GENE -> GIVES GFF1
# (iii) REMOVE ALL THE GENES FROM THE GFF1 THAT OVERLAP THE SECOND HIGHEST-SCORING GENE -> GIVES GFF2
# (iv) REMOVE ALL THE GENES FROM THE GFF2 THAT OVERLAP THE THIRD HIGHEST-SCORING GENE -> GIVES GFF3
# (v) etc.
sub run_main_program
{
my $outputdir = $_[0]; ## DIRECTORY TO PUT OUTPUT FILES IN.
my $input_gff = $_[1]; ## THE INPUT GFF FILE
my $output_gff = $_[2]; ## THE OUTPUT GFF FILE
my $path_to_bedtools = $_[3]; ## PATH TO THE BEDTOOLS INSTALLATION
my $errorcode = 0; ## RETURNED AS 0 IF THERE IS NO ERROR.
my $errormsg = "none";## RETURNED AS 'none' IF THERE IS NO ERROR.
my $scaffolds_file; ## A FILE WITH THE NAMES OF ALL SCAFFOLDS IN $new_gff
my $scaffold; ## NAME OF A SCAFFOLD
my $scaffold_gff; ## GFF FILE FOR SCAFFOLD $scaffold
my $new_gff; ## GFF FILE OF GENES
# MAKE A GFF FILE WITH JUST THE GENE FEATURES (NOT 'cds' OR 'intron' FEATURES):
($new_gff,$errorcode,$errormsg) = &make_gene_gff($input_gff,$outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# MAKE A FILE WITH ALL THE SCAFFOLDS IN $new_gff:
($scaffolds_file,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
system "cut -f1 $new_gff | sort | uniq > $scaffolds_file";
# OPEN THE OUTPUT GFF:
$output_gff = $outputdir."/".$output_gff;
open(OUTPUT,">$output_gff") || die "ERROR: run_main_program: cannot open output_gff $output_gff\n";
close(OUTPUT);
# LOOK AT EACH SCAFFOLD AT A TIME:
open(SCAFFOLDS,"$scaffolds_file") || die "ERROR: run_main_program: cannot open scaffolds_file $scaffolds_file\n";
while(<SCAFFOLDS>)
{
$scaffold = $_;
chomp $scaffold;
# MAKE A GFF FILE WITH JUST THE INFORMATION FOR THIS SCAFFOLD FROM $new_gff:
($scaffold_gff,$errorcode,$errormsg) = &make_gff_for_scaffold($new_gff,$outputdir,$scaffold);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# FIND THE HIGHEST-SCORING NON-OVERLAPPING GENE PREDICTIONS FOR $scaffold:
($scaffold_gff,$errorcode,$errormsg) = &find_best_predictions_for_scaffold($scaffold_gff,$outputdir,$scaffold,$path_to_bedtools);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# ADD THE 'CDS' AND 'intron' FEATURES BACK TO THE SCAFFOLD GFF FILE:
($errorcode,$errormsg) = &add_features_to_scaffold_gff($scaffold_gff,$outputdir,$scaffold,$input_gff);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# CONCATENATE $scaffold_gff TO THE OUTPUT FILE:
system "cat $scaffold_gff >> $output_gff";
# DELETE TEMPORARY FILES:
system "rm -f $scaffold_gff";
}
close(SCAFFOLDS);
# DELETE TEMPORARY FILES:
system "rm -f $new_gff";
system "rm -f $scaffolds_file";
return($errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &add_features_to_scaffold_gff
sub test_add_features_to_scaffold_gff
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES INTO
my $input_gff; # INPUT GFF FILE
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR
my $scaffold_gff; # THE SCAFFOLD GFF FILE
my $expected_scaffold_gff; # FILE CONTAINING THE EXPECTED FINAL CONTENTS OF $scaffold_gff
my $differences; # DIFFERENCES BETWEEN $scaffold_gff AND $expected_scaffold_gff
my $length_differences; # LENGTH OF $differences
my $line; #
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$input_gff") || die "ERROR: test_add_features_to_scaffold_gff: cannot open input_gff $input_gff\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tintron\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
close(GFF);
($scaffold_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(SCAFFOLDGFF,">$scaffold_gff") || die "ERROR: test_add_features_to_scaffold_gff: cannot open scaffold_gff $scaffold_gff\n";
print SCAFFOLDGFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print SCAFFOLDGFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print SCAFFOLDGFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
close(SCAFFOLDGFF);
($errorcode,$errormsg) = &add_features_to_scaffold_gff($scaffold_gff,$outputdir,"ACAC.scaffold.00050.383204",$input_gff);
if ($errorcode != 0) { print STDERR "ERROR: test_add_features_to_scaffold_gff: failed test1\n"; exit;}
($expected_scaffold_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_scaffold_gff") || die "ERROR: test_add_features_to_scaffold_gff: cannot open expected_scaffold_gff $expected_scaffold_gff\n";
print EXPECTED "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print EXPECTED "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print EXPECTED "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print EXPECTED "ACAC.scaffold.00050.383204\tGeneWise\tCDS\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print EXPECTED "ACAC.scaffold.00050.383204\tGeneWise\tintron\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $scaffold_gff $expected_scaffold_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_add_features_to_scaffold_gff: failed test1 (scaffold_gff $scaffold_gff expected_scaffold_gff $expected_scaffold_gff)\n"; exit;}
system "rm -f $scaffold_gff";
system "rm -f $expected_scaffold_gff";
system "rm -f $input_gff";
}
#------------------------------------------------------------------#
# ADD THE 'CDS' AND 'intron' FEATURES BACK TO THE SCAFFOLD GFF FILE:
sub add_features_to_scaffold_gff
{
my $scaffold_gff = $_[0]; # GFF FILE FOR SCAFFOLD
my $outputdir = $_[1]; # DIRECTORY TO PUT OUTPUT FILES INTO
my $scaffold = $_[2]; # SCAFFOLD NAME
my $input_gff = $_[3]; # INPUT GFF FILE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my %TAKE = (); # HASH TABLE OF GENES TO TAKE FROM THE SCAFFOLD
my $line; #
my @temp; #
my $temp_gff; # TEMPORARY GFF FILE
my $temp_gff2; # TEMPORARY GFF FILE
my $gene; # NAME OF A GENE PREDICTION
# FIND THE NAMES OF THE GENES THAT WE ARE TAKING FROM THIS SCAFFOLD:
open(SCAFFOLD_GFF,"$scaffold_gff") || die "ERROR: add_features_to_scaffold_gff: cannot open scaffold_gff $scaffold_gff\n";
while(<SCAFFOLD_GFF>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
$gene = $temp[8];
$TAKE{$gene} = 1;
}
close(SCAFFOLD_GFF);
# MAKE A TEMPORARY GFF FILE WITH THE 'CDS' AND 'intron' FEATURES FOR THE SCAFFOLD:
($temp_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
system "grep $scaffold $input_gff | grep CDS > $temp_gff";
system "grep $scaffold $input_gff | grep intron >> $temp_gff";
# TAKE JUST THE INTRON AND CDS FEATURES FOR THE GENES THAT WE ARE INTERESTED IN:
($temp_gff2,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(TEMP_GFF2,">$temp_gff2") || die "ERROR: add_features_to_scaffold_gff: cannot open $temp_gff2\n";
open(TEMP_GFF,"$temp_gff") || die "ERROR: add_features_to_scaffold_gff: cannot open $temp_gff\n";
while(<TEMP_GFF>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
$gene = $temp[8];
if ($TAKE{$gene})
{
print TEMP_GFF2 "$line\n";
}
}
close(TEMP_GFF);
close(TEMP_GFF2);
# CONCATENATE $temp_gff2 TO $scaffold_gff:
system "cat $temp_gff2 >> $scaffold_gff";
# DELETE TEMPORARY FILES:
system "rm -f $temp_gff";
system "rm -f $temp_gff2";
return($errorcode,$errormsg);
}
#------------------------------------------------------------------#
sub test_find_best_predictions_for_scaffold
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES INTO
my $path_to_bedtools = $_[1]; # PATH TO THE BEDTOOLS INSTALLATION
my $gff; # GFF FILE FOR THE SCAFFOLD
my $new_gff; # NEW GFF FILE FOR THIS SCAFFOLD
my $expected_new_gff; # FILE WITH EXPECTED CONTENTS OF $new_gff
my $differences; # DIFFERENES BETWEEN $new_gff AND $expected_new_gff
my $length_differences; # LENGTH OF $differences
my $line; #
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_find_best_predictions_for_scaffold: cannot open $gff\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
close(GFF);
# RUN &find_best_predictions_for_scaffold:
($new_gff,$errorcode,$errormsg) = &find_best_predictions_for_scaffold($gff,$outputdir,"ACAC.scaffold.00050.383204",$path_to_bedtools);
if ($errorcode != 0) { print STDERR "ERROR: test_find_best_predictions_for_scaffold: failed test1\n"; exit;}
($expected_new_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_new_gff") || die "ERROR: test_find_best_predictions_for_scaffold: cannot open $expected_new_gff\n";
print EXPECTED "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print EXPECTED "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print EXPECTED "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $new_gff $expected_new_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_find_best_predictions_for_scaffold: failed test1 (new_gff $new_gff expected_new_gff $expected_new_gff)\n"; exit;}
system "rm -f $new_gff";
system "rm -f $expected_new_gff";
system "rm -f $gff";
}
#------------------------------------------------------------------#
# FIND THE HIGHEST-SCORING NON-OVERLAPPING GENE PREDICTIONS FOR $scaffold:
sub find_best_predictions_for_scaffold
{
my $new_gff = $_[0]; # GFF FILE FOR SCAFFOLD
my $outputdir = $_[1]; # DIRECTORY TO PUT OUTPUT FILES INTO
my $scaffold = $_[2]; # NAME OF SCAFFOLD
my $path_to_bedtools = $_[3]; # PATH TO THE BEDTOOLS INSTALLATION
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $scores_file; # FILE WITH GENE SCORES FOR EACH GENE
my $num_genes; # NUMBER OF GENES IN THE SCAFFOLD
my $genecnt; # COUNTER FOR THE GENES WE LOOKED AT IN THE SCAFFOLD
my $line; #
my @temp; #
my $gene; # NAME OF GENE
my $score; # SCORE FOR GENE $gene
# MAKE A FILE WITH THE GENE ID. AND GENEWISE SCORE FOR EACH GENE:
($scores_file,$num_genes,$errorcode,$errormsg) = &make_genescores_file($new_gff,$outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# READ IN THE FILE OF GENE PREDICTIONS, WHICH ARE SORTED FROM HIGHEST-SCORING TO LOWEST-SCORING:
$genecnt = 0;
open(SCORESFILE,"$scores_file") || die "ERROR: run_main_program: cannot open $scores_file\n";
while(<SCORESFILE>)
{
$line = $_;
chomp $line;
@temp = split(/\s+/,$line);
if ($#temp != 1)
{
$errormsg = "ERROR: find_best_predictions_for_scaffold: there are not 2 columns in scores_file $scores_file\n";
$errorcode = 44; # ERRORCODE=44 (SHOULDN'T HAPPEN SO CAN'T TEST FOR)
return($new_gff,$errorcode,$errormsg);
}
$genecnt++;
$gene = $temp[0];
$score = $temp[1];
# MAKE A NEW GFF FILE, BY REMOVING THE GENE PREDICTIONS (WHICH MUST BE LOWER SCORING) THAT OVERLAP $gene:
print STDERR "Gene $genecnt (out of $num_genes on $scaffold): removing gene predictions that overlap $gene (score $score...) from gff file $new_gff\n";
($new_gff,$errorcode,$errormsg) = &remove_lower_scoring_overlapping_genes($new_gff,$outputdir,$path_to_bedtools,$gene);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
}
close(SCORESFILE);
system "rm -f $scores_file";
return($new_gff,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &make_gff_for_scaffold
sub test_make_gff_for_scaffold
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES INTO
my $gff; # GFF FILE FOR THE SCAFFOLD
my $new_gff; # NEW GFF FILE FOR THIS SCAFFOLD
my $expected_new_gff; # FILE WITH EXPECTED CONTENTS OF $new_gff
my $differences; # DIFFERENES BETWEEN $new_gff AND $expected_new_gff
my $length_differences; # LENGTH OF $differences
my $line; #
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
close(GFF);
($new_gff,$errorcode,$errormsg) = &make_gff_for_scaffold($gff,$outputdir,"ACAC.scaffold.00050.383204");
if ($errorcode != 0) { print STDERR "ERROR: test_make_gff_for_scaffold: failed test1\n"; exit;}
($expected_new_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_new_gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $expected_new_gff\n";
print EXPECTED "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print EXPECTED "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print EXPECTED "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print EXPECTED "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print EXPECTED "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $new_gff $expected_new_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_make_gff_for_scaffold: failed test1 (new_gff $new_gff expected_new_gff $expected_new_gff)\n"; exit;}
system "rm -f $new_gff";
system "rm -f $expected_new_gff";
system "rm -f $gff";
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
close(GFF);
($new_gff,$errorcode,$errormsg) = &make_gff_for_scaffold($gff,$outputdir,"ACAC.scaffold.00050.383205");
if ($errorcode != 0) { print STDERR "ERROR: test_make_gff_for_scaffold: failed test1\n"; exit;}
($expected_new_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_new_gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $expected_new_gff\n";
print EXPECTED "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print EXPECTED "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print EXPECTED "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print EXPECTED "ACAC.scaffold.00050.383205\tGeneWise\tgene\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print EXPECTED "ACAC.scaffold.00050.383205\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $new_gff $expected_new_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_make_gff_for_scaffold: failed test1 (new_gff $new_gff expected_new_gff $expected_new_gff)\n"; exit;}
system "rm -f $new_gff";
system "rm -f $expected_new_gff";
system "rm -f $gff";
# TEST FOR ERRORCODE=45 (NOT 9 COLUMNS IN THE GFF FILE):
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383204\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185915\t185921\t-24.36\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185920\t185931\t-3.52\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t185910\t185916\t-3.42\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383205\tgene\t206829\t214134\t200.15\t+\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383205\tGeneWise\tgene\t206829\t214134\t252.15\t-\t.\tID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
close(GFF);
($new_gff,$errorcode,$errormsg) = &make_gff_for_scaffold($gff,$outputdir,"ACAC.scaffold.00050.383205");
if ($errorcode != 45) { print STDERR "ERROR: test_make_gff_for_scaffold: failed test1\n"; exit;}
system "rm -f $new_gff";
system "rm -f $gff";
}
#------------------------------------------------------------------#
# MAKE A GFF FILE FOR SCAFFOLD $scaffold:
sub make_gff_for_scaffold
{
my $gff = $_[0]; # GFF FILE FOR ALL SCAFFOLDS
my $outputdir = $_[1]; # DIRECTORY TO PUT THE OUTPUT FILES INTO
my $scaffold = $_[2]; # SCAFFOLD OF INTEREST
my $temp_gff; # TEMPORARY GFF FILE FOR $scaffold, MADE USING grep
my $temp_gff2; # TEMPORARY GFF FILE FOR $scaffold
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $line; #
my @temp; #
my $num_scaffolds; # NUMBER OF SCAFFOLDS IN $temp_gff2
($temp_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
system "grep $scaffold $gff > $temp_gff";
($temp_gff2,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(TEMP_GFF2,">$temp_gff2") || die "ERROR: make_gff_for_scaffold: cannot open $temp_gff2\n";
open(TEMP_GFF,"$temp_gff") || die "ERROR: make_gff_for_scaffold: cannot open $temp_gff\n";
while(<TEMP_GFF>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
if ($#temp != 8)
{
$errormsg = "ERROR: make_gff_for_scaffold: do not have 9 columns in $temp_gff\n";
$errorcode = 45; # ERRORCODE=45 (TESTED FOR)
system "rm -f $temp_gff";
system "rm -f $temp_gff2";
return($temp_gff2,$errorcode,$errormsg);
}
if ($temp[0] eq $scaffold)
{
print TEMP_GFF2 "$line\n";
}
}
close(TEMP_GFF);
close(TEMP_GFF2);
system "rm -f $temp_gff";
# CHECK THERE IS JUST ONE SCAFFOLD IN $temp_gff2:
open(TMP,"cut -f1 $temp_gff2 | sort | uniq | wc -l |");
while(<TMP>)
{
$num_scaffolds = $_;
chomp $num_scaffolds;
}
close(TMP);
if ($num_scaffolds != 1)
{
$errormsg = "ERROR: make_gff_for_scaffold: $num_scaffolds scaffolds in $temp_gff2\n";
$errorcode = 46; # ERRORCODE=46 (CANNOT TEST FOR, SHOULDN'T HAPPEN);
return($temp_gff2,$errorcode,$errormsg);
}
return($temp_gff2,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &make_genescores_file
sub test_make_genescores_file
{
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES INTO
my $gff; # GFF FILE OF GENES
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
my $scores_file; # FILE WITH GENE SCORES
my $expected_scores_file; # FILE WITH EXPECTED CONTENTS OF $scores_file
my $differences; # DIFFERENCES BETWEEN $scores_file AND $expected_scores_file
my $length_differences; # LENGTH OF $differences
my $line; #
my $num_genes; # NUMBER OF GENES IN $gff
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_make_genescores_file: cannot open $gff\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 185915 185921 -24.36 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 185920 185931 -3.52 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 185910 185916 -3.42 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 206829 214134 252.15 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
close(GFF);
($scores_file,$num_genes,$errorcode,$errormsg) = &make_genescores_file($gff,$outputdir);
if ($errorcode != 0 || $num_genes != 4) { print STDERR "ERROR: test_make_genescores_file: failed test1\n"; exit;}
($expected_scores_file,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_scores_file") || die "ERROR: test_make_genescores_file: cannot open $expected_scores_file\n";
print EXPECTED "ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5 252.15\n";
print EXPECTED "ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3 -3.42\n";
print EXPECTED "ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2 -3.52\n";
print EXPECTED "ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1 -24.36\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $scores_file $expected_scores_file |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_make_genescores_file: scores_file $scores_file expected_scores_file $expected_scores_file\n"; exit;}
system "rm -f $gff";
system "rm -f $scores_file";
system "rm -f $expected_scores_file";
# TEST ERRORCODE=47 (GFF DOES NOT HAVE 9 COLUMNS):
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_make_genescores_file: cannot open $gff\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 185915 185921 -24.36 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 185920 185931 -3.52 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383204 gene 185910 185916 -3.42 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 206829 214134 252.15 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
close(GFF);
($scores_file,$num_genes,$errorcode,$errormsg) = &make_genescores_file($gff,$outputdir);
if ($errorcode != 47) { print STDERR "ERROR: test_make_genescores_file: failed test2\n"; exit;}
system "rm -f $gff";
}
#------------------------------------------------------------------#
# MAKE A FILE WITH THE GENE ID. AND GENEWISE SCORE FOR EACH GENE:
sub make_genescores_file
{
my $new_gff = $_[0]; # GFF FILE WITH GENEWISE GENES
my $outputdir = $_[1]; # DIRECTORY TO PUT OUTPUT FILES INTO
my $scores_file; # FILE WITH GENEWISE GENE SCORES
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my @temp; #
my $line; #
my $score; # GENEWISE SCORE
my $gene; # GENE IDENTIFIER
my $sorted_scores_file = "none";# A SORTED VERSION OF $scores_file
my $num_genes = 0; # NUMBER OF GENES IN $new_gff
($scores_file,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(SCORES,">$scores_file") || die "ERROR: make_genescores_file: cannot open scores_file $scores_file\n";
open(NEW_GFF,"$new_gff") || die "ERROR: make_genescores_file: cannot open new_gff $new_gff\n";
while(<NEW_GFF>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
if ($#temp != 8)
{
$errormsg = "ERROR: make_genescores_file: line $line from $new_gff does not have 9 columns (temp = $#temp)\n";
$errorcode = 47; # ERRORCODE=47 (TESTED FOR)
system "rm -f $scores_file";
return($sorted_scores_file,$num_genes,$errorcode,$errormsg);
}
$score = $temp[5];
$gene = $temp[8];
print SCORES "$gene $score\n";
$num_genes++;
}
close(NEW_GFF);
close(SCORES);
# SORT $scores_file:
($sorted_scores_file,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
system "sort -k2,2nr $scores_file > $sorted_scores_file";
system "rm -f $scores_file";
return($sorted_scores_file,$num_genes,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &remove_lower_scoring_overlapping_genes
sub test_remove_lower_scoring_overlapping_genes
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES INTO
my $path_to_bedtools = $_[1]; # PATH TO THE BEDTOOLS INSTALLATION
my $gff; # GFF FILE OF GENE PREDICTIONS
my $errorcode; # RETURNED AS 0 FROM A SUBROUTINE IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A SUBROUTINE IF THERE IS NO ERROR
my $new_gff; # NEW GFF FILE
my $expected_new_gff; # FILE WITH EXPECTED CONTENTS OF $new_gff
my $differences; # DIFFERENCES BETWEEN $new_gff AND $expected_new_gff
my $length_differences; # LENGTH OF $differences
my $line; #
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_remove_lower_scoring_overlapping_genes: cannot open $gff\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 185915 185921 -24.36 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 185920 185931 -3.42 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 185910 185916 -2.00 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 206835 214200 152.15 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 206829 214134 252.15 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
close(GFF);
($new_gff,$errorcode,$errormsg) = &remove_lower_scoring_overlapping_genes($gff,$outputdir,$path_to_bedtools,"ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5");
if ($errorcode != 0) { print STDERR "ERROR: test_remove_lower_scoring_overlapping_genes: failed test1\n"; exit;}
($expected_new_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_new_gff") || die "ERROR: test_remove_lower_scoring_overlapping_genes: cannot open $expected_new_gff\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise gene 185915 185921 -24.36 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise gene 185920 185931 -3.42 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise gene 185910 185916 -2.00 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise gene 206829 214134 252.15 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $new_gff $expected_new_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_remove_lower_scoring_overlapping_genes: failed test1 (new_gff $new_gff expected_new_gff $expected_new_gff)\n"; exit;}
system "rm -f $gff";
system "rm -f $new_gff";
system "rm -f $expected_new_gff";
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_remove_lower_scoring_overlapping_genes: cannot open $gff\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 185915 185921 -24.36 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 185920 185931 -3.42 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 185910 185916 -2.00 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 206835 214200 152.15 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 206829 214134 252.15 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
close(GFF);
($new_gff,$errorcode,$errormsg) = &remove_lower_scoring_overlapping_genes($gff,$outputdir,$path_to_bedtools,"ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3");
if ($errorcode != 0) { print STDERR "ERROR: test_remove_lower_scoring_overlapping_genes: failed test2\n"; exit;}
($expected_new_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_new_gff") || die "ERROR: test_remove_lower_scoring_overlapping_genes: cannot open $expected_new_gff\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise gene 185920 185931 -3.42 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise gene 206835 214200 152.15 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise gene 206829 214134 252.15 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise gene 185910 185916 -2.00 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $new_gff $expected_new_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_remove_lower_scoring_overlapping_genes: failed test2 (new_gff $new_gff expected_new_gff $expected_new_gff)\n"; exit;}
system "rm -f $gff";
system "rm -f $new_gff";
system "rm -f $expected_new_gff";
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_remove_lower_scoring_overlapping_genes: cannot open $gff\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 185915 185921 -24.36 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 185920 185931 -3.42 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 185910 185916 -2.00 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 206835 214200 152.15 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 206829 214134 252.15 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
close(GFF);
($new_gff,$errorcode,$errormsg) = &remove_lower_scoring_overlapping_genes($gff,$outputdir,$path_to_bedtools,"ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1");
if ($errorcode != 0) { print STDERR "ERROR: test_remove_lower_scoring_overlapping_genes: failed test3\n"; exit;}
($expected_new_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_new_gff") || die "ERROR: test_remove_lower_scoring_overlapping_genes: cannot open $expected_new_gff\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise gene 206835 214200 152.15 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise gene 206829 214134 252.15 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise gene 185915 185921 -24.36 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $new_gff $expected_new_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_remove_lower_scoring_overlapping_genes: failed test3 (new_gff $new_gff expected_new_gff $expected_new_gff)\n"; exit;}
system "rm -f $gff";
system "rm -f $new_gff";
system "rm -f $expected_new_gff";
($gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(GFF,">$gff") || die "ERROR: test_remove_lower_scoring_overlapping_genes: cannot open $gff\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 185915 185921 -24.36 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 185920 185931 -3.42 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 185910 185916 -2.00 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 206835 214200 152.15 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print GFF "ACAC.scaffold.00050.383204 GeneWise gene 206829 214134 252.15 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
close(GFF);
($new_gff,$errorcode,$errormsg) = &remove_lower_scoring_overlapping_genes($gff,$outputdir,$path_to_bedtools,"ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-6");
if ($errorcode != 0) { print STDERR "ERROR: test_remove_lower_scoring_overlapping_genes: failed test4\n"; exit;}
($expected_new_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_new_gff") || die "ERROR: test_remove_lower_scoring_overlapping_genes: cannot open $expected_new_gff\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise gene 185915 185921 -24.36 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise gene 185920 185931 -3.42 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise gene 185910 185916 -2.00 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-3\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise gene 206835 214200 152.15 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-4\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise gene 206829 214134 252.15 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $new_gff $expected_new_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_remove_lower_scoring_overlapping_genes: failed test4 (new_gff $new_gff expected_new_gff $expected_new_gff)\n"; exit;}
system "rm -f $gff";
system "rm -f $new_gff";
system "rm -f $expected_new_gff";
}
#------------------------------------------------------------------#
# MAKE A NEW GFF FILE, BY REMOVING THE GENE PREDICTIONS (WHICH MUST BE LOWER SCORING) THAT OVERLAP $gene:
sub remove_lower_scoring_overlapping_genes
{
my $gff = $_[0]; # GFF FILE OF GENE PREDICTIONS
my $outputdir = $_[1]; # DIRECTORY TO WRITE OUTPUT FILES INTO
my $path_to_bedtools = $_[2]; # PATH TO THE BEDTOOLS INSTALLATION
my $gene = $_[3]; # GENE OF INTEREST
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $new_gff = "none";# GFF FILE WITH THE GENES THAT OVERLAP $gene REMOVED
my $cmd; # COMMAND TO RUN
my $temp_gff; # TEMPORARY GFF FILE WITH JUST $gene
my $temp_gff2; # TEMPORARY GFF FILE WITH JUST $gene (MADE USING grep)
my $line; #
my @temp; #
my $found_gene = 0; # SAYS WHETHER WE FOUND GENE $gene IN $gff
# MAKE TEMPORARY GFF FILE WITH JUST GENE $gene:
($temp_gff2,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
system "grep $gene $gff > $temp_gff2";
# MAKE TEMPORARY GFF FILE WITH JUST GENE $gene:
($temp_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(TEMP_GFF,">$temp_gff") || die "ERROR: remove_lower_scoring_overlapping_genes: cannot open $temp_gff\n";
open(TEMP_GFF2,"$temp_gff2") || die "ERROR: remove_lower_scoring_overlapping_genes: cannot open $temp_gff2\n";
while(<TEMP_GFF2>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
if ($temp[8] eq $gene)
{
$found_gene = 1;
print TEMP_GFF "$line\n";
}
}
close(TEMP_GFF2);
close(TEMP_GFF);
# IF THE GENE HAS NOT BEEN FOUND, IT MUST HAVE BEEN REMOVED BECAUSE IT OVERLAPS A HIGHER SCORING GENE:
if ($found_gene == 0)
{
system "rm -f $temp_gff";
system "rm -f $temp_gff2";
return($gff,$errorcode,$errormsg);
}
# MAKE NEW GFF FILE, WITH GENE PREDICTIONS THAT OVERLAP $gene REMOVED:
($new_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
$cmd = $path_to_bedtools."/bin/intersectBed -v -a $gff -b $temp_gff > $new_gff";
system "$cmd";
# -v : Only report those entries in A that have _no overlaps_ with B.
# ADD $temp_gff (CONTAINS $gene) BACK TO $new_gff:
system "cat $temp_gff >> $new_gff";
# REMOVE TEMPORARY FILES:
system "rm -f $temp_gff";
system "rm -f $gff";
system "rm -f $temp_gff2";
return($new_gff,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &make_gene_gff
sub test_make_gene_gff
{
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES INTO
my $input_gff; # INPUT GFF FILE
my $new_gff; # NEW GFF FILE
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETUREND AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
my $expected_new_gff; # FILE WITH EXPECTED CONTENTS OF $new_gff
my $differences; # DIFFERENCES BETWEEN $new_gff AND $expected_new_gff
my $length_differences; # LENGTH OF $differences
my $line; #
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_make_gene_gff: cannot open input_gff $input_gff\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise gene 185915 185917 -24.36 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise CDS 185915 185917 0.00 + 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise gene 185920 185931 -3.42 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise CDS 185920 185931 0.00 + 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise gene 206829 214134 252.15 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise CDS 214021 214134 0.00 - 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise intron 213951 214020 0.00 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise CDS 213864 213950 0.00 - 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise intron 213848 213863 0.00 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise CDS 213764 213847 0.00 - 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise intron 211581 213763 0.00 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise CDS 211467 211580 0.00 - 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise intron 211407 211466 0.00 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise CDS 211285 211406 0.00 - 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise intron 211224 211284 0.00 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise CDS 211098 211223 0.00 - 1 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise intron 211035 211097 0.00 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise CDS 210932 211034 0.00 - 1 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise intron 210844 210931 0.00 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise CDS 210731 210843 0.00 - 0 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise intron 207010 210730 0.00 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
print INPUT_GFF "ACAC.scaffold.00050.383204 GeneWise CDS 206829 207009 0.00 - 1 ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
close(INPUT_GFF);
($new_gff,$errorcode,$errormsg) = &make_gene_gff($input_gff,$outputdir);
if ($errorcode != 0) { print STDERR "ERROR: test_make_gene_gff: failed test1\n"; exit;}
($expected_new_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_new_gff") || die "ERROR: test_make_gene_gff: cannot open $expected_new_gff\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise gene 185915 185917 -24.36 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-1\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise gene 185920 185931 -3.42 + . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-2\n";
print EXPECTED "ACAC.scaffold.00050.383204 GeneWise gene 206829 214134 252.15 - . ID=ACAC.scaffold.00050.383204-TF101001-genewise-prediction-5\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $new_gff $expected_new_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_make_gene_gff: failed test1 (new_gff $new_gff expected_new_gff $expected_new_gff)\n"; exit;}
system "rm -f $input_gff";
system "rm -f $new_gff";
system "rm -f $expected_new_gff";
}
#------------------------------------------------------------------#
# MAKE A GFF FILE WITH JUST THE GENE FEATURES (NOT 'cds' OR 'intron' FEATURES):
sub make_gene_gff
{
my $input_gff = $_[0]; # GFF FILE WITH JUST THE GENE FEATURES (NOT 'cds' OR 'intron' FEATURES)
my $outputdir = $_[1]; # DIRECTORY FOR PUTTING OUTPUT FILES INTO
my $new_gff; # NEW GFF FILE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
# JUST TAKE THE GENE FEATURES FROM $input_gff:
($new_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
system "grep -v CDS $input_gff | grep -v intron | grep gene | sort -k1,1 -k4,4n | uniq > $new_gff";
return($new_gff,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# SUBROUTINE TO MAKE A FILE NAME FOR A TEMPORARY FILE:
sub make_filename
{
my $outputdir = $_[0]; # OUTPUT DIRECTORY TO WRITE TEMPORARY FILE NAME TO
my $found_name = 0; # SAYS WHETHER WE HAVE FOUND A FILE NAME YET
my $filename = "none";# NEW FILE NAME TO USE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $poss_filename; # POSSIBLE FILE NAME TO USE
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAME
while($found_name == 0)
{
$random_number = rand();
$poss_filename = $outputdir."/tmp".$random_number;
if (!(-e $poss_filename))
{
$filename = $poss_filename;
$found_name = 1;
}
}
if ($found_name == 0 || $filename eq 'none')
{
$errormsg = "ERROR: make_filename: found_name $found_name filename $filename\n";
$errorcode = 6; # ERRORCODE=6
return($filename,$errorcode,$errormsg);
}
return($filename,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &print_error
sub test_print_error
{
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR
($errormsg,$errorcode) = &print_error(45,45,1);
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message','My error message',1);
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;}
($errormsg,$errorcode) = &print_error('none',45,1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message', 0, 1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;}
}
#------------------------------------------------------------------#
# PRINT OUT AN ERROR MESSAGE AND EXIT.
sub print_error
{
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED.
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED.
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/)
{
if ($called_from_test == 1)
{
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n";
exit;
}
}
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/))
{
if ($called_from_test == 1)
{
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n";
exit;
}
}
if ($errormsg eq 'none' || $errorcode == 0)
{
if ($called_from_test == 1)
{
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n";
exit;
}
}
else
{
print STDERR "$errormsg";
exit;
}
return($errormsg,$errorcode);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment