Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created August 27, 2013 14:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/6354180 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/6354180 to your computer and use it in GitHub Desktop.
Perl script that removes genes that encode proteins that are shorter than a minimum length, from a gff file.
#!/usr/bin/env perl
=head1 NAME
remove_tiny_genes_from_gff.pl
=head1 SYNOPSIS
remove_tiny_genes_from_gff.pl input_gff output_gff min_length outputdir assembly
where input_gff is the input gff file,
output_gff is the output gff file,
min_length is the minimum required length of the encoded proteins in amino acids,
outputdir is the output directory for writing output files,
assembly is the genome assembly fasta file.
=head1 DESCRIPTION
This script takes an input gff file (<input_gff>), and removes genes that
encode proteins of less than <min_length> amino acids, and writes the output
gff file (<output_gff>) in directory <outputdir>.
=head1 VERSION
Perl script last edited 1-May-2013.
=head1 CONTACT
alc@sanger.ac.uk (Avril Coghlan)
=cut
#
# Perl script remove_tiny_genes_from_gff.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 1-May-13.
# Last edited 1-May-2013.
# SCRIPT SYNOPSIS: remove_tiny_genes_from_gff.pl: removes genes that encode proteins that are shorter than a minimum length, from a gff file.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
use strict;
use warnings;
my $num_args = $#ARGV + 1;
if ($num_args != 5)
{
print "Usage of remove_tiny_genes_from_gff.pl\n\n";
print "perl remove_tiny_genes_from_gff.pl <input_gff> <output_gff> <min_length> <outputdir> <assembly>\n";
print "where <input_gff> is the input gff file,\n";
print " <output_gff> is the output gff file,\n";
print " <min_length> is the minimum required length of the encoded proteins in amino acids,\n";
print " <outputdir> is the output directory for writing output files,\n";
print " <assembly> is the genome assembly fasta file\n";
print "For example, >perl remove_tiny_genes_from_gff.pl round3.v2.gff new_round3.v2.gff 30\n";
print "/lustre/scratch108/parasites/alc/ SSTP.v2.fa\n";
exit;
}
# FIND THE PATH TO THE INPUT GFF FILE:
my $input_gff = $ARGV[0];
# FIND THE PATH TO THE OUTPUT GFF FILE:
my $output_gff = $ARGV[1];
# FIND THE MINIMUM REQUIRED LENGTH OF THE ENCODED PROTEINS:
my $min_length = $ARGV[2];
# FIND THE DIRECTORY TO USE FOR OUTPUT FILES:
my $outputdir = $ARGV[3];
# FIND THE NAME OF THE GENOME ASSEMBLY FASTA FILE:
my $assembly = $ARGV[4];
#------------------------------------------------------------------#
# TEST SUBROUTINES:
my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING.
&test_print_error;
&test_read_scaffold_lengths($outputdir);
&test_record_genes_to_keep($outputdir);
&test_remove_tiny_genes($outputdir);
print STDERR "remove_tiny_genes_from_gff.pl: finished tests, running main code now...\n";
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
&run_main_program($outputdir,$input_gff,$output_gff,$min_length,$assembly);
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
sub run_main_program
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN.
my $input_gff = $_[1]; # THE INPUT GFF FILE
my $output_gff = $_[2]; # THE OUTPUT GFF FILE
my $min_length = $_[3]; # MINIMUM LENGTH IN AMINO ACIDS OF THE ENCODED PROTEINS
my $assembly = $_[4]; # THE GENOME ASSEMBLY
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR
my $spliced_dna; # FILE OF SPLICED DNA SEQUENCES OF TRANSCRIPTS
my $translated_dna; # FILE OF TRANSLATED DNA SEQUENCES OF TRANSCRIPTS
my $LEN; # HASH TABLE OF LENGTHS OF TRANSLATIONS OF TRANSCRIPTS
my $KEEP; # HASH TABLE OF TRANSCRIPTS/GENES TO KEEP FROM A GFF FILE
print STDERR "Making file of spliced DNA sequences...\n";
# MAKE A FILE OF THE SPLICED DNA SEQUENCES FOR THE TRANSCRIPTS IN THE INPUT GFF:
($spliced_dna,$errorcode,$errormsg) = &make_spliced_dna($input_gff,$outputdir,$assembly);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
print STDERR "Making file of translated sequences...\n";
# MAKE A FILE OF THE TRANSLATED SEQUENCES FOR THE TRANSCRIPTS IN THE INPUT GFF:
($translated_dna,$errorcode,$errormsg) = &make_translated_dna($spliced_dna,$outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
print STDERR "Reading in lengths of amino acid translations...\n";
# READ IN THE LENGTHS OF THE AMINO ACID TRANSLATIONS OF TRANSCRIPTS:
($LEN,$errorcode,$errormsg) = &read_scaffold_lengths($translated_dna);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
print STDERR "Deciding which genes and transcripts to keep...\n";
# READ IN THE INPUT GFF FILE, AND RECORD WHICH GENES HAVE TRANSCRIPTS THAT ENCODE PROTEINS OF >=$min_length AMINO ACIDS:
($KEEP,$errorcode,$errormsg) = &record_genes_to_keep($input_gff,$LEN,$min_length);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
print STDERR "Writing output file...\n";
# READ IN THE INPUT GFF FILE, AND MAKE THE OUTPUT GFF FILE WITH GENES THAT ENCODE PROTEINS OF >=$min_length AMINO ACIDS:
($errorcode,$errormsg) = &remove_tiny_genes($outputdir,$input_gff,$output_gff,$KEEP);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# DELETE TEMPORARY FILES:
system "rm -f $spliced_dna";
system "rm -f $translated_dna";
}
#------------------------------------------------------------------#
# TEST &remove_tiny_genes
sub test_remove_tiny_genes
{
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES IN
my $input_gff; # INPUT GFF FILE
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
my %KEEP = (); # HASH TABLE OF mRNAs/GENES TO KEEP
my $output_gff; # OUTPUT GFF FILE
my @temp; #
my $expected_output_gff; # FILE WITH EXPECTED CONTENTS OF $output_gff
my $differences; # DIFFERENCES BETWEEN $output_gff AND $expected_output_gff
my $length_differences; # LENGTH OF $differences
my $line; #
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_remove_tiny_genes: cannot open $input_gff\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">seq1\n";
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n";
close(INPUT_GFF);
$KEEP{"augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1"} = 1;
$KEEP{"augustus_masked-scaff_000010-processed-gene-0.3"} = 1;
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
@temp = split(/\//,$output_gff);
$output_gff = $temp[$#temp];
($errorcode,$errormsg) = &remove_tiny_genes($outputdir,$input_gff,$output_gff,\%KEEP);
if ($errorcode != 0) { print STDERR "ERROR: test_remove_tiny_genes: failed test1\n"; exit;}
($expected_output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_output_gff") || die "ERROR: test_remove_tiny_genes: failed test1\n";
print EXPECTED "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print EXPECTED "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print EXPECTED "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print EXPECTED "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print EXPECTED "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print EXPECTED "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print EXPECTED "##FASTA\n";
print EXPECTED ">seq1\n";
print EXPECTED "ACAACACAGATAGATATATAGAGCA\n";
close(EXPECTED);
$output_gff = $outputdir."/".$output_gff;
$differences = "";
open(TEMP,"diff $output_gff $expected_output_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_remove_tiny_genes: failed test1 (output_gff $output_gff expected_output_gff $expected_output_gff)\n"; exit;}
system "rm -f $input_gff";
system "rm -f $output_gff";
system "rm -f $expected_output_gff";
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_remove_tiny_genes: cannot open $input_gff\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">seq1\n";
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n";
close(INPUT_GFF);
%KEEP = ();
$KEEP{"augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1"} = 1;
$KEEP{"augustus_masked-scaff_000010-processed-gene-0.4"} = 1;
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
@temp = split(/\//,$output_gff);
$output_gff = $temp[$#temp];
($errorcode,$errormsg) = &remove_tiny_genes($outputdir,$input_gff,$output_gff,\%KEEP);
if ($errorcode != 0) { print STDERR "ERROR: test_remove_tiny_genes: failed test2\n"; exit;}
($expected_output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_output_gff") || die "ERROR: test_remove_tiny_genes: failed test2\n";
print EXPECTED "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print EXPECTED "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print EXPECTED "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print EXPECTED "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print EXPECTED "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print EXPECTED "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print EXPECTED "##FASTA\n";
print EXPECTED ">seq1\n";
print EXPECTED "ACAACACAGATAGATATATAGAGCA\n";
close(EXPECTED);
$output_gff = $outputdir."/".$output_gff;
$differences = "";
open(TEMP,"diff $output_gff $expected_output_gff |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_remove_tiny_genes: failed test2 (output_gff $output_gff expected_output_gff $expected_output_gff)\n"; exit;}
system "rm -f $input_gff";
system "rm -f $output_gff";
system "rm -f $expected_output_gff";
# TEST ERRORCODE=14 (UNKNOWN FEATURE IN GFF FILE):
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_remove_tiny_genes: cannot open $input_gff\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\thello\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">seq1\n";
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n";
close(INPUT_GFF);
$KEEP{"augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1"} = 1;
$KEEP{"augustus_masked-scaff_000010-processed-gene-0.3"} = 1;
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
@temp = split(/\//,$output_gff);
$output_gff = $temp[$#temp];
($errorcode,$errormsg) = &remove_tiny_genes($outputdir,$input_gff,$output_gff,\%KEEP);
if ($errorcode != 14) { print STDERR "ERROR: test_remove_tiny_genes: failed test3\n"; exit;}
system "rm -f $input_gff";
system "rm -f $output_gff";
# TEST ERRORCODE=5 (GFF DOES NOT HAVE 9 COLUMNS):
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_remove_tiny_genes: cannot open $input_gff\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">seq1\n";
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n";
close(INPUT_GFF);
%KEEP = ();
$KEEP{"augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1"} = 1;
$KEEP{"augustus_masked-scaff_000010-processed-gene-0.4"} = 1;
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
@temp = split(/\//,$output_gff);
$output_gff = $temp[$#temp];
($errorcode,$errormsg) = &remove_tiny_genes($outputdir,$input_gff,$output_gff,\%KEEP);
if ($errorcode != 5) { print STDERR "ERROR: test_remove_tiny_genes: failed test4\n"; exit;}
system "rm -f $input_gff";
system "rm -f $output_gff";
}
#------------------------------------------------------------------#
# READ IN THE INPUT GFF FILE, AND MAKE THE OUTPUT GFF FILE WITH GENES THAT ENCODE PROTEINS OF >=$min_length AMINO ACIDS:
sub remove_tiny_genes
{
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES IN
my $input_gff = $_[1]; # INPUT GFF FILE
my $output_gff = $_[2]; # OUTPUT GFF FILE
my $KEEP = $_[3]; # HASH TABLE OF mRNAs AND GENES TO KEEP
my $errorcode = 0; # THIS IS RETURNED AS 0 IF NO ERROR OCCURRED.
my $errormsg = "none";# THIS IS RETURNED AS 'none' IF NO ERROR OCCURRED.
my $end_of_gff = 0; # SAYS WHETHER THE END OF THE GFF FILE HAS BEEN REACHED
my $line; #
my @temp; #
my $name; # NAME OF mRNA/GENE
# OPEN THE OUTPUT GFF FILE:
$output_gff = $outputdir."/".$output_gff;
open(OUTPUT,">$output_gff") || die "ERROR: remove_tiny_genes: cannot open $output_gff\n";
# READ IN THE INPUT GFF FILE:
open(INPUT_GFF,"$input_gff") || die "ERROR: remove_tiny_genes: cannot open $input_gff\n";
while(<INPUT_GFF>)
{
$line = $_;
chomp $line;
if ($line eq '##FASTA') { $end_of_gff = 1;}
if (substr($line,0,1) ne '#' && $end_of_gff == 0)
{
@temp = split(/\t+/,$line);
if ($#temp == 8)
{
if ($temp[2] eq 'contig' || $temp[2] eq 'match' || $temp[2] eq 'match_part' || $temp[2] eq 'protein_match' ||
$temp[2] eq 'expressed_sequence_match')
{
print OUTPUT "$line\n";
}
elsif ($temp[2] eq 'gene')
{
$name = $temp[8]; # eg. ID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3
@temp = split(/ID=/,$name);
$name = $temp[1]; # eg. augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3
@temp = split(/\;/,$name);
$name = $temp[0]; # eg. augustus_masked-scaff_000010-processed-gene-0.3
if ($KEEP->{$name}) { print OUTPUT "$line\n";}
}
elsif ($temp[2] eq 'mRNA')
{
$name = $temp[8]; # eg. ID=augustus_masked-scaff_000010-processed-gene-0.5-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.5;N...
@temp = split(/ID=/,$name);
$name = $temp[1]; # eg. augustus_masked-scaff_000010-processed-gene-0.5-mRNA-1;Parent=augustus_...
@temp = split(/\;/,$name);
$name = $temp[0]; # eg. augustus_masked-scaff_000010-processed-gene-0.5-mRNA-1
if ($KEEP->{$name}) { print OUTPUT "$line\n";}
}
elsif ($temp[2] eq 'exon' || $temp[2] eq 'CDS' || $temp[2] eq 'five_prime_UTR' || $temp[2] eq 'three_prime_UTR')
{
$name = $temp[8]; # eg. ID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1
@temp = split(/Parent=/,$name);
$name = $temp[1]; # eg. augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1
@temp = split(/\;/,$name);
$name = $temp[0];
if ($KEEP->{$name}) { print OUTPUT "$line\n";}
}
else
{
$errormsg = "ERROR: remove_tiny_genes: feature $temp[2] in gff line $line\n";
$errorcode = 14; # ERRORCODE=14 (TESTED FOR)
return($errorcode,$errormsg);
}
}
else
{
$errormsg = "ERROR: remove_tiny_genes: line does not have 9 columns: $line\n";
$errorcode = 5; # ERRORCODE=5 (TESTED FOR)
return($errorcode,$errormsg);
}
}
else
{
print OUTPUT "$line\n";
}
}
close(INPUT_GFF);
close(OUTPUT);
return($errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &record_genes_to_keep
sub test_record_genes_to_keep
{
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES IN
my $input_gff; # INPUT GFF FILE
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR
my %LEN = (); # HASH TABLE WITH LENGTHS OF TRANSLATIONS OF TRANSCRIPTS
my $KEEP; # HASH TABLE WITH mRNAs/GENES TO KEEP
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_record_genes_to_keep: cannot open $input_gff\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">seq1\n";
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n";
close(INPUT_GFF);
$LEN{"augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1"} = 30;
$LEN{"augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1"} = 29;
($KEEP,$errorcode,$errormsg) = &record_genes_to_keep($input_gff,\%LEN,30);
if ($errorcode != 0) { print STDERR "ERROR: test_record_genes_to_keep: failed test1\n"; exit;}
if (!($KEEP->{"augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1"})) { print STDERR "ERROR: test_record_genes_to_keep: failed test1b\n"; exit;}
if (!($KEEP->{"augustus_masked-scaff_000010-processed-gene-0.3"})) { print STDERR "ERROR: test_record_genes_to_keep: failed test1c\n"; exit;}
if ($KEEP->{"augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1"}) { print STDERR "ERROR: test_record_genes_to_keep: failed test1c\n"; exit;}
if ($KEEP->{"augustus_masked-scaff_000010-processed-gene-0.4"}) { print STDERR "ERROR: test_record_genes_to_keep: failed test1d\n"; exit;}
system "rm -f $input_gff";
# TEST ERRORCODE=10 (DO NOT KNOW LENGTH OF ENCODED PROTEIN):
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_record_genes_to_keep: cannot open $input_gff\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">seq1\n";
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n";
close(INPUT_GFF);
$LEN{"augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1"} = 30;
delete($LEN{"augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1"});
($KEEP,$errorcode,$errormsg) = &record_genes_to_keep($input_gff,\%LEN,30);
if ($errorcode != 10) { print STDERR "ERROR: test_record_genes_to_keep: failed test2\n"; exit;}
system "rm -f $input_gff";
# TEST ERRORCODE=4 (GFF DOES NOT HAVE 9 COLUMNS):
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_record_genes_to_keep: cannot open $input_gff\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n";
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n";
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n";
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n";
print INPUT_GFF "##FASTA\n";
print INPUT_GFF ">seq1\n";
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n";
close(INPUT_GFF);
$LEN{"augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1"} = 30;
$LEN{"augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1"} = 29;
($KEEP,$errorcode,$errormsg) = &record_genes_to_keep($input_gff,\%LEN,30);
if ($errorcode != 4) { print STDERR "ERROR: test_record_genes_to_keep: failed test3\n"; exit;}
system "rm -f $input_gff";
}
#------------------------------------------------------------------#
# READ IN THE INPUT GFF FILE, AND RECORD WHICH GENES HAVE TRANSCRIPTS THAT ENCODE PROTEINS OF >=$min_length AMINO ACIDS:
sub record_genes_to_keep
{
my $input_gff = $_[0]; # INPUT GFF FILE
my $LEN = $_[1]; # HASH TABLE WITH THE LENGTHS OF ENCODED PROTEINS
my $min_length = $_[2]; # MINIMUM REQUIRED LENGTH FOR AN ENCODED PROTEIN
my $errorcode = 0; # THIS IS RETURNED AS 0 IF NO ERROR OCCURRED.
my $errormsg = "none";# THIS IS RETURNED AS 'none' IF NO ERROR OCCURRED.
my %KEEP = (); # HASH TABLE OF GENES/TRANSCRIPTS TO KEEP
my $line; #
my @temp; #
my $end_of_gff = 0; # SAYS WHETHER THE END OF THE GFF FILE HAS BEEN REACHED
my $name; # NAME OF mRNA
my $gene; # NAME OF GENE
my $length; # LENGTH OF A TRANSLATION OF A TRANSCRIPT (IN AMINO ACIDS)
my $name2; # SECOND NAME FOR mRNA
open(INPUT_GFF,"$input_gff") || die "ERROR: record_genes_to_keep: cannot open $input_gff\n";
while(<INPUT_GFF>)
{
$line = $_;
chomp $line;
if ($line eq '##FASTA') { $end_of_gff = 1;}
if (substr($line,0,1) ne '#' && $end_of_gff == 0)
{
@temp = split(/\t+/,$line);
if ($#temp == 8)
{
if ($temp[2] eq 'mRNA')
{
# eg. scaff_000010 maker mRNA 88793 89828 . + . ID=maker-scaff_000010-augustus-gene-0.21-mRNA-1;Parent=maker-scaff_000010-augustus-gene-0.21;...
$name = $temp[8];
$gene = $temp[8];
@temp = split(/ID=/,$name);
$name = $temp[1]; # eg. maker-scaff_000010-augustus-gene-0.21-mRNA-1;...
@temp = split(/\;/,$name);
$name = $temp[0]; # eg. maker-scaff_000010-augustus-gene-0.21-mRNA-1
# GET THE NAME OF THE GENE:
@temp = split(/Parent=/,$gene);
$gene = $temp[1]; # eg. maker-scaff_000010-augustus-gene-0.21;...
@temp = split(/\;/,$gene);
$gene = $temp[0]; # eg. maker-scaff_000010-augustus-gene-0.21
# GET THE OTHER NAME OF THE mRNA:
@temp = split(/\t+/,$line);
$name2 = $temp[8];
@temp = split(/Name=/,$name2);
$name2 = $temp[1];
@temp = split(/\;/,$name2);
$name2 = $temp[0];
# FIND THE LENGTH OF THE ENCODED PROTEIN:
if (!($LEN->{$name}))
{
$errormsg= "ERROR: record_genes_to_keep: do not know length of protein encoded by $name\n";
$errorcode=10; # ERRORCODE=10 (TESTED FOR)
return(\%KEEP,$errorcode,$errormsg);
}
$length = $LEN->{$name};
# IF THE PROTEIN IS >= $min_length, WE WANT TO KEEP THIS GENE AND mRNA:
if ($length >= $min_length)
{
$KEEP{$name} = 1;
$KEEP{$gene} = 1;
$KEEP{$name2} = 1;
}
}
}
else
{
$errormsg = "ERROR: record_genes_to_keep: line does not have 9 columns: $line\n";
$errorcode = 4; # ERRORCODE=4 (TESTED FOR)
return(\%KEEP,$errorcode,$errormsg);
}
}
}
close(INPUT_GFF);
return(\%KEEP,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &read_scaffold_lengths
sub test_read_scaffold_lengths
{
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT IN.
my $assembly; # FILE CONTAINING ASSEMBLY.
my $LEN; # HASH TABLE CONTAINING LENGTHS OF SEQUENCES.
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR.
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR.
($assembly,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(ASSEMBLY,">$assembly") || die "ERROR: test_read_scaffold_lengths: cannot open $assembly\n";
print ASSEMBLY ">seq1\n";
print ASSEMBLY "AAAAA\n";
print ASSEMBLY ">seq2\n";
print ASSEMBLY "AAAAATTTTT\n";
print ASSEMBLY "\n";
print ASSEMBLY ">seq3\n";
print ASSEMBLY "AAAAA\n";
print ASSEMBLY "TTTTT\n";
print ASSEMBLY ">seq4\n";
print ASSEMBLY ">seq5\n";
print ASSEMBLY " AAA AA \n";
close(ASSEMBLY);
($LEN,$errorcode,$errormsg) = &read_scaffold_lengths($assembly);
if ($LEN->{'seq1'} != 5 || $LEN->{'seq2'} != 10 || $LEN->{'seq3'} != 10 || defined($LEN->{'seq4'}) || $LEN->{'seq5'} != 5 || $errorcode != 0) { print STDERR "ERROR: test_read_scaffold_lengths: failed test1\n"; exit;}
system "rm -f $assembly";
# TEST ERRORCODE=1:
($assembly,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(ASSEMBLY,">$assembly") || die "ERROR: test_read_scaffold_lengths: cannot open $assembly\n";
print ASSEMBLY ">seq1\n";
print ASSEMBLY "AAAAA\n";
print ASSEMBLY ">seq2\n";
print ASSEMBLY "AAAAATTTTT\n";
print ASSEMBLY ">seq1\n";
print ASSEMBLY "AAAAA\n";
close(ASSEMBLY);
($LEN,$errorcode,$errormsg) = &read_scaffold_lengths($assembly);
if ($errorcode != 1) { print STDERR "ERROR: test_read_scaffold_lengths: failed test2\n"; exit;}
system "rm -f $assembly";
# TEST ERRORCODE=2:
($assembly,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(ASSEMBLY,">$assembly") || die "ERROR: test_read_scaffold_lengths: cannot open $assembly\n";
print ASSEMBLY "AAAAA\n";
print ASSEMBLY ">seq2\n";
print ASSEMBLY "AAAAATTTTT\n";
print ASSEMBLY ">seq1\n";
print ASSEMBLY "AAAAA\n";
close(ASSEMBLY);
($LEN,$errorcode,$errormsg) = &read_scaffold_lengths($assembly);
if ($errorcode != 2) { print STDERR "ERROR: test_read_scaffold_lengths: failed test3\n"; exit;}
system "rm -f $assembly";
}
#------------------------------------------------------------------#
# READ IN THE LENGTHS OF CONTIGS IN THE ASSEMBLY:
# SUBROUTINE SYNOPSIS: read_scaffold_length(): store lengths of sequences from a fasta file in a hash
sub read_scaffold_lengths
{
my $assembly = $_[0]; # FILE CONTAINING THE ASSEMBLY
my %SCAFFOLDLEN = (); # HASH TABLE FOR STORING LENGTHS OF SCAFFOLDS
my $line; #
my @temp; #
my $length; # LENGTH OF A SCAFFOLD
my $name = "none";# NAME OF A SCAFFOLD
my $seq; # SEQUENCE OF A SCAFFOLD
my $errorcode = 0; # THIS IS RETURNED AS 0 IF NO ERROR OCCURRED.
my $errormsg = "none";# THIS IS RETURNED AS 'none' IF NO ERROR OCCURRED.
my %SEEN = (); # HASH TABLE TO RECORD WHICH SCAFFOLD NAMES WE HAVE SEEN.
open(ASSEMBLY,"$assembly") || die "ERROR: read_scaffold_lengths: cannot open $assembly\n";
while(<ASSEMBLY>)
{
$line = $_;
chomp $line;
if (substr($line,0,1) eq '>')
{
@temp = split(/\s+/,$line);
$name = $temp[0];
$name = substr($name,1,length($name)-1);
if ($SEEN{$name})
{
$errormsg = "ERROR: read_scaffold_lengths: seen scaffold $name already\n";
$errorcode = 1; # ERRORCODE=1 (TESTED FOR)
return(\%SCAFFOLDLEN, $errorcode, $errormsg);
}
$SEEN{$name} = 1;
}
else
{
$seq = $line;
# REMOVAL SPACES FROM THE LINE, EITHER INTERNAL SPACES OR SPACES AT EITHER END:
$seq = $line;
$seq =~ s/\s+//g;
# STORE THE LENGTH OF THE SEQUENCE, OR UPDATE THE STORED LENGTH:
if ($seq eq '') { $length = 0; }
else { $length = length($seq);}
if ($name eq 'none')
{
$errormsg = "ERROR: read_scaffold_lengths: name is $name\n";
$errorcode = 2; # ERRORCODE=2 (TESTED FOR)
return(\%SCAFFOLDLEN, $errorcode, $errormsg);
}
if (!($SCAFFOLDLEN{$name})){ $SCAFFOLDLEN{$name} = $length;}
else {$SCAFFOLDLEN{$name} = $SCAFFOLDLEN{$name} + $length;}
}
}
close(ASSEMBLY);
return(\%SCAFFOLDLEN,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# MAKE A FILE OF THE TRANSLATED SEQUENCES FOR THE TRANSCRIPTS IN THE INPUT GFF:
sub make_translated_dna
{
my $spliced_dna = $_[0]; # FILE OF SPLICED DNA SEQUENCES FOR TRANSCRIPTS
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $cmd; # COMMAND TO RUN
my $translated_dna; # FILE OF TRANSLATED DNA SEQUENCES OF TRANSCRIPTS
my @temp; #
($translated_dna,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
@temp = split(/\//,$translated_dna);
$translated_dna = $temp[$#temp];
$cmd = "translate_spliced_dna.pl $spliced_dna $translated_dna $outputdir";
system "$cmd";
system "sleep 1"; # SLEEP FOR 1 SECOND TO MAKE SURE THE OUTPUT FILE IS READY.
if (!(-e $translated_dna))
{
$errormsg = "ERROR: make_translated_dna: file translated_dna $translated_dna does not exist\n";
$errorcode = 8; # ERRORCODE=8 (SHOULDN'T HAPPEN, SO CAN'T TEST FOR)
return($translated_dna,$errorcode,$errormsg);
}
return($translated_dna,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# MAKE A FILE OF THE SPLICED DNA SEQUENCES FOR THE TRANSCRIPTS IN THE INPUT GFF:
sub make_spliced_dna
{
my $input_gff = $_[0]; # INPUT GFF FILE
my $outputdir = $_[1]; # DIRECTORY TO PUT OUTPUT FILES IN
my $assembly = $_[2]; # GENOME ASSEMBLY
my $cmd; # COMMAND TO RUN
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $spliced_dna; # FILE OF SPLICED DNA SEQUENCES FOR TRANSCRIPTS
my @temp; #
($spliced_dna,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
@temp = split(/\//,$spliced_dna);
$spliced_dna = $temp[$#temp];
$cmd = "get_spliced_transcripts_from_gff.pl $input_gff $assembly $spliced_dna $outputdir yes no no";
system "$cmd";
system "sleep 1"; # SLEEP FOR 1 SECOND TO MAKE SURE THE OUTPUT FILE IS READY.
if (!(-e $spliced_dna))
{
$errormsg = "ERROR: make_spliced_dna: file spliced_dna $spliced_dna does not exist\n";
$errorcode = 7; # ERRORCODE=7 (SHOULDN'T HAPPEN, SO CAN'T TEST FOR)
return($spliced_dna,$errorcode,$errormsg);
}
return($spliced_dna,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# SUBROUTINE TO MAKE A FILE NAME FOR A TEMPORARY FILE:
sub make_filename
{
my $outputdir = $_[0]; # OUTPUT DIRECTORY TO WRITE TEMPORARY FILE NAME TO
my $found_name = 0; # SAYS WHETHER WE HAVE FOUND A FILE NAME YET
my $filename = "none";# NEW FILE NAME TO USE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $poss_filename; # POSSIBLE FILE NAME TO USE
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAME
while($found_name == 0)
{
$random_number = rand();
$poss_filename = $outputdir."/tmp".$random_number;
if (!(-e $poss_filename))
{
$filename = $poss_filename;
$found_name = 1;
}
}
if ($found_name == 0 || $filename eq 'none')
{
$errormsg = "ERROR: make_filename: found_name $found_name filename $filename\n";
$errorcode = 6; # ERRORCODE=6
return($filename,$errorcode,$errormsg);
}#
return($filename,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &print_error
sub test_print_error
{
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR
($errormsg,$errorcode) = &print_error(45,45,1);
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message','My error message',1);
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;}
($errormsg,$errorcode) = &print_error('none',45,1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message', 0, 1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;}
}
#------------------------------------------------------------------#
# PRINT OUT AN ERROR MESSAGE AND EXIT.
sub print_error
{
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED.
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED.
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/)
{
if ($called_from_test == 1)
{
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n";
exit;
}
}
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/))
{
if ($called_from_test == 1)
{
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n";
exit;
}
}
if ($errormsg eq 'none' || $errorcode == 0)
{
if ($called_from_test == 1)
{
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n";
exit;
}
}
else
{
print STDERR "$errormsg";
exit;
}
return($errormsg,$errorcode);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment