Created
August 27, 2013 14:23
-
-
Save avrilcoghlan/6354180 to your computer and use it in GitHub Desktop.
Perl script that removes genes that encode proteins that are shorter than a minimum length, from a gff file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
=head1 NAME | |
remove_tiny_genes_from_gff.pl | |
=head1 SYNOPSIS | |
remove_tiny_genes_from_gff.pl input_gff output_gff min_length outputdir assembly | |
where input_gff is the input gff file, | |
output_gff is the output gff file, | |
min_length is the minimum required length of the encoded proteins in amino acids, | |
outputdir is the output directory for writing output files, | |
assembly is the genome assembly fasta file. | |
=head1 DESCRIPTION | |
This script takes an input gff file (<input_gff>), and removes genes that | |
encode proteins of less than <min_length> amino acids, and writes the output | |
gff file (<output_gff>) in directory <outputdir>. | |
=head1 VERSION | |
Perl script last edited 1-May-2013. | |
=head1 CONTACT | |
alc@sanger.ac.uk (Avril Coghlan) | |
=cut | |
# | |
# Perl script remove_tiny_genes_from_gff.pl | |
# Written by Avril Coghlan (alc@sanger.ac.uk) | |
# 1-May-13. | |
# Last edited 1-May-2013. | |
# SCRIPT SYNOPSIS: remove_tiny_genes_from_gff.pl: removes genes that encode proteins that are shorter than a minimum length, from a gff file. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
use strict; | |
use warnings; | |
my $num_args = $#ARGV + 1; | |
if ($num_args != 5) | |
{ | |
print "Usage of remove_tiny_genes_from_gff.pl\n\n"; | |
print "perl remove_tiny_genes_from_gff.pl <input_gff> <output_gff> <min_length> <outputdir> <assembly>\n"; | |
print "where <input_gff> is the input gff file,\n"; | |
print " <output_gff> is the output gff file,\n"; | |
print " <min_length> is the minimum required length of the encoded proteins in amino acids,\n"; | |
print " <outputdir> is the output directory for writing output files,\n"; | |
print " <assembly> is the genome assembly fasta file\n"; | |
print "For example, >perl remove_tiny_genes_from_gff.pl round3.v2.gff new_round3.v2.gff 30\n"; | |
print "/lustre/scratch108/parasites/alc/ SSTP.v2.fa\n"; | |
exit; | |
} | |
# FIND THE PATH TO THE INPUT GFF FILE: | |
my $input_gff = $ARGV[0]; | |
# FIND THE PATH TO THE OUTPUT GFF FILE: | |
my $output_gff = $ARGV[1]; | |
# FIND THE MINIMUM REQUIRED LENGTH OF THE ENCODED PROTEINS: | |
my $min_length = $ARGV[2]; | |
# FIND THE DIRECTORY TO USE FOR OUTPUT FILES: | |
my $outputdir = $ARGV[3]; | |
# FIND THE NAME OF THE GENOME ASSEMBLY FASTA FILE: | |
my $assembly = $ARGV[4]; | |
#------------------------------------------------------------------# | |
# TEST SUBROUTINES: | |
my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING. | |
&test_print_error; | |
&test_read_scaffold_lengths($outputdir); | |
&test_record_genes_to_keep($outputdir); | |
&test_remove_tiny_genes($outputdir); | |
print STDERR "remove_tiny_genes_from_gff.pl: finished tests, running main code now...\n"; | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
&run_main_program($outputdir,$input_gff,$output_gff,$min_length,$assembly); | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
sub run_main_program | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN. | |
my $input_gff = $_[1]; # THE INPUT GFF FILE | |
my $output_gff = $_[2]; # THE OUTPUT GFF FILE | |
my $min_length = $_[3]; # MINIMUM LENGTH IN AMINO ACIDS OF THE ENCODED PROTEINS | |
my $assembly = $_[4]; # THE GENOME ASSEMBLY | |
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR | |
my $spliced_dna; # FILE OF SPLICED DNA SEQUENCES OF TRANSCRIPTS | |
my $translated_dna; # FILE OF TRANSLATED DNA SEQUENCES OF TRANSCRIPTS | |
my $LEN; # HASH TABLE OF LENGTHS OF TRANSLATIONS OF TRANSCRIPTS | |
my $KEEP; # HASH TABLE OF TRANSCRIPTS/GENES TO KEEP FROM A GFF FILE | |
print STDERR "Making file of spliced DNA sequences...\n"; | |
# MAKE A FILE OF THE SPLICED DNA SEQUENCES FOR THE TRANSCRIPTS IN THE INPUT GFF: | |
($spliced_dna,$errorcode,$errormsg) = &make_spliced_dna($input_gff,$outputdir,$assembly); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
print STDERR "Making file of translated sequences...\n"; | |
# MAKE A FILE OF THE TRANSLATED SEQUENCES FOR THE TRANSCRIPTS IN THE INPUT GFF: | |
($translated_dna,$errorcode,$errormsg) = &make_translated_dna($spliced_dna,$outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
print STDERR "Reading in lengths of amino acid translations...\n"; | |
# READ IN THE LENGTHS OF THE AMINO ACID TRANSLATIONS OF TRANSCRIPTS: | |
($LEN,$errorcode,$errormsg) = &read_scaffold_lengths($translated_dna); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
print STDERR "Deciding which genes and transcripts to keep...\n"; | |
# READ IN THE INPUT GFF FILE, AND RECORD WHICH GENES HAVE TRANSCRIPTS THAT ENCODE PROTEINS OF >=$min_length AMINO ACIDS: | |
($KEEP,$errorcode,$errormsg) = &record_genes_to_keep($input_gff,$LEN,$min_length); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
print STDERR "Writing output file...\n"; | |
# READ IN THE INPUT GFF FILE, AND MAKE THE OUTPUT GFF FILE WITH GENES THAT ENCODE PROTEINS OF >=$min_length AMINO ACIDS: | |
($errorcode,$errormsg) = &remove_tiny_genes($outputdir,$input_gff,$output_gff,$KEEP); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# DELETE TEMPORARY FILES: | |
system "rm -f $spliced_dna"; | |
system "rm -f $translated_dna"; | |
} | |
#------------------------------------------------------------------# | |
# TEST &remove_tiny_genes | |
sub test_remove_tiny_genes | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES IN | |
my $input_gff; # INPUT GFF FILE | |
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR | |
my %KEEP = (); # HASH TABLE OF mRNAs/GENES TO KEEP | |
my $output_gff; # OUTPUT GFF FILE | |
my @temp; # | |
my $expected_output_gff; # FILE WITH EXPECTED CONTENTS OF $output_gff | |
my $differences; # DIFFERENCES BETWEEN $output_gff AND $expected_output_gff | |
my $length_differences; # LENGTH OF $differences | |
my $line; # | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_remove_tiny_genes: cannot open $input_gff\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print INPUT_GFF "##FASTA\n"; | |
print INPUT_GFF ">seq1\n"; | |
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(INPUT_GFF); | |
$KEEP{"augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1"} = 1; | |
$KEEP{"augustus_masked-scaff_000010-processed-gene-0.3"} = 1; | |
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
@temp = split(/\//,$output_gff); | |
$output_gff = $temp[$#temp]; | |
($errorcode,$errormsg) = &remove_tiny_genes($outputdir,$input_gff,$output_gff,\%KEEP); | |
if ($errorcode != 0) { print STDERR "ERROR: test_remove_tiny_genes: failed test1\n"; exit;} | |
($expected_output_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(EXPECTED,">$expected_output_gff") || die "ERROR: test_remove_tiny_genes: failed test1\n"; | |
print EXPECTED "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print EXPECTED "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print EXPECTED "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print EXPECTED "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print EXPECTED "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print EXPECTED "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print EXPECTED "##FASTA\n"; | |
print EXPECTED ">seq1\n"; | |
print EXPECTED "ACAACACAGATAGATATATAGAGCA\n"; | |
close(EXPECTED); | |
$output_gff = $outputdir."/".$output_gff; | |
$differences = ""; | |
open(TEMP,"diff $output_gff $expected_output_gff |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0) { print STDERR "ERROR: test_remove_tiny_genes: failed test1 (output_gff $output_gff expected_output_gff $expected_output_gff)\n"; exit;} | |
system "rm -f $input_gff"; | |
system "rm -f $output_gff"; | |
system "rm -f $expected_output_gff"; | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_remove_tiny_genes: cannot open $input_gff\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print INPUT_GFF "##FASTA\n"; | |
print INPUT_GFF ">seq1\n"; | |
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(INPUT_GFF); | |
%KEEP = (); | |
$KEEP{"augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1"} = 1; | |
$KEEP{"augustus_masked-scaff_000010-processed-gene-0.4"} = 1; | |
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
@temp = split(/\//,$output_gff); | |
$output_gff = $temp[$#temp]; | |
($errorcode,$errormsg) = &remove_tiny_genes($outputdir,$input_gff,$output_gff,\%KEEP); | |
if ($errorcode != 0) { print STDERR "ERROR: test_remove_tiny_genes: failed test2\n"; exit;} | |
($expected_output_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(EXPECTED,">$expected_output_gff") || die "ERROR: test_remove_tiny_genes: failed test2\n"; | |
print EXPECTED "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print EXPECTED "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print EXPECTED "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print EXPECTED "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print EXPECTED "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print EXPECTED "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print EXPECTED "##FASTA\n"; | |
print EXPECTED ">seq1\n"; | |
print EXPECTED "ACAACACAGATAGATATATAGAGCA\n"; | |
close(EXPECTED); | |
$output_gff = $outputdir."/".$output_gff; | |
$differences = ""; | |
open(TEMP,"diff $output_gff $expected_output_gff |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0) { print STDERR "ERROR: test_remove_tiny_genes: failed test2 (output_gff $output_gff expected_output_gff $expected_output_gff)\n"; exit;} | |
system "rm -f $input_gff"; | |
system "rm -f $output_gff"; | |
system "rm -f $expected_output_gff"; | |
# TEST ERRORCODE=14 (UNKNOWN FEATURE IN GFF FILE): | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_remove_tiny_genes: cannot open $input_gff\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\thello\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print INPUT_GFF "##FASTA\n"; | |
print INPUT_GFF ">seq1\n"; | |
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(INPUT_GFF); | |
$KEEP{"augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1"} = 1; | |
$KEEP{"augustus_masked-scaff_000010-processed-gene-0.3"} = 1; | |
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
@temp = split(/\//,$output_gff); | |
$output_gff = $temp[$#temp]; | |
($errorcode,$errormsg) = &remove_tiny_genes($outputdir,$input_gff,$output_gff,\%KEEP); | |
if ($errorcode != 14) { print STDERR "ERROR: test_remove_tiny_genes: failed test3\n"; exit;} | |
system "rm -f $input_gff"; | |
system "rm -f $output_gff"; | |
# TEST ERRORCODE=5 (GFF DOES NOT HAVE 9 COLUMNS): | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_remove_tiny_genes: cannot open $input_gff\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print INPUT_GFF "##FASTA\n"; | |
print INPUT_GFF ">seq1\n"; | |
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(INPUT_GFF); | |
%KEEP = (); | |
$KEEP{"augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1"} = 1; | |
$KEEP{"augustus_masked-scaff_000010-processed-gene-0.4"} = 1; | |
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
@temp = split(/\//,$output_gff); | |
$output_gff = $temp[$#temp]; | |
($errorcode,$errormsg) = &remove_tiny_genes($outputdir,$input_gff,$output_gff,\%KEEP); | |
if ($errorcode != 5) { print STDERR "ERROR: test_remove_tiny_genes: failed test4\n"; exit;} | |
system "rm -f $input_gff"; | |
system "rm -f $output_gff"; | |
} | |
#------------------------------------------------------------------# | |
# READ IN THE INPUT GFF FILE, AND MAKE THE OUTPUT GFF FILE WITH GENES THAT ENCODE PROTEINS OF >=$min_length AMINO ACIDS: | |
sub remove_tiny_genes | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES IN | |
my $input_gff = $_[1]; # INPUT GFF FILE | |
my $output_gff = $_[2]; # OUTPUT GFF FILE | |
my $KEEP = $_[3]; # HASH TABLE OF mRNAs AND GENES TO KEEP | |
my $errorcode = 0; # THIS IS RETURNED AS 0 IF NO ERROR OCCURRED. | |
my $errormsg = "none";# THIS IS RETURNED AS 'none' IF NO ERROR OCCURRED. | |
my $end_of_gff = 0; # SAYS WHETHER THE END OF THE GFF FILE HAS BEEN REACHED | |
my $line; # | |
my @temp; # | |
my $name; # NAME OF mRNA/GENE | |
# OPEN THE OUTPUT GFF FILE: | |
$output_gff = $outputdir."/".$output_gff; | |
open(OUTPUT,">$output_gff") || die "ERROR: remove_tiny_genes: cannot open $output_gff\n"; | |
# READ IN THE INPUT GFF FILE: | |
open(INPUT_GFF,"$input_gff") || die "ERROR: remove_tiny_genes: cannot open $input_gff\n"; | |
while(<INPUT_GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
if ($line eq '##FASTA') { $end_of_gff = 1;} | |
if (substr($line,0,1) ne '#' && $end_of_gff == 0) | |
{ | |
@temp = split(/\t+/,$line); | |
if ($#temp == 8) | |
{ | |
if ($temp[2] eq 'contig' || $temp[2] eq 'match' || $temp[2] eq 'match_part' || $temp[2] eq 'protein_match' || | |
$temp[2] eq 'expressed_sequence_match') | |
{ | |
print OUTPUT "$line\n"; | |
} | |
elsif ($temp[2] eq 'gene') | |
{ | |
$name = $temp[8]; # eg. ID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3 | |
@temp = split(/ID=/,$name); | |
$name = $temp[1]; # eg. augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3 | |
@temp = split(/\;/,$name); | |
$name = $temp[0]; # eg. augustus_masked-scaff_000010-processed-gene-0.3 | |
if ($KEEP->{$name}) { print OUTPUT "$line\n";} | |
} | |
elsif ($temp[2] eq 'mRNA') | |
{ | |
$name = $temp[8]; # eg. ID=augustus_masked-scaff_000010-processed-gene-0.5-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.5;N... | |
@temp = split(/ID=/,$name); | |
$name = $temp[1]; # eg. augustus_masked-scaff_000010-processed-gene-0.5-mRNA-1;Parent=augustus_... | |
@temp = split(/\;/,$name); | |
$name = $temp[0]; # eg. augustus_masked-scaff_000010-processed-gene-0.5-mRNA-1 | |
if ($KEEP->{$name}) { print OUTPUT "$line\n";} | |
} | |
elsif ($temp[2] eq 'exon' || $temp[2] eq 'CDS' || $temp[2] eq 'five_prime_UTR' || $temp[2] eq 'three_prime_UTR') | |
{ | |
$name = $temp[8]; # eg. ID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1 | |
@temp = split(/Parent=/,$name); | |
$name = $temp[1]; # eg. augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1 | |
@temp = split(/\;/,$name); | |
$name = $temp[0]; | |
if ($KEEP->{$name}) { print OUTPUT "$line\n";} | |
} | |
else | |
{ | |
$errormsg = "ERROR: remove_tiny_genes: feature $temp[2] in gff line $line\n"; | |
$errorcode = 14; # ERRORCODE=14 (TESTED FOR) | |
return($errorcode,$errormsg); | |
} | |
} | |
else | |
{ | |
$errormsg = "ERROR: remove_tiny_genes: line does not have 9 columns: $line\n"; | |
$errorcode = 5; # ERRORCODE=5 (TESTED FOR) | |
return($errorcode,$errormsg); | |
} | |
} | |
else | |
{ | |
print OUTPUT "$line\n"; | |
} | |
} | |
close(INPUT_GFF); | |
close(OUTPUT); | |
return($errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &record_genes_to_keep | |
sub test_record_genes_to_keep | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES IN | |
my $input_gff; # INPUT GFF FILE | |
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR | |
my %LEN = (); # HASH TABLE WITH LENGTHS OF TRANSLATIONS OF TRANSCRIPTS | |
my $KEEP; # HASH TABLE WITH mRNAs/GENES TO KEEP | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_record_genes_to_keep: cannot open $input_gff\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print INPUT_GFF "##FASTA\n"; | |
print INPUT_GFF ">seq1\n"; | |
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(INPUT_GFF); | |
$LEN{"augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1"} = 30; | |
$LEN{"augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1"} = 29; | |
($KEEP,$errorcode,$errormsg) = &record_genes_to_keep($input_gff,\%LEN,30); | |
if ($errorcode != 0) { print STDERR "ERROR: test_record_genes_to_keep: failed test1\n"; exit;} | |
if (!($KEEP->{"augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1"})) { print STDERR "ERROR: test_record_genes_to_keep: failed test1b\n"; exit;} | |
if (!($KEEP->{"augustus_masked-scaff_000010-processed-gene-0.3"})) { print STDERR "ERROR: test_record_genes_to_keep: failed test1c\n"; exit;} | |
if ($KEEP->{"augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1"}) { print STDERR "ERROR: test_record_genes_to_keep: failed test1c\n"; exit;} | |
if ($KEEP->{"augustus_masked-scaff_000010-processed-gene-0.4"}) { print STDERR "ERROR: test_record_genes_to_keep: failed test1d\n"; exit;} | |
system "rm -f $input_gff"; | |
# TEST ERRORCODE=10 (DO NOT KNOW LENGTH OF ENCODED PROTEIN): | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_record_genes_to_keep: cannot open $input_gff\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print INPUT_GFF "##FASTA\n"; | |
print INPUT_GFF ">seq1\n"; | |
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(INPUT_GFF); | |
$LEN{"augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1"} = 30; | |
delete($LEN{"augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1"}); | |
($KEEP,$errorcode,$errormsg) = &record_genes_to_keep($input_gff,\%LEN,30); | |
if ($errorcode != 10) { print STDERR "ERROR: test_record_genes_to_keep: failed test2\n"; exit;} | |
system "rm -f $input_gff"; | |
# TEST ERRORCODE=4 (GFF DOES NOT HAVE 9 COLUMNS): | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_record_genes_to_keep: cannot open $input_gff\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print INPUT_GFF "##FASTA\n"; | |
print INPUT_GFF ">seq1\n"; | |
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(INPUT_GFF); | |
$LEN{"augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1"} = 30; | |
$LEN{"augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1"} = 29; | |
($KEEP,$errorcode,$errormsg) = &record_genes_to_keep($input_gff,\%LEN,30); | |
if ($errorcode != 4) { print STDERR "ERROR: test_record_genes_to_keep: failed test3\n"; exit;} | |
system "rm -f $input_gff"; | |
} | |
#------------------------------------------------------------------# | |
# READ IN THE INPUT GFF FILE, AND RECORD WHICH GENES HAVE TRANSCRIPTS THAT ENCODE PROTEINS OF >=$min_length AMINO ACIDS: | |
sub record_genes_to_keep | |
{ | |
my $input_gff = $_[0]; # INPUT GFF FILE | |
my $LEN = $_[1]; # HASH TABLE WITH THE LENGTHS OF ENCODED PROTEINS | |
my $min_length = $_[2]; # MINIMUM REQUIRED LENGTH FOR AN ENCODED PROTEIN | |
my $errorcode = 0; # THIS IS RETURNED AS 0 IF NO ERROR OCCURRED. | |
my $errormsg = "none";# THIS IS RETURNED AS 'none' IF NO ERROR OCCURRED. | |
my %KEEP = (); # HASH TABLE OF GENES/TRANSCRIPTS TO KEEP | |
my $line; # | |
my @temp; # | |
my $end_of_gff = 0; # SAYS WHETHER THE END OF THE GFF FILE HAS BEEN REACHED | |
my $name; # NAME OF mRNA | |
my $gene; # NAME OF GENE | |
my $length; # LENGTH OF A TRANSLATION OF A TRANSCRIPT (IN AMINO ACIDS) | |
my $name2; # SECOND NAME FOR mRNA | |
open(INPUT_GFF,"$input_gff") || die "ERROR: record_genes_to_keep: cannot open $input_gff\n"; | |
while(<INPUT_GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
if ($line eq '##FASTA') { $end_of_gff = 1;} | |
if (substr($line,0,1) ne '#' && $end_of_gff == 0) | |
{ | |
@temp = split(/\t+/,$line); | |
if ($#temp == 8) | |
{ | |
if ($temp[2] eq 'mRNA') | |
{ | |
# eg. scaff_000010 maker mRNA 88793 89828 . + . ID=maker-scaff_000010-augustus-gene-0.21-mRNA-1;Parent=maker-scaff_000010-augustus-gene-0.21;... | |
$name = $temp[8]; | |
$gene = $temp[8]; | |
@temp = split(/ID=/,$name); | |
$name = $temp[1]; # eg. maker-scaff_000010-augustus-gene-0.21-mRNA-1;... | |
@temp = split(/\;/,$name); | |
$name = $temp[0]; # eg. maker-scaff_000010-augustus-gene-0.21-mRNA-1 | |
# GET THE NAME OF THE GENE: | |
@temp = split(/Parent=/,$gene); | |
$gene = $temp[1]; # eg. maker-scaff_000010-augustus-gene-0.21;... | |
@temp = split(/\;/,$gene); | |
$gene = $temp[0]; # eg. maker-scaff_000010-augustus-gene-0.21 | |
# GET THE OTHER NAME OF THE mRNA: | |
@temp = split(/\t+/,$line); | |
$name2 = $temp[8]; | |
@temp = split(/Name=/,$name2); | |
$name2 = $temp[1]; | |
@temp = split(/\;/,$name2); | |
$name2 = $temp[0]; | |
# FIND THE LENGTH OF THE ENCODED PROTEIN: | |
if (!($LEN->{$name})) | |
{ | |
$errormsg= "ERROR: record_genes_to_keep: do not know length of protein encoded by $name\n"; | |
$errorcode=10; # ERRORCODE=10 (TESTED FOR) | |
return(\%KEEP,$errorcode,$errormsg); | |
} | |
$length = $LEN->{$name}; | |
# IF THE PROTEIN IS >= $min_length, WE WANT TO KEEP THIS GENE AND mRNA: | |
if ($length >= $min_length) | |
{ | |
$KEEP{$name} = 1; | |
$KEEP{$gene} = 1; | |
$KEEP{$name2} = 1; | |
} | |
} | |
} | |
else | |
{ | |
$errormsg = "ERROR: record_genes_to_keep: line does not have 9 columns: $line\n"; | |
$errorcode = 4; # ERRORCODE=4 (TESTED FOR) | |
return(\%KEEP,$errorcode,$errormsg); | |
} | |
} | |
} | |
close(INPUT_GFF); | |
return(\%KEEP,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &read_scaffold_lengths | |
sub test_read_scaffold_lengths | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT IN. | |
my $assembly; # FILE CONTAINING ASSEMBLY. | |
my $LEN; # HASH TABLE CONTAINING LENGTHS OF SEQUENCES. | |
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR. | |
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR. | |
($assembly,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(ASSEMBLY,">$assembly") || die "ERROR: test_read_scaffold_lengths: cannot open $assembly\n"; | |
print ASSEMBLY ">seq1\n"; | |
print ASSEMBLY "AAAAA\n"; | |
print ASSEMBLY ">seq2\n"; | |
print ASSEMBLY "AAAAATTTTT\n"; | |
print ASSEMBLY "\n"; | |
print ASSEMBLY ">seq3\n"; | |
print ASSEMBLY "AAAAA\n"; | |
print ASSEMBLY "TTTTT\n"; | |
print ASSEMBLY ">seq4\n"; | |
print ASSEMBLY ">seq5\n"; | |
print ASSEMBLY " AAA AA \n"; | |
close(ASSEMBLY); | |
($LEN,$errorcode,$errormsg) = &read_scaffold_lengths($assembly); | |
if ($LEN->{'seq1'} != 5 || $LEN->{'seq2'} != 10 || $LEN->{'seq3'} != 10 || defined($LEN->{'seq4'}) || $LEN->{'seq5'} != 5 || $errorcode != 0) { print STDERR "ERROR: test_read_scaffold_lengths: failed test1\n"; exit;} | |
system "rm -f $assembly"; | |
# TEST ERRORCODE=1: | |
($assembly,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(ASSEMBLY,">$assembly") || die "ERROR: test_read_scaffold_lengths: cannot open $assembly\n"; | |
print ASSEMBLY ">seq1\n"; | |
print ASSEMBLY "AAAAA\n"; | |
print ASSEMBLY ">seq2\n"; | |
print ASSEMBLY "AAAAATTTTT\n"; | |
print ASSEMBLY ">seq1\n"; | |
print ASSEMBLY "AAAAA\n"; | |
close(ASSEMBLY); | |
($LEN,$errorcode,$errormsg) = &read_scaffold_lengths($assembly); | |
if ($errorcode != 1) { print STDERR "ERROR: test_read_scaffold_lengths: failed test2\n"; exit;} | |
system "rm -f $assembly"; | |
# TEST ERRORCODE=2: | |
($assembly,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(ASSEMBLY,">$assembly") || die "ERROR: test_read_scaffold_lengths: cannot open $assembly\n"; | |
print ASSEMBLY "AAAAA\n"; | |
print ASSEMBLY ">seq2\n"; | |
print ASSEMBLY "AAAAATTTTT\n"; | |
print ASSEMBLY ">seq1\n"; | |
print ASSEMBLY "AAAAA\n"; | |
close(ASSEMBLY); | |
($LEN,$errorcode,$errormsg) = &read_scaffold_lengths($assembly); | |
if ($errorcode != 2) { print STDERR "ERROR: test_read_scaffold_lengths: failed test3\n"; exit;} | |
system "rm -f $assembly"; | |
} | |
#------------------------------------------------------------------# | |
# READ IN THE LENGTHS OF CONTIGS IN THE ASSEMBLY: | |
# SUBROUTINE SYNOPSIS: read_scaffold_length(): store lengths of sequences from a fasta file in a hash | |
sub read_scaffold_lengths | |
{ | |
my $assembly = $_[0]; # FILE CONTAINING THE ASSEMBLY | |
my %SCAFFOLDLEN = (); # HASH TABLE FOR STORING LENGTHS OF SCAFFOLDS | |
my $line; # | |
my @temp; # | |
my $length; # LENGTH OF A SCAFFOLD | |
my $name = "none";# NAME OF A SCAFFOLD | |
my $seq; # SEQUENCE OF A SCAFFOLD | |
my $errorcode = 0; # THIS IS RETURNED AS 0 IF NO ERROR OCCURRED. | |
my $errormsg = "none";# THIS IS RETURNED AS 'none' IF NO ERROR OCCURRED. | |
my %SEEN = (); # HASH TABLE TO RECORD WHICH SCAFFOLD NAMES WE HAVE SEEN. | |
open(ASSEMBLY,"$assembly") || die "ERROR: read_scaffold_lengths: cannot open $assembly\n"; | |
while(<ASSEMBLY>) | |
{ | |
$line = $_; | |
chomp $line; | |
if (substr($line,0,1) eq '>') | |
{ | |
@temp = split(/\s+/,$line); | |
$name = $temp[0]; | |
$name = substr($name,1,length($name)-1); | |
if ($SEEN{$name}) | |
{ | |
$errormsg = "ERROR: read_scaffold_lengths: seen scaffold $name already\n"; | |
$errorcode = 1; # ERRORCODE=1 (TESTED FOR) | |
return(\%SCAFFOLDLEN, $errorcode, $errormsg); | |
} | |
$SEEN{$name} = 1; | |
} | |
else | |
{ | |
$seq = $line; | |
# REMOVAL SPACES FROM THE LINE, EITHER INTERNAL SPACES OR SPACES AT EITHER END: | |
$seq = $line; | |
$seq =~ s/\s+//g; | |
# STORE THE LENGTH OF THE SEQUENCE, OR UPDATE THE STORED LENGTH: | |
if ($seq eq '') { $length = 0; } | |
else { $length = length($seq);} | |
if ($name eq 'none') | |
{ | |
$errormsg = "ERROR: read_scaffold_lengths: name is $name\n"; | |
$errorcode = 2; # ERRORCODE=2 (TESTED FOR) | |
return(\%SCAFFOLDLEN, $errorcode, $errormsg); | |
} | |
if (!($SCAFFOLDLEN{$name})){ $SCAFFOLDLEN{$name} = $length;} | |
else {$SCAFFOLDLEN{$name} = $SCAFFOLDLEN{$name} + $length;} | |
} | |
} | |
close(ASSEMBLY); | |
return(\%SCAFFOLDLEN,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# MAKE A FILE OF THE TRANSLATED SEQUENCES FOR THE TRANSCRIPTS IN THE INPUT GFF: | |
sub make_translated_dna | |
{ | |
my $spliced_dna = $_[0]; # FILE OF SPLICED DNA SEQUENCES FOR TRANSCRIPTS | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $cmd; # COMMAND TO RUN | |
my $translated_dna; # FILE OF TRANSLATED DNA SEQUENCES OF TRANSCRIPTS | |
my @temp; # | |
($translated_dna,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
@temp = split(/\//,$translated_dna); | |
$translated_dna = $temp[$#temp]; | |
$cmd = "translate_spliced_dna.pl $spliced_dna $translated_dna $outputdir"; | |
system "$cmd"; | |
system "sleep 1"; # SLEEP FOR 1 SECOND TO MAKE SURE THE OUTPUT FILE IS READY. | |
if (!(-e $translated_dna)) | |
{ | |
$errormsg = "ERROR: make_translated_dna: file translated_dna $translated_dna does not exist\n"; | |
$errorcode = 8; # ERRORCODE=8 (SHOULDN'T HAPPEN, SO CAN'T TEST FOR) | |
return($translated_dna,$errorcode,$errormsg); | |
} | |
return($translated_dna,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# MAKE A FILE OF THE SPLICED DNA SEQUENCES FOR THE TRANSCRIPTS IN THE INPUT GFF: | |
sub make_spliced_dna | |
{ | |
my $input_gff = $_[0]; # INPUT GFF FILE | |
my $outputdir = $_[1]; # DIRECTORY TO PUT OUTPUT FILES IN | |
my $assembly = $_[2]; # GENOME ASSEMBLY | |
my $cmd; # COMMAND TO RUN | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $spliced_dna; # FILE OF SPLICED DNA SEQUENCES FOR TRANSCRIPTS | |
my @temp; # | |
($spliced_dna,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
@temp = split(/\//,$spliced_dna); | |
$spliced_dna = $temp[$#temp]; | |
$cmd = "get_spliced_transcripts_from_gff.pl $input_gff $assembly $spliced_dna $outputdir yes no no"; | |
system "$cmd"; | |
system "sleep 1"; # SLEEP FOR 1 SECOND TO MAKE SURE THE OUTPUT FILE IS READY. | |
if (!(-e $spliced_dna)) | |
{ | |
$errormsg = "ERROR: make_spliced_dna: file spliced_dna $spliced_dna does not exist\n"; | |
$errorcode = 7; # ERRORCODE=7 (SHOULDN'T HAPPEN, SO CAN'T TEST FOR) | |
return($spliced_dna,$errorcode,$errormsg); | |
} | |
return($spliced_dna,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE TO MAKE A FILE NAME FOR A TEMPORARY FILE: | |
sub make_filename | |
{ | |
my $outputdir = $_[0]; # OUTPUT DIRECTORY TO WRITE TEMPORARY FILE NAME TO | |
my $found_name = 0; # SAYS WHETHER WE HAVE FOUND A FILE NAME YET | |
my $filename = "none";# NEW FILE NAME TO USE | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $poss_filename; # POSSIBLE FILE NAME TO USE | |
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAME | |
while($found_name == 0) | |
{ | |
$random_number = rand(); | |
$poss_filename = $outputdir."/tmp".$random_number; | |
if (!(-e $poss_filename)) | |
{ | |
$filename = $poss_filename; | |
$found_name = 1; | |
} | |
} | |
if ($found_name == 0 || $filename eq 'none') | |
{ | |
$errormsg = "ERROR: make_filename: found_name $found_name filename $filename\n"; | |
$errorcode = 6; # ERRORCODE=6 | |
return($filename,$errorcode,$errormsg); | |
}# | |
return($filename,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &print_error | |
sub test_print_error | |
{ | |
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR | |
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR | |
($errormsg,$errorcode) = &print_error(45,45,1); | |
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;} | |
($errormsg,$errorcode) = &print_error('My error message','My error message',1); | |
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;} | |
($errormsg,$errorcode) = &print_error('none',45,1); | |
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;} | |
($errormsg,$errorcode) = &print_error('My error message', 0, 1); | |
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;} | |
} | |
#------------------------------------------------------------------# | |
# PRINT OUT AN ERROR MESSAGE AND EXIT. | |
sub print_error | |
{ | |
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED. | |
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED. | |
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT | |
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; | |
exit; | |
} | |
} | |
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/)) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; | |
exit; | |
} | |
} | |
if ($errormsg eq 'none' || $errorcode == 0) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; | |
exit; | |
} | |
} | |
else | |
{ | |
print STDERR "$errormsg"; | |
exit; | |
} | |
return($errormsg,$errorcode); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment