Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Last active December 10, 2015 19:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/4483186 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/4483186 to your computer and use it in GitHub Desktop.
Perl script to convert a gff file from the Chado database to a gtf format file for the RNA-SeqQC software
#!/usr/local/bin/perl
=head1 NAME
convert_chado_gff_to_gtf.pl
=head1 SYNOPSIS
convert_chado_gff_to_gtf.pl input_gff output_gtf outputdir
where input_gff is the input embl file,
output_gtf is the output fasta file,
outputdir is the output directory for writing output files.
=head1 DESCRIPTION
This script takes an input gff file from chado (<input_gff>), and converts it to
gtf format, and writes the output file (<output_fasta>) in directory
<outputdir>.
Note that this script adds ';' at the end of the scaffold names in the output
gtf file.
=head1 VERSION
Perl script last edited 4-Jan-2013.
=head1 CONTACT
alc@sanger.ac.uk (Avril Coghlan)
=cut
#
# Perl script convert_chado_gff_to_gtf.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 4-Jan-13.
# Last edited 4-Jan-2013.
# SCRIPT SYNOPSIS: converts an input gff file from chado into gtf format
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
use strict;
use warnings;
my $num_args = $#ARGV + 1;
if ($num_args != 3)
{
print "Usage of convert_chado_gff_to_gtf_fasta.pl\n\n";
print "perl convert_chado_gff_to_gtf.pl <input_gff> <output_gtf> <outputdir>\n";
print "where <input_gff> is the input embl file,\n";
print " <output_gtf> is the output fasta file,\n";
print " <outputdir> is the output directory for writing output files\n";
print "For example, >perl convert_chado_gff_to_gtf.pl Pk_strainH_chr01.gff Pk_strainH_chr01.gtf\n";
print "/lustre/scratch108/parasites/alc/RNA-SeQC/Pknowlesi\n";
exit;
}
# FIND THE PATH TO THE INPUT GFF FILE:
my $input_gff = $ARGV[0];
# FIND THE PATH TO THE OUTPUT GTF FILE:
my $output_gtf = $ARGV[1];
# FIND THE DIRECTORY TO USE FOR OUTPUT FILES:
my $outputdir = $ARGV[2];
#------------------------------------------------------------------#
# TEST SUBROUTINES:
my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING.
&test_read_RNAs($outputdir);
&test_read_genes_for_transcripts($outputdir);
# xxxyyyzzz avril &test_convert_gff_to_gtf($outputdir);
&test_print_error;
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
&run_main_program($outputdir,$input_gff,$output_gtf);
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
sub run_main_program
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN.
my $input_gff = $_[1]; # THE INPUT GFF FILE
my $output_gtf = $_[2]; # THE OUTPUT GTF FILE
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR.
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR.
my $GENE; # HASH TABLE TO STORE THE GENE NAMES FOR TRANSCRIPTS
my $RNA; # HASH TABLE TO STORE NAMES OF RNA GENES
my $RRNA; # HASH TABLE TO STORE NAMES OF rRNA GENES
# READ IN THE INPUT GFF FILE, TO FIND THE GENE NAMES FOR TRANSCRIPTS:
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# READ IN THE INPUT GFF FILE, TO FIND RNA GENES:
($RNA,$RRNA,$errorcode,$errormsg) = &read_RNAs($input_gff);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# READ IN THE INPUT GFF FILE, AND MAKE THE OUTPUT GTF FILE:
($errorcode,$errormsg) = &convert_gff_to_gtf($outputdir,$input_gff,$output_gtf,$GENE,$RNA,$RRNA);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
}
#------------------------------------------------------------------#
# TEST &read_RNAs
sub test_read_RNAs
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES INTO
my $input_gff; # INPUT GFF FILE
my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR
my $RRNA; # HASH TABLE TO STORE rRNA GENES IN
my $RNA; # HASH TABLE TO STORE RNA GENES IN
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_RNAs: cannot open $input_gff\n";
print INPUT_GFF "Pk_strainH_chr01 chado gene 108 1421 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
print INPUT_GFF "Pk_strainH_chr01 chado mRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n";
print INPUT_GFF "Pk_strainH_chr01 chado gene 795283 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr01 chado rRNA 795283 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado CDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n";
close(INPUT_GFF);
($RNA,$RRNA,$errorcode,$errormsg) = &read_RNAs($input_gff);
if ($errorcode != 0) { print STDERR "ERROR: test_read_RNAs: failed test1a\n"; exit;}
if (!($RRNA->{"PKH_011712:rRNA"}) || !($RRNA->{"PKH_011712"})) { print STDERR "ERROR: test_read_RNAs: failed test1b\n"; exit;}
if (!($RNA->{"PKH_083075.1"}) || !($RNA->{"PKH_083075"})) { print STDERR "ERROR: test_read_RNAs: failed test1c\n"; exit;}
if ($RNA->{"PKH_010010.1"} || $RNA->{"PKH_010010"}) { print STDERR "ERROR: test_read_RNAs: failed test1d\n"; exit;}
if ($RRNA->{"PKH_010010.1"} || $RRNA->{"PKH_010010"}) { print STDERR "ERROR: test_read_RNAs: failed test1e\n"; exit;}
if ($RRNA->{"PKH_083075.1"} || $RRNA->{"PKH_083075"}) { print STDERR "ERROR: test_read_RNAs: failed test1f\n"; exit;}
system "rm -f $input_gff";
# TEST FOR ERRORCODE=8 (THE GFF FILE DOES NOT HAVE 8 COLUMNS):
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_RNAs: cannot open $input_gff\n";
print INPUT_GFF "Pk_strainH_chr01 chado gene 108 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
print INPUT_GFF "Pk_strainH_chr01 chado mRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n";
print INPUT_GFF "Pk_strainH_chr01 chado gene 795283 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr01 chado rRNA 795283 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado CDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n";
close(INPUT_GFF);
($RNA,$RRNA,$errorcode,$errormsg) = &read_RNAs($input_gff);
if ($errorcode != 8) { print STDERR "ERROR: test_read_RNAs: failed test2\n"; exit;}
system "rm -f $input_gff";
# TEST FOR ERRORCODE=9 (UNKNOWN FEATURE TYPE IN GFF FILE):
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_RNAs: cannot open $input_gff\n";
print INPUT_GFF "Pk_strainH_chr01 chado gene 108 1421 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
print INPUT_GFF "Pk_strainH_chr01 chado xRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n";
print INPUT_GFF "Pk_strainH_chr01 chado gene 795283 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr01 chado rRNA 795283 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado CDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n";
close(INPUT_GFF);
($RNA,$RRNA,$errorcode,$errormsg) = &read_RNAs($input_gff);
if ($errorcode != 9) { print STDERR "ERROR: test_read_RNAs: failed test3\n"; exit;}
system "rm -f $input_gff";
}
#------------------------------------------------------------------#
# READ IN THE INPUT GFF FILE, TO FIND RNA GENES:
sub read_RNAs
{
my $input_gff = $_[0]; # INPUT GFF FILE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $line; #
my @temp; #
my $start_of_seq = 0; # SAYS WHETHER WE HAVE FOUND THE START OF THE SEQUENCE
my $feature; # FEATURE TYPE
my $name; # FEATURE NAME
my %RNA = (); # HASH TABLE TO STORE THE NAMES OF RNA GENES
my %RRNA = (); # HASH TABLE TO STORE THE NAMES OF rRNA GENES
my $gene; # GENE NAME
my $transcript; # TRANSCRIPT NAME
# READ IN THE INPUT GFF FILE:
open(INPUT_GFF,"$input_gff") || die "ERROR: read_RNAs: cannot open input_gff $input_gff\n";
while(<INPUT_GFF>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
if (substr($line,0,1) ne '#' && $start_of_seq == 0) # IGNORE COMMENT LINES
{
if ($#temp != 8) # THE GFF FILE SHOULD HAVE 8 COLUMNS
{
$errormsg = "ERROR: read_RNAs: input_gff has line that does not have 8 columns: $line\n";
$errorcode = 8; # ERRORCODE=8 (TESTED FOR)
return(\%RNA,\%RRNA,$errorcode,$errormsg);
}
$feature = $temp[2]; # eg. gene, CDS, mRNA
$name = $temp[8]; # FEATURE NAME
# CHECK THAT THE FEATURES ARE OF THE TYPE THAT WE EXPECT:
if ($feature ne 'gene' && $feature ne 'CDS' && $feature ne 'mRNA' && $feature ne 'polypeptide' && $feature ne 'gap' &&
$feature ne 'tRNA' && $feature ne 'polypeptide_motif' && $feature ne 'pseudogenic_transcript' && $feature ne 'pseudogene' &&
$feature ne 'pseudogenic_exon' && $feature ne 'rRNA' && $feature ne 'repeat_region' && $feature ne 'snoRNA' && $feature ne 'snRNA')
{
$errormsg = "ERROR: read_RNAs: input_gff has feature $feature on line $line\n";
$errorcode = 9; # ERRORCODE=9 (TESTED FOR)
return(\%RNA,\%RRNA,$errorcode,$errormsg);
}
if ($feature eq 'rRNA') # A rRNA GENE
{
# eg. ID=PKH_010104.1;Parent=PKH_010104;
# GET THE TRANSCRIPT NAME:
@temp = split(/ID=/,$name);
$transcript = $temp[1];
@temp = split(/\;/,$transcript);
$transcript = $temp[0];
# GET THE GENE NAME:
@temp = split(/Parent=/,$name);
$gene = $temp[1];
@temp = split(/\;/,$gene);
$gene = $temp[0];
$RRNA{$gene} = 1; # RECORD THAT THIS IS A rRNA GENE
$RRNA{$transcript} = 1; # RECORD THAT THIS IS A rRNA TRANSCRIPT
}
elsif ($feature eq 'tRNA' || $feature eq 'snoRNA' || $feature eq 'snRNA') # A tRNA OR snoRNA OR OTHER RNA GENE
{
# eg. ID=PKH_010104.1;Parent=PKH_010104;
# GET THE TRANSCRIPT NAME:
@temp = split(/ID=/,$name);
$transcript = $temp[1];
@temp = split(/\;/,$transcript);
$transcript = $temp[0];
# GET THE GENE NAME:
@temp = split(/Parent=/,$name);
$gene = $temp[1];
@temp = split(/\;/,$gene);
$gene = $temp[0];
$RNA{$gene} = 1; # RECORD THAT THIS IS A RNA GENE
$RNA{$transcript} = 1; # RECORD THAT THIS IS A RNA TRANSCRIPT
}
}
elsif (substr($line,0,7) eq '##FASTA')
{
$start_of_seq = 1; # THERE IS SEQUENCE AFTER THIS IN THE GTF FILE
}
}
close(INPUT_GFF);
return(\%RNA,\%RRNA,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &read_genes_for_transcripts
sub test_read_genes_for_transcripts
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES INTO
my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR
my $input_gff; # INPUT GFF FILE
my $GENE; # HASH TABLE TO STORE THE GENE NAME FOR TRANSCRIPTS
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_genes_for_transcripts: cannot open $input_gff\n";
print INPUT_GFF "Pk_strainH_chr01 chado gene 108 1421 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
print INPUT_GFF "Pk_strainH_chr01 chado mRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n";
print INPUT_GFF "Pk_strainH_chr01 chado gene 795283 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr01 chado rRNA 795283 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado CDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n";
close(INPUT_GFF);
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff);
if ($errorcode != 0) { print STDERR "ERROR: test_read_genes_for_transcripts: failed test1a\n"; exit;}
if ($GENE->{"PKH_011712:rRNA"} ne 'PKH_011712') { print STDERR "ERROR: test_read_genes_for_transcripts: failed test1b\n"; exit;}
if ($GENE->{"PKH_010010.1"} ne "PKH_010010") { print STDERR "ERROR: test_read_genes_for_transcripts: failed test1c\n"; exit;}
system "rm -f $input_gff";
# TEST FOR ERRORCODE=1 (GFF FILE DOES NOT HAVE 8 COLUMNS):
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_genes_for_transcripts: cannot open $input_gff\n";
print INPUT_GFF "Pk_strainH_chr01 chado gene 108 1421 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
print INPUT_GFF "Pk_strainH_chr01 chado mRNA 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n";
print INPUT_GFF "Pk_strainH_chr01 chado gene 795283 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr01 chado rRNA 795283 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado CDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n";
close(INPUT_GFF);
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff);
if ($errorcode != 1) { print STDERR "ERROR: test_read_genes_for_transcripts: failed test2\n"; exit;}
system "rm -f $input_gff";
# TEST FOR ERRORCODE=4 (UNKNOWN FEATURE TYPE IN GFF FILE):
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_genes_for_transcripts: cannot open $input_gff\n";
print INPUT_GFF "Pk_strainH_chr01 chado gene 108 1421 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
print INPUT_GFF "Pk_strainH_chr01 chado mRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n";
print INPUT_GFF "Pk_strainH_chr01 chado gene 795283 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr01 chado xRNA 795283 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado CDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n";
close(INPUT_GFF);
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff);
if ($errorcode != 4) { print STDERR "ERROR: test_read_genes_for_transcripts: failed test3\n"; exit;}
system "rm -f $input_gff";
# TEST FOR ERRORCODE=5 (TWO DIFFERENT TRANSCRIPT NAMES APPEAR FOR A GENE):
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_genes_for_transcripts: cannot open $input_gff\n";
print INPUT_GFF "Pk_strainH_chr01 chado gene 108 1421 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
print INPUT_GFF "Pk_strainH_chr01 chado mRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n";
print INPUT_GFF "Pk_strainH_chr01 chado mRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010011;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n";
print INPUT_GFF "Pk_strainH_chr01 chado gene 795283 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr01 chado rRNA 795283 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado CDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n";
close(INPUT_GFF);
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff);
if ($errorcode != 5) { print STDERR "ERROR: test_read_genes_for_transcripts: failed test4\n"; exit;}
system "rm -f $input_gff";
}
#------------------------------------------------------------------#
# READ IN THE INPUT GFF FILE, TO FIND THE GENE NAMES FOR TRANSCRIPTS:
sub read_genes_for_transcripts
{
my $input_gff = $_[0]; # INPUT GFF FILE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my %GENE = (); # HASH TABLE TO STORE GENE NAMES FOR TRANSCRIPTS
my $start_of_seq = 0; # SAYS WHETHER WE HAVE FOUND THE START OF THE SEQUENCE
my $line; #
my @temp; #
my $feature; # FEATURE TYPE
my $name; # FEATURE NAME
my $gene; # GENE NAME
my $transcript; # TRANSCRIPT NAME
# READ IN THE INPUT GFF FILE:
open(INPUT_GFF,"$input_gff") || die "ERROR: read_genes_for_transcripts: cannot open input_gff $input_gff\n";
while(<INPUT_GFF>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
if (substr($line,0,1) ne '#' && $start_of_seq == 0) # IGNORE COMMENT LINES
{
if ($#temp != 8) # THE GFF FILE SHOULD HAVE 8 COLUMNS
{
$errormsg = "ERROR: read_genes_for_transcripts: input_gff has line that does not have 8 columns: $line\n";
$errorcode = 1; # ERRORCODE=1 (TESTED FOR)
return(\%GENE,$errorcode,$errormsg);
}
$feature = $temp[2]; # eg. gene, CDS, mRNA
$name = $temp[8]; # FEATURE NAME
# CHECK THAT THE FEATURES ARE OF THE TYPE THAT WE EXPECT:
if ($feature ne 'gene' && $feature ne 'CDS' && $feature ne 'mRNA' && $feature ne 'polypeptide' && $feature ne 'gap' &&
$feature ne 'tRNA' && $feature ne 'polypeptide_motif' && $feature ne 'pseudogenic_transcript' && $feature ne 'pseudogene' &&
$feature ne 'pseudogenic_exon' && $feature ne 'rRNA' && $feature ne 'repeat_region' && $feature ne 'snoRNA' && $feature ne 'snRNA')
{
$errormsg = "ERROR: read_genes_for_transcripts: input_gff has feature $feature on line $line\n";
$errorcode = 4; # ERRORCODE=4 (TESTED FOR)
return(\%GENE,$errorcode,$errormsg);
}
if ($feature eq 'mRNA' || $feature eq 'rRNA') # A mRNA OR rRNA TRANSCRIPT
{
# eg. ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c
# GET THE TRANSCRIPT NAME:
@temp = split(/ID=/,$name);
$transcript = $temp[1];
@temp = split(/\;/,$transcript);
$transcript = $temp[0];
# GET THE GENE NAME:
@temp = split(/Parent=/,$name);
$gene = $temp[1];
@temp = split(/\;/,$gene);
$gene = $temp[0];
if ($GENE{$transcript})
{
$errormsg = "ERROR: read_genes_for_transcripts: already have gene name for transcript $transcript\n";
$errorcode = 5; # ERRORCODE=5 (TESTED FOR)
return(\%GENE,$errorcode,$errormsg);
}
$GENE{$transcript} = $gene;
}
}
elsif (substr($line,0,7) eq '##FASTA')
{
$start_of_seq = 1; # THERE IS SEQUENCE AFTER THIS IN THE GTF FILE
}
}
close(INPUT_GFF);
return(\%GENE,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &convert_gff_to_gtf
sub test_convert_gff_to_gtf
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES INTO
my $input_gff; # INPUT GFF FILE
my $output_gtf; # OUTPUT GTF FILE
my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR
my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR
my $RNA; # HASH TABLE TO STORE RNA GENE NAMES IN
my $RRNA; # HASH TABLE TO STORE rRNA GENE NAMES IN
my $expected_output_gtf; # FILE WITH THE EXPECTED CONTENTS OF $output_gtf
my $differences; # DIFFERENCES BETWEEN $output_gtf AND $expected_output_gtf
my $length_differences; # LENGTH OF $differences
my $GENE; # HASH TABLE OF THE NAMES OF GENES FOR TRANSCRIPTS
my $line; #
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_convert_gff_to_gtf: cannot open $input_gff\n";
print INPUT_GFF "Pk_strainH_chr01 chado gene 108 1421 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
print INPUT_GFF "Pk_strainH_chr01 chado mRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n";
print INPUT_GFF "Pk_strainH_chr01 chado gene 795283 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr01 chado rRNA 795283 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado CDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n";
close(INPUT_GFF);
# READ IN THE INPUT GFF FILE, TO FIND THE GENE NAMES FOR TRANSCRIPTS:
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# READ IN THE INPUT GFF FILE, TO FIND RNA GENES:
($RNA,$RRNA,$errorcode,$errormsg) = &read_RNAs($input_gff);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# CALL &convert_gff_to_gtf:
($output_gtf,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
($errorcode,$errormsg) = &convert_gff_to_gtf($outputdir,$input_gff,$output_gtf,$GENE,$RNA,$RRNA);
if ($errorcode != 0) { print STDERR "ERROR: test_convert_gff_to_gtf: failed test1 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
# OPEN A FILE WITH THE EXPECTED CONTENTS OF $output_gtf:
($expected_output_gtf,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(EXPECTED,">$expected_output_gtf") || die "ERROR: test_convert_gff_to_gtf: cannot open $expected_output_gtf\n";
print EXPECTED "Pk_strainH_chr01; chado gene 108 1421 . + . gene_id \"PKH_010010\"; transcript_id \"PKH_010010\"; transcript_type \"protein_coding\";\n";
print EXPECTED "Pk_strainH_chr01; chado exon 108 758 . + 0 gene_id \"PKH_010010\"; transcript_id \"PKH_010010.1\"; transcript_type \"protein_coding\";\n";
print EXPECTED "Pk_strainH_chr01; chado exon 978 1421 . + 0 gene_id \"PKH_010010\"; transcript_id \"PKH_010010.1\"; transcript_type \"protein_coding\";\n";
print EXPECTED "Pk_strainH_chr01; chado mRNA 108 1421 . + . gene_id \"PKH_010010\"; transcript_id \"PKH_010010.1\"; transcript_type \"protein_coding\";\n";
print EXPECTED "Pk_strainH_chr01; chado gene 795283 795406 . + . gene_id \"PKH_011712\"; transcript_id \"PKH_011712\"; transcript_type \"rRNA\";\n";
print EXPECTED "Pk_strainH_chr01; chado exon 795283 795406 . + 0 gene_id \"PKH_011712\"; transcript_id \"PKH_011712:rRNA\"; transcript_type \"rRNA\";\n";
print EXPECTED "Pk_strainH_chr01; chado rRNA 795283 795406 . + . gene_id \"PKH_011712\"; transcript_id \"PKH_011712:rRNA\"; transcript_type \"rRNA\";\n";
close(EXPECTED);
$differences = "";
open(TEMP,"diff $output_gtf $expected_output_gtf |");
while(<TEMP>)
{
$line = $_;
$differences = $differences.$line;
}
close(TEMP);
$length_differences = length($differences);
if ($length_differences != 0) { print STDERR "ERROR: test_convert_gff_to_gtf: failed test1 (output_gtf $output_gtf expected_output_gtf $expected_output_gtf)\n"; exit;}
system "rm -f $output_gtf";
system "rm -f $expected_output_gtf";
system "rm -f $input_gff";
# TEST FOR ERRORCODE=7 (GFF DOES NOT HAVE 8 COLUMNS):
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_convert_gff_to_gtf: cannot open $input_gff\n";
print INPUT_GFF "Pk_strainH_chr01 chado gene 108 1421 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
print INPUT_GFF "Pk_strainH_chr01 chado mRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n";
print INPUT_GFF "Pk_strainH_chr01 chado gene 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr01 chado rRNA 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado CDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n";
close(INPUT_GFF);
# READ IN THE INPUT GFF FILE, TO FIND THE GENE NAMES FOR TRANSCRIPTS:
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff);
# READ IN THE INPUT GFF FILE, TO FIND RNA GENES:
($RNA,$RRNA,$errorcode,$errormsg) = &read_RNAs($input_gff);
# CALL &convert_gff_to_gtf:
($output_gtf,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
($errorcode,$errormsg) = &convert_gff_to_gtf($outputdir,$input_gff,$output_gtf,$GENE,$RNA,$RRNA);
if ($errorcode != 7) { print STDERR "ERROR: test_convert_gff_to_gtf: failed test2 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
system "rm -f $output_gtf";
system "rm -f $input_gff";
#xxx TEST FOR ERRORCODE=3 (DO NOT KNOW GENE NAME FOR rRNA GENE):
# ($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
# if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# open(INPUT_GFF,">$input_gff") || die "ERROR: test_convert_gff_to_gtf: cannot open $input_gff\n";
# print INPUT_GFF "Pk_strainH_chr01 chado gene 108 1421 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n";
# print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
# print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
# print INPUT_GFF "Pk_strainH_chr01 chado mRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n";
# print INPUT_GFF "Pk_strainH_chr01 chado gene 795283 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n";
# print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n";
# print INPUT_GFF "Pk_strainH_chr01 chado rRNA 795283 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n";
# print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n";
# print INPUT_GFF "Pk_strainH_chr08 chado CDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n";
# print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n";
# close(INPUT_GFF);
# READ IN THE INPUT GFF FILE, TO FIND THE GENE NAMES FOR TRANSCRIPTS:
# ($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff);
# if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# delete($GENE->{"PKH_011712:rRNA"});
# READ IN THE INPUT GFF FILE, TO FIND RNA GENES:
# ($RNA,$RRNA,$errorcode,$errormsg) = &read_RNAs($input_gff);
# if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# CALL &convert_gff_to_gtf:
# ($output_gtf,$errorcode,$errormsg) = &make_filename($outputdir);
# if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# ($errorcode,$errormsg) = &convert_gff_to_gtf($outputdir,$input_gff,$output_gtf,$GENE,$RNA,$RRNA);
# if ($errorcode != 3) { print STDERR "ERROR: test_convert_gff_to_gtf: failed test3 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
# system "rm -f $output_gtf";
# system "rm -f $input_gff";
# TEST FOR ERRORCODE=2 (UNKNOWN TYPE OF FEATURE IN GFF FILE):
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
open(INPUT_GFF,">$input_gff") || die "ERROR: test_convert_gff_to_gtf: cannot open $input_gff\n";
print INPUT_GFF "Pk_strainH_chr01 chado gene 108 1421 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
print INPUT_GFF "Pk_strainH_chr01 chado mRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n";
print INPUT_GFF "Pk_strainH_chr01 chado gene 795283 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr01 chado rRNA 795283 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado CCDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n";
print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n";
close(INPUT_GFF);
# READ IN THE INPUT GFF FILE, TO FIND THE GENE NAMES FOR TRANSCRIPTS:
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff);
# READ IN THE INPUT GFF FILE, TO FIND RNA GENES:
($RNA,$RRNA,$errorcode,$errormsg) = &read_RNAs($input_gff);
# CALL &convert_gff_to_gtf:
($output_gtf,$errorcode,$errormsg) = &make_filename($outputdir);
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
($errorcode,$errormsg) = &convert_gff_to_gtf($outputdir,$input_gff,$output_gtf,$GENE,$RNA,$RRNA);
if ($errorcode != 2) { print STDERR "ERROR: test_convert_gff_to_gtf: failed test4 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
system "rm -f $output_gtf";
system "rm -f $input_gff";
#xxx TEST FOR ERRORCODE=10 (DO NOT KNOW GENE NAME FOR PROTEIN-CODING GENE):
# ($input_gff,$errorcode,$errormsg) = &make_filename($outputdir);
# if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# open(INPUT_GFF,">$input_gff") || die "ERROR: test_convert_gff_to_gtf: cannot open $input_gff\n";
# print INPUT_GFF "Pk_strainH_chr01 chado gene 108 1421 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n";
# print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
# print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n";
# print INPUT_GFF "Pk_strainH_chr01 chado mRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n";
# print INPUT_GFF "Pk_strainH_chr01 chado gene 795283 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n";
# print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n";
# print INPUT_GFF "Pk_strainH_chr01 chado rRNA 795283 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n";
# print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n";
# print INPUT_GFF "Pk_strainH_chr08 chado CDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n";
# print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n";
# close(INPUT_GFF);
# READ IN THE INPUT GFF FILE, TO FIND THE GENE NAMES FOR TRANSCRIPTS:
# ($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff);
# delete($GENE->{"PKH_010010.1"});
# READ IN THE INPUT GFF FILE, TO FIND RNA GENES:
# ($RNA,$RRNA,$errorcode,$errormsg) = &read_RNAs($input_gff);
# CALL &convert_gff_to_gtf:
# ($output_gtf,$errorcode,$errormsg) = &make_filename($outputdir);
# if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); }
# ($errorcode,$errormsg) = &convert_gff_to_gtf($outputdir,$input_gff,$output_gtf,$GENE,$RNA,$RRNA);
# if ($errorcode != 10) { print STDERR "ERROR: test_convert_gff_to_gtf: failed test5 (errorcode $errorcode errormsg $errormsg)\n"; exit;}
# system "rm -f $output_gtf";
# system "rm -f $input_gff";
}
#------------------------------------------------------------------#
# READ IN THE INPUT GFF FILE, AND MAKE THE OUTPUT GTF FILE:
sub convert_gff_to_gtf
{
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN
my $input_gff = $_[1]; # INPUT GFF FILE
my $output_gtf = $_[2]; # OUTPUT GTF FILE
my $GENE = $_[3]; # HASH TABLE TO STORE THE GENE NAMES FOR TRANSCRIPTS
my $RNA = $_[4]; # HASH TABLE TO STORE RNA GENES
my $RRNA = $_[5]; # HASH TABLE TO STORE rRNA GENES
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $line; #
my @temp; #
my $scaffold; # NAME OF THE SCAFFOLD
my $method; # METHOD USED TO FIND THE FEATURE (chado HERE)
my $feature; # FEATURE TYPE
my $start; # START POSITION OF FEATURE
my $end; # END POSITION OF FEATURE
my $score; # SCORE OF FEATURE
my $strand; # STRAND OF FEATURE
my $frame; # FRAME OF FEATURE
my $name; # FEATURE NAME
my $start_of_seq = 0; # SAYS WHETHER WE HAVE FOUND THE START OF THE SEQUENCE
my $new_name; # NEW VERSION OF THE FEATURE NAME
my $gene; # GENE NAME
my $transcript; # TRANSCRIPT NAME
my $cds; # CDS (EXON) NAME
my $i; #
# OPEN AN OUTPUT GTF FILE:
open(OUTPUT_GTF,">$output_gtf") || die "ERROR: convert_gff_to_gtf: cannot open output_gtf $output_gtf\n";
# READ IN THE INPUT GFF FILE:
open(INPUT_GFF,"$input_gff") || die "ERROR: convert_gff_to_gtf: cannot open input_gff $input_gff\n";
while(<INPUT_GFF>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
if (substr($line,0,1) ne '#' && $start_of_seq == 0) # IGNORE COMMENT LINES
{
if ($#temp != 8) # THE GFF FILE SHOULD HAVE 8 COLUMNS
{
$errormsg = "ERROR: convert_gff_to_gtf: input_gff has line that does not have 8 columns: $line\n";
$errorcode = 7; # ERRORCODE=7 (TESTED FOR)
return($errorcode,$errormsg);
}
$scaffold = $temp[0]; # eg. Pk_strainH_chr01
$method = $temp[1]; # eg. chado
$feature = $temp[2]; # eg. gene, CDS, mRNA
$start = $temp[3]; # eg. 108
$end = $temp[4]; # eg. 1421
$score = $temp[5];
$strand = $temp[6];
$frame = $temp[7];
$name = $temp[8]; # FEATURE NAME
# WE NEED TO ADD ';' TO THE END OF THE SCAFFOLD NAMES:
# $scaffold = $scaffold.";"; # xxxyyyzzz avril
# CHECK THAT THE FEATURES ARE OF THE TYPE THAT WE EXPECT:
if ($feature ne 'gene' && $feature ne 'CDS' && $feature ne 'mRNA' && $feature ne 'polypeptide' && $feature ne 'gap' &&
$feature ne 'tRNA' && $feature ne 'polypeptide_motif' && $feature ne 'pseudogenic_transcript' && $feature ne 'pseudogene' &&
$feature ne 'pseudogenic_exon' && $feature ne 'rRNA' && $feature ne 'repeat_region' && $feature ne 'snoRNA' && $feature ne 'snRNA')
{
$errormsg = "ERROR: convert_gff_to_gtf: input_gff has feature $feature on line $line\n";
$errorcode = 2; # ERRORCODE=2 (TESTED FOR)
return($errorcode,$errormsg);
}
# WE ARE ONLY INTERESTED IN 'exon', 'transcript', 'CDS', 'UTR', 'stop_codon', 'start_codon', 'gene', 'Selenocysteine' FEATURES:
if (($feature eq 'gene' || $feature eq 'CDS' || $feature eq 'mRNA' || $feature eq 'rRNA') && ($name =~ /ID=/)) # xxx
{
if ($feature eq 'gene') # eg. ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT
{
# GET THE GENE NAME:
@temp = split(/ID=/,$name);
$gene = $temp[1]; # eg. PKH_010010;...
@temp = split(/\;/,$gene);
$gene = $temp[0]; # eg. PKH_010010
if ($RRNA->{$gene}) # IF IT IS A rRNA GENE
{
$new_name = "gene_id \"$gene\"; transcript_id \"$gene\"; transcript_type \"rRNA\";";
print OUTPUT_GTF "$scaffold\t$method\t$feature\t$start\t$end\t$score\t$strand\t$frame\t$new_name\n";
}
elsif (!($RNA->{$gene})) # IF IT IS NOT AN RNA GENE:
{
$new_name = "gene_id \"$gene\"; transcript_id \"$gene\"; transcript_type \"protein_coding\";";
print OUTPUT_GTF "$scaffold\t$method\t$feature\t$start\t$end\t$score\t$strand\t$frame\t$new_name\n";
}
}
elsif ($feature eq 'CDS') # eg. ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7
{
# GET THE CDS NAME:
@temp = split(/ID=/,$name);
$cds = $temp[1]; # eg. PKH_010010.1:exon:1;...
@temp = split(/\;/,$cds);
$cds = $temp[0]; # eg. PKH_010010.1:exon:1
# GET THE TRANSCRIPT NAME:
@temp = split(/Parent=/,$name);
$transcript = $temp[1];
@temp = split(/\;/,$transcript);
$transcript = $temp[0];
# xxxif (!($GENE->{$transcript}))
# {
# $errormsg= "ERROR: convert_gff_to_gtf: do not know gene for rRNA transcript $transcript\n";
# $errorcode= 3; # ERRORCODE=3 (TESTED FOR)
# return($errorcode,$errormsg);
# }
# print "xxx transcript $transcript cds $cds xxx\n";
@temp = split(/\,/,$transcript);
for ($i = 0; $i <= $#temp; $i++) # xxx WORMBASE HAS JUST ONE LINE FOR A CDS SHARED BY TWO TRANSCRIPTS, eg. Transcript:AC3.5.1
{
$transcript = $temp[$i];
if ($GENE->{$transcript}) # WE SHOULD KNOW THE GENE FOR rRNA OR mRNA TRANSCRIPTS
{
# xxx if (!($GENE->{$transcript}))
# {
# $errormsg= "ERROR: convert_gff_to_gtf: do not know gene for transcript $transcript\n";
# $errorcode= 3; # ERRORCODE=3 (TESTED FOR) xxx
# return($errorcode,$errormsg);
# }
$gene = $GENE->{$transcript};
if ($RRNA->{$gene}) # IF IT IS A rRNA GENE
{
# xxxif (!($GENE->{$transcript}))
# {
# $errormsg= "ERROR: convert_gff_to_gtf: do not know gene for rRNA transcript $transcript\n";
# $errorcode= 3; # ERRORCODE=3 (TESTED FOR)
# return($errorcode,$errormsg);
# }
# $gene = $GENE->{$transcript};
$feature = "exon"; # NOTE: IN THE GENECODE GTF, 'CDS' IS THE CDS, AND 'exon' COULD INCLUDE NON-CODING (UTR) SEQUENCE
$new_name= "gene_id \"$gene\"; transcript_id \"$transcript\"; transcript_type \"rRNA\";";
print OUTPUT_GTF "$scaffold\t$method\t$feature\t$start\t$end\t$score\t$strand\t$frame\t$new_name\n";
}
elsif (!($RNA->{$transcript})) # IF IT IS NOT A RNA GENE:
{
# if (!($GENE->{$transcript}))
# {
# $errormsg= "ERROR: convert_gff_to_gtf: do not know gene for transcript $transcript cds $cds name $name (line $line)\n";
# $errorcode= 10; # ERRORCODE=10 (TESTED FOR)
# return($errorcode,$errormsg);
# }
$feature = "exon"; # NOTE: IN THE GENECODE GTF, 'CDS' IS THE CDS, AND 'exon' COULD INCLUDE NON-CODING (UTR) SEQUENCE
# $gene = $GENE->{$transcript};
$new_name= "gene_id \"$gene\"; transcript_id \"$transcript\"; transcript_type \"protein_coding\";";
print OUTPUT_GTF "$scaffold\t$method\t$feature\t$start\t$end\t$score\t$strand\t$frame\t$new_name\n";
}
}
}
}
elsif ($feature eq 'mRNA' || $feature eq 'rRNA') # eg. ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c
{
# GET THE TRANSCRIPT NAME:
@temp = split(/ID=/,$name);
$transcript = $temp[1];
@temp = split(/\;/,$transcript);
$transcript = $temp[0];
# GET THE GENE NAME:
@temp = split(/Parent=/,$name);
$gene = $temp[1];
@temp = split(/\;/,$gene);
$gene = $temp[0];
if ($RRNA->{$gene} || $RRNA->{$transcript}) # IF IT IS AN RRNA GENE:
{
$new_name= "gene_id \"$gene\"; transcript_id \"$transcript\"; transcript_type \"rRNA\";";
print OUTPUT_GTF "$scaffold\t$method\t$feature\t$start\t$end\t$score\t$strand\t$frame\t$new_name\n";
}
elsif (!($RNA->{$gene}) && !($RNA->{$transcript})) # IF IT IS NOT A RNA GENE:
{
$new_name= "gene_id \"$gene\"; transcript_id \"$transcript\"; transcript_type \"protein_coding\";";
print OUTPUT_GTF "$scaffold\t$method\t$feature\t$start\t$end\t$score\t$strand\t$frame\t$new_name\n";
}
}
}
}
elsif (substr($line,0,7) eq '##FASTA')
{
$start_of_seq = 1; # THERE IS SEQUENCE AFTER THIS IN THE GTF FILE
}
}
close(INPUT_GFF);
close(OUTPUT_GTF);
return($errorcode,$errormsg);
}
#------------------------------------------------------------------#
# SUBROUTINE TO MAKE A FILE NAME FOR A TEMPORARY FILE:
sub make_filename
{
my $outputdir = $_[0]; # OUTPUT DIRECTORY TO WRITE TEMPORARY FILE NAME TO
my $found_name = 0; # SAYS WHETHER WE HAVE FOUND A FILE NAME YET
my $filename = "none";# NEW FILE NAME TO USE
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR
my $poss_filename; # POSSIBLE FILE NAME TO USE
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAME
while($found_name == 0)
{
$random_number = rand();
$poss_filename = $outputdir."/tmp".$random_number;
if (!(-e $poss_filename))
{
$filename = $poss_filename;
$found_name = 1;
}
}
if ($found_name == 0 || $filename eq 'none')
{
$errormsg = "ERROR: make_filename: found_name $found_name filename $filename\n";
$errorcode = 6; # ERRORCODE=6
return($filename,$errorcode,$errormsg);
}
return($filename,$errorcode,$errormsg);
}
#------------------------------------------------------------------#
# TEST &print_error
sub test_print_error
{
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR
($errormsg,$errorcode) = &print_error(45,45,1);
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message','My error message',1);
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;}
($errormsg,$errorcode) = &print_error('none',45,1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;}
($errormsg,$errorcode) = &print_error('My error message', 0, 1);
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;}
}
#------------------------------------------------------------------#
# PRINT OUT AN ERROR MESSAGE AND EXIT.
sub print_error
{
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED.
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED.
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/)
{
if ($called_from_test == 1)
{
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n";
exit;
}
}
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/))
{
if ($called_from_test == 1)
{
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n";
exit;
}
}
if ($errormsg eq 'none' || $errorcode == 0)
{
if ($called_from_test == 1)
{
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13
return($errormsg,$errorcode);
}
else
{
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n";
exit;
}
}
else
{
print STDERR "$errormsg";
exit;
}
return($errormsg,$errorcode);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment