Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created August 27, 2013 14:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save avrilcoghlan/6354213 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/6354213 to your computer and use it in GitHub Desktop.
Perl script that, given an input gff file, identifies exons that have a 0 bp intron between them, or are overlapping exons, and merges them.
#!/usr/bin/env perl
=head1 NAME
merge_overlapping_exons.pl
=head1 SYNOPSIS
merge_overlapping_exons.pl input_gff output_gff outputdir input_fasta
where input_gff is the input gff file,
output_gff is the output gff file,
outputdir is the directory to write output files in,
input_fasta is the input fasta file for the assembly.
=head1 DESCRIPTION
This script takes an input gff file, and identifies exons that have a 0 bp
intron between them (ie. are adjacent exons) or are overlapping exons, and merges
them. The corrected exons are written to an output gff file.
=head1 VERSION
Perl script last edited 22-Jul-2013.
=head1 CONTACT
alc@sanger.ac.uk (Avril Coghlan)
=cut
#
# Perl script merge_overlapping_exons.pl
# Written by Avril Coghlan (alc@sanger.ac.uk)
# 22-Jul-13.
# Last edited 22-Jul-2013.
# SCRIPT SYNOPSIS: merge_overlapping_exons.pl: given an input gff file, identifies exons that have a 0 bp intron between them, or are overlapping exons, and merges them.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
use strict;
use warnings;
# xxx
BEGIN {
unshift (@INC, '/nfs/users/nfs_a/alc/Documents/git/helminth_scripts/modules');
}
use HelminthGenomeAnalysis::AvrilGffUtils;
use HelminthGenomeAnalysis::AvrilFileUtils;
use HelminthGenomeAnalysis::AvrilFastaUtils;
my $num_args = $#ARGV + 1;
if ($num_args != 4)
{
print "Usage of merge_overlapping_exons.pl\n\n";
print "perl merge_overlapping_exons.pl <input_gff> <output_gff> <outputdir> <input_fasta_file>\n";
print "where <input_gff> is the input gff file,\n";
print " <output_gff> is the output gff file,\n";
print " <outputdir> is the directory to write output files in,\n";
print " <input_fasta_file> is the input fasta file for the assembly\n";
print "For example, >perl merge_overlapping_exons.pl final.gff final2.gff . TRIC.v1.fa\n";
exit;
}
# FIND THE PATH TO THE INPUT GFF FILE:
my $input_gff = $ARGV[0];
# FIND THE NAME OF THE OUTPUT GFF FILE:
my $output_gff = $ARGV[1];
# FIND THE NAME OF THE DIRECTORY TO WRITE OUTPUT FILES IN:
my $outputdir = $ARGV[2];
# FIND THE NAME OF THE INPUT FASTA FILE FOR THE ASSEMBLY:
my $input_fasta = $ARGV[3];
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
&run_main_program($input_gff,$output_gff,$outputdir,$input_fasta);
print STDERR "FINISHED.\n";
#------------------------------------------------------------------#
# RUN THE MAIN PART OF THE CODE:
sub run_main_program
{
my $input_gff = $_[0]; # THE INPUT GFF FILE
my $output_gff = $_[1]; # THE OUTPUT GFF FILE
my $outputdir = $_[2]; # THE DIRECTORY TO WRITE OUTPUT FILES IN
my $input_fasta = $_[3]; # THE INPUT ASSEMBLY FASTA FILE
my $input_fasta_obj = $_[4]; # OBJECT FOR THE INPUT FASTA FILE
my $contigs2len; # HASH TABLE OF THE LENGTHS OF SCAFFOLD/CONTIG SEQUENCES
my $contigslen_file; # FILE WITH THE LENGTHS OF SCAFFOLD/CONTIG SEQUENCES
my $contig; # A CONTIG/SCAFFOLD NAME
my $cds_gff; # GFF FILE OF 'CDS' FEATURES
my $feature_types; # FEATURE TYPES TO PUT IN $cds_gff
my $returnvalue; # RETURN VALUE FROM A FUNCTION
my $sorted_cds_gff; # A SORTED VERSION OF $cds_gff
my $exon_gff; # GFF FILE OF 'exon' FEATURES
my $sorted_exon_gff; # A SORTED VERSION OF $exon_gff
my $contiglen; # LENGTH OF A SCAFFOLD
# GET THE LENGTHS OF THE SEQUENCES IN THE INPUT FASTA FILE, AND WRITE THEM OUT IN A TEMPORARY FILE:
$input_fasta_obj = HelminthGenomeAnalysis::AvrilFastaUtils->new(fasta_file => $input_fasta);
$contigs2len = HelminthGenomeAnalysis::AvrilFastaUtils::build_contigs2len($input_fasta_obj);
$contigslen_file = HelminthGenomeAnalysis::AvrilFileUtils::make_filename($outputdir); # MAKE A TEMPORARY FILE IN THE CURRENT DIRECTORY
open(CONTIGSLENFILE,">$contigslen_file") || die "ERROR: run_main_program: cannot open $contigslen_file\n";
foreach my $contig (keys %{$contigs2len})
{
$contiglen = $contigs2len->{$contig};
# ADD ONE TO THE SCAFFOLD LENGTH, SO THAT add_flanking_region_to_gff_features FUNCTION WILL WORK, EVEN IF THE END OF AN EXON/CDS FEATURE
# IS THE LAST BASE OF THE SCAFFOLD:
$contiglen = $contiglen + 1;
print CONTIGSLENFILE "$contig\t$contiglen\n";
}
close(CONTIGSLENFILE);
# GET A GFF FILE OF 'CDS' FEATURES:
$cds_gff = HelminthGenomeAnalysis::AvrilFileUtils::make_filename($outputdir); # MAKE A TEMPORARY FILE IN THE CURRENT DIRECTORY
$feature_types = ['CDS'];
$returnvalue = HelminthGenomeAnalysis::AvrilGffUtils::get_gff_lines_for_feature_types($input_gff,$feature_types,$cds_gff);
# SORT $cds_gff:
$sorted_cds_gff = HelminthGenomeAnalysis::AvrilFileUtils::make_filename($outputdir);
$returnvalue = HelminthGenomeAnalysis::AvrilGffUtils::sort_gff($cds_gff,$sorted_cds_gff,'CDS');
# GET A GFF FILE OF 'exon' FEATURES:
$exon_gff = HelminthGenomeAnalysis::AvrilFileUtils::make_filename($outputdir); # MAKE A TEMPORARY FILE IN THE CURRENT DIRECTORY
$feature_types = ['exon'];
$returnvalue = HelminthGenomeAnalysis::AvrilGffUtils::get_gff_lines_for_feature_types($input_gff,$feature_types,$exon_gff);
# SORT $exon_gff:
$sorted_exon_gff = HelminthGenomeAnalysis::AvrilFileUtils::make_filename($outputdir);
$returnvalue = HelminthGenomeAnalysis::AvrilGffUtils::sort_gff($exon_gff,$sorted_exon_gff,'exon');
# MERGE OVERLAPPING CDS FEATURES, OR FEATURES THAT HAVE A 0 bp INTRON BETWEEN THEM:
$returnvalue = HelminthGenomeAnalysis::AvrilGffUtils::merge_overlapping_cds($sorted_cds_gff,$sorted_exon_gff,$input_gff,$output_gff,$outputdir,$contigslen_file);
# DELETE TEMPORARY FILES:
system "rm -f $exon_gff";
system "rm -f $cds_gff";
system "rm -f $sorted_cds_gff";
system "rm -f $sorted_exon_gff";
system "rm -f $contigslen_file";
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment