Skip to content

Instantly share code, notes, and snippets.

@standage
Last active December 16, 2015 11:19
Show Gist options
  • Save standage/5426573 to your computer and use it in GitHub Desktop.
Save standage/5426573 to your computer and use it in GitHub Desktop.
Script for extracting sequences from a Fasta file given an annotation file in GFF3 format.
#!/usr/bin/env perl
# Copyright (c) 2010-2011, Daniel S. Standage <daniel.standage@gmail.com>
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
# Pragmas
use strict;
use warnings;
# Modules
use Bio::Location::Split;
use Bio::SeqIO;
use Bio::Tools::GFF;
use Getopt::Long;
# Command line usage
my $usage = "
Usage: xtractore [options] gff3_file fasta_file
--help Print this help message and exit
--list=STRING File containing IDs of genes to extract (one ID per line)
--outfile=STRING File to which extracted sequences will be printed
(default is STDOUT)
--quiet Don't print warnings, just fatal errors
--type=STRING Type of feature to extract (default is 'gene'); using 'cds'
will extract the coding sequences of each 'gene' feature
";
# Option defaults
my $help = '';
my $list = '';
my $outfile = '';
my $quiet = '';
my $type = 'gene';
# Parse options from command line
GetOptions
(
'help' => \$help,
'list=s' => \$list,
'outfile=s' => \$outfile,
'quiet' => \$quiet,
'type=s' => \$type,
);
$type = lc($type);
if($help)
{
printf(STDERR $usage);
exit();
}
if($outfile ne '')
{
close(STDOUT);
open(STDOUT, ">", $outfile) or die("[xtractore] Error: unable to write results to '$outfile' $!");
}
if( scalar(@ARGV) < 2 )
{
printf(STDERR "[xtractore] Error: must provide corresponding GFF3 and Fasta files %s", $usage);
die();
}
# Create input streams
my $gff3_infile = shift(@ARGV);
my $fasta_infile = shift(@ARGV);
my $gff3 = Bio::Tools::GFF->new( -file => $gff3_infile, -gff_version => 3 );
my $fasta = Bio::SeqIO->new( -file => $fasta_infile, -format => 'Fasta' );
my $outstream = Bio::SeqIO->new( -fh => \*STDOUT, -format => 'Fasta' );
my $genes_to_extract = {};
if($list ne '')
{
open(my $LIST, "<", $list) or die("[xtractore] Error: unable to open list file '$list'\n");
while(<$LIST>)
{
chomp();
$genes_to_extract->{ $_ } = 1 if($_ ne "");
}
close($LIST);
}
=pod
Subsequences to be extracted will be stored in memory with the following data structure
$subseq_locations =
{
seq_id =>
{
subseq_id => $bio_location_split_obj_1,
subseq_id => $bio_location_split_obj_2,
subseq_id => $bio_location_split_obj_3,
},
};
=cut
my $subseq_locations = {};
my $minus = {};
while( my $feature = $gff3->next_feature )
{
if($type eq "cds")
{
if( $feature->primary_tag eq "mRNA" )
{
my($gene_id) = $feature->get_tag_values("Parent");
my($mRNA_id) = $feature->get_tag_values("ID");
next unless( $list eq '' or $genes_to_extract->{$gene_id} );
$subseq_locations->{ $feature->seq_id }->{ $mRNA_id } = Bio::Location::Split->new();
}
elsif( $feature->primary_tag eq "CDS" )
{
my($mRNA_id) = $feature->get_tag_values("Parent");
if( $subseq_locations->{ $feature->seq_id }->{ $mRNA_id } )
{
my $loc = $feature->location;
$minus->{ $mRNA_id } = 1 if($loc->strand == -1);
$loc->strand(1);
$subseq_locations->{ $feature->seq_id }->{ $mRNA_id }->add_sub_Location( $loc );
}
}
}
else
{
if( $feature->primary_tag eq $type )
{
my $feat_id;
if($feature->has_tag("ID"))
{
($feat_id) = $feature->get_tag_values("ID");
}
else
{
$feat_id = sprintf("%s:%lu-%lu", $feature->seq_id, $feature->location->start, $feature->location->end);
}
next unless($list eq "" or $genes_to_extract->{$feat_id});
$subseq_locations->{ $feature->seq_id }->{ $feat_id } = Bio::Location::Split->new();
my $loc = $feature->location;
$minus->{ $feat_id } = 1 if($loc->strand == -1);
$loc->strand(1);
$subseq_locations->{ $feature->seq_id }->{ $feat_id }->add_sub_Location( $loc );
}
}
}
# Iterate through all sequences and print out extracted subsequences
while( my $seq = $fasta->next_seq )
{
foreach my $subseq_id( sort(keys(%{$subseq_locations->{$seq->id}})) )
{
my $location = $subseq_locations->{$seq->id}->{$subseq_id};
my $subseq = $seq->subseq($location);
my $outseq = Bio::PrimarySeq->new( -id => $subseq_id, -seq => $subseq );
$outseq = $outseq->revcom if($minus->{$subseq_id});
if( $type eq "cds" and uc($outseq->subseq(1,3)) ne "ATG" and not $quiet )
{
printf(STDERR "[xtractore] Warning: subsequence '%s' does not begin with 'ATG'\n", $subseq_id);
}
$outstream->write_seq($outseq);
}
}
close(STDOUT);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment