Last active
December 16, 2015 11:19
-
-
Save standage/5426573 to your computer and use it in GitHub Desktop.
Script for extracting sequences from a Fasta file given an annotation file in GFF3 format.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
# Copyright (c) 2010-2011, Daniel S. Standage <daniel.standage@gmail.com> | |
# | |
# Permission to use, copy, modify, and/or distribute this software for any | |
# purpose with or without fee is hereby granted, provided that the above | |
# copyright notice and this permission notice appear in all copies. | |
# | |
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | |
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | |
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | |
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | |
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | |
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | |
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |
# Pragmas | |
use strict; | |
use warnings; | |
# Modules | |
use Bio::Location::Split; | |
use Bio::SeqIO; | |
use Bio::Tools::GFF; | |
use Getopt::Long; | |
# Command line usage | |
my $usage = " | |
Usage: xtractore [options] gff3_file fasta_file | |
--help Print this help message and exit | |
--list=STRING File containing IDs of genes to extract (one ID per line) | |
--outfile=STRING File to which extracted sequences will be printed | |
(default is STDOUT) | |
--quiet Don't print warnings, just fatal errors | |
--type=STRING Type of feature to extract (default is 'gene'); using 'cds' | |
will extract the coding sequences of each 'gene' feature | |
"; | |
# Option defaults | |
my $help = ''; | |
my $list = ''; | |
my $outfile = ''; | |
my $quiet = ''; | |
my $type = 'gene'; | |
# Parse options from command line | |
GetOptions | |
( | |
'help' => \$help, | |
'list=s' => \$list, | |
'outfile=s' => \$outfile, | |
'quiet' => \$quiet, | |
'type=s' => \$type, | |
); | |
$type = lc($type); | |
if($help) | |
{ | |
printf(STDERR $usage); | |
exit(); | |
} | |
if($outfile ne '') | |
{ | |
close(STDOUT); | |
open(STDOUT, ">", $outfile) or die("[xtractore] Error: unable to write results to '$outfile' $!"); | |
} | |
if( scalar(@ARGV) < 2 ) | |
{ | |
printf(STDERR "[xtractore] Error: must provide corresponding GFF3 and Fasta files %s", $usage); | |
die(); | |
} | |
# Create input streams | |
my $gff3_infile = shift(@ARGV); | |
my $fasta_infile = shift(@ARGV); | |
my $gff3 = Bio::Tools::GFF->new( -file => $gff3_infile, -gff_version => 3 ); | |
my $fasta = Bio::SeqIO->new( -file => $fasta_infile, -format => 'Fasta' ); | |
my $outstream = Bio::SeqIO->new( -fh => \*STDOUT, -format => 'Fasta' ); | |
my $genes_to_extract = {}; | |
if($list ne '') | |
{ | |
open(my $LIST, "<", $list) or die("[xtractore] Error: unable to open list file '$list'\n"); | |
while(<$LIST>) | |
{ | |
chomp(); | |
$genes_to_extract->{ $_ } = 1 if($_ ne ""); | |
} | |
close($LIST); | |
} | |
=pod | |
Subsequences to be extracted will be stored in memory with the following data structure | |
$subseq_locations = | |
{ | |
seq_id => | |
{ | |
subseq_id => $bio_location_split_obj_1, | |
subseq_id => $bio_location_split_obj_2, | |
subseq_id => $bio_location_split_obj_3, | |
}, | |
}; | |
=cut | |
my $subseq_locations = {}; | |
my $minus = {}; | |
while( my $feature = $gff3->next_feature ) | |
{ | |
if($type eq "cds") | |
{ | |
if( $feature->primary_tag eq "mRNA" ) | |
{ | |
my($gene_id) = $feature->get_tag_values("Parent"); | |
my($mRNA_id) = $feature->get_tag_values("ID"); | |
next unless( $list eq '' or $genes_to_extract->{$gene_id} ); | |
$subseq_locations->{ $feature->seq_id }->{ $mRNA_id } = Bio::Location::Split->new(); | |
} | |
elsif( $feature->primary_tag eq "CDS" ) | |
{ | |
my($mRNA_id) = $feature->get_tag_values("Parent"); | |
if( $subseq_locations->{ $feature->seq_id }->{ $mRNA_id } ) | |
{ | |
my $loc = $feature->location; | |
$minus->{ $mRNA_id } = 1 if($loc->strand == -1); | |
$loc->strand(1); | |
$subseq_locations->{ $feature->seq_id }->{ $mRNA_id }->add_sub_Location( $loc ); | |
} | |
} | |
} | |
else | |
{ | |
if( $feature->primary_tag eq $type ) | |
{ | |
my $feat_id; | |
if($feature->has_tag("ID")) | |
{ | |
($feat_id) = $feature->get_tag_values("ID"); | |
} | |
else | |
{ | |
$feat_id = sprintf("%s:%lu-%lu", $feature->seq_id, $feature->location->start, $feature->location->end); | |
} | |
next unless($list eq "" or $genes_to_extract->{$feat_id}); | |
$subseq_locations->{ $feature->seq_id }->{ $feat_id } = Bio::Location::Split->new(); | |
my $loc = $feature->location; | |
$minus->{ $feat_id } = 1 if($loc->strand == -1); | |
$loc->strand(1); | |
$subseq_locations->{ $feature->seq_id }->{ $feat_id }->add_sub_Location( $loc ); | |
} | |
} | |
} | |
# Iterate through all sequences and print out extracted subsequences | |
while( my $seq = $fasta->next_seq ) | |
{ | |
foreach my $subseq_id( sort(keys(%{$subseq_locations->{$seq->id}})) ) | |
{ | |
my $location = $subseq_locations->{$seq->id}->{$subseq_id}; | |
my $subseq = $seq->subseq($location); | |
my $outseq = Bio::PrimarySeq->new( -id => $subseq_id, -seq => $subseq ); | |
$outseq = $outseq->revcom if($minus->{$subseq_id}); | |
if( $type eq "cds" and uc($outseq->subseq(1,3)) ne "ATG" and not $quiet ) | |
{ | |
printf(STDERR "[xtractore] Warning: subsequence '%s' does not begin with 'ATG'\n", $subseq_id); | |
} | |
$outstream->write_seq($outseq); | |
} | |
} | |
close(STDOUT); | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment