standage/xtractore.pl

## xtractore.pl
#!/usr/bin/env perl

# Copyright (c) 2010-2011, Daniel S. Standage <daniel.standage@gmail.com>
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

# Pragmas
use strict;
use warnings;

# Modules
use Bio::Location::Split;
use Bio::SeqIO;
use Bio::Tools::GFF;
use Getopt::Long;

# Command line usage
my $usage = "
Usage: xtractore [options] gff3_file fasta_file
  --help               Print this help message and exit
  --list=STRING        File containing IDs of genes to extract (one ID per line)
  --outfile=STRING     File to which extracted sequences will be printed
                       (default is STDOUT)
  --quiet              Don't print warnings, just fatal errors
  --type=STRING        Type of feature to extract (default is 'gene'); using 'cds'
                       will extract the coding sequences of each 'gene' feature
";

# Option defaults
my $help    = '';
my $list    = '';
my $outfile = '';
my $quiet   = '';
my $type    = 'gene';

# Parse options from command line
GetOptions
(
  'help'      => \$help,
  'list=s'    => \$list,
  'outfile=s' => \$outfile,
  'quiet'     => \$quiet,
  'type=s'    => \$type,
);
$type = lc($type);
if($help)
{
  printf(STDERR $usage);
  exit();
}
if($outfile ne '')
{
  close(STDOUT);
  open(STDOUT, ">", $outfile) or die("[xtractore] Error: unable to write results to '$outfile' $!");
}
if( scalar(@ARGV) < 2 )
{
  printf(STDERR "[xtractore] Error: must provide corresponding GFF3 and Fasta files %s", $usage);
  die();
}

# Create input streams
my $gff3_infile  = shift(@ARGV);
my $fasta_infile = shift(@ARGV);
my $gff3 = Bio::Tools::GFF->new( -file => $gff3_infile, -gff_version => 3 );
my $fasta = Bio::SeqIO->new( -file => $fasta_infile, -format => 'Fasta' );
my $outstream = Bio::SeqIO->new( -fh => \*STDOUT, -format => 'Fasta' );

my $genes_to_extract = {};
if($list ne '')
{
  open(my $LIST, "<", $list) or die("[xtractore] Error: unable to open list file '$list'\n");
  while(<$LIST>)
  {
    chomp();
    $genes_to_extract->{ $_ } = 1 if($_ ne "");
  }
  close($LIST);
}

=pod
Subsequences to be extracted will be stored in memory with the following data structure
$subseq_locations =
{
  seq_id =>
  {
    subseq_id => $bio_location_split_obj_1,
    subseq_id => $bio_location_split_obj_2,
    subseq_id => $bio_location_split_obj_3,
  },
};
=cut
my $subseq_locations = {};
my $minus = {};
while( my $feature = $gff3->next_feature )
{
  if($type eq "cds")
  {
    if( $feature->primary_tag eq "mRNA" )
    {
      my($gene_id) = $feature->get_tag_values("Parent");
      my($mRNA_id) = $feature->get_tag_values("ID");
      next unless( $list eq '' or $genes_to_extract->{$gene_id} );
      $subseq_locations->{ $feature->seq_id }->{ $mRNA_id } = Bio::Location::Split->new();
    }
    elsif( $feature->primary_tag eq "CDS" )
    {
      my($mRNA_id) = $feature->get_tag_values("Parent");
      if( $subseq_locations->{ $feature->seq_id }->{ $mRNA_id } )
      {
        my $loc = $feature->location;
        $minus->{ $mRNA_id } = 1 if($loc->strand == -1);
        $loc->strand(1);
        $subseq_locations->{ $feature->seq_id }->{ $mRNA_id }->add_sub_Location( $loc );
      }
    }
  }
  else
  {
    if( $feature->primary_tag eq $type )
    {
      my $feat_id;
      if($feature->has_tag("ID"))
      {
        ($feat_id) = $feature->get_tag_values("ID");
      }
      else
      {
        $feat_id = sprintf("%s:%lu-%lu", $feature->seq_id, $feature->location->start, $feature->location->end);
      }
      next unless($list eq "" or $genes_to_extract->{$feat_id});

      $subseq_locations->{ $feature->seq_id }->{ $feat_id } = Bio::Location::Split->new();
      my $loc = $feature->location;
      $minus->{ $feat_id } = 1 if($loc->strand == -1);
      $loc->strand(1);
      $subseq_locations->{ $feature->seq_id }->{ $feat_id }->add_sub_Location( $loc );
    }
  }
}

# Iterate through all sequences and print out extracted subsequences
while( my $seq = $fasta->next_seq )
{
  foreach my $subseq_id( sort(keys(%{$subseq_locations->{$seq->id}})) )
  {
    my $location = $subseq_locations->{$seq->id}->{$subseq_id};
    my $subseq = $seq->subseq($location);
    my $outseq = Bio::PrimarySeq->new( -id => $subseq_id, -seq => $subseq );
    $outseq = $outseq->revcom if($minus->{$subseq_id});
    if( $type eq "cds" and uc($outseq->subseq(1,3)) ne "ATG" and not $quiet )
    {
      printf(STDERR "[xtractore] Warning: subsequence '%s' does not begin with 'ATG'\n", $subseq_id);
    }
    $outstream->write_seq($outseq);
  }
}

close(STDOUT);
	#!/usr/bin/env perl

	# Copyright (c) 2010-2011, Daniel S. Standage <daniel.standage@gmail.com>
	#
	# Permission to use, copy, modify, and/or distribute this software for any
	# purpose with or without fee is hereby granted, provided that the above
	# copyright notice and this permission notice appear in all copies.
	#
	# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
	# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
	# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
	# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
	# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
	# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
	# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

	# Pragmas
	use strict;
	use warnings;

	# Modules
	use Bio::Location::Split;
	use Bio::SeqIO;
	use Bio::Tools::GFF;
	use Getopt::Long;

	# Command line usage
	my $usage = "
	Usage: xtractore [options] gff3_file fasta_file
	--help Print this help message and exit
	--list=STRING File containing IDs of genes to extract (one ID per line)
	--outfile=STRING File to which extracted sequences will be printed
	(default is STDOUT)
	--quiet Don't print warnings, just fatal errors
	--type=STRING Type of feature to extract (default is 'gene'); using 'cds'
	will extract the coding sequences of each 'gene' feature
	";

	# Option defaults
	my $help = '';
	my $list = '';
	my $outfile = '';
	my $quiet = '';
	my $type = 'gene';

	# Parse options from command line
	GetOptions
	(
	'help' => \$help,
	'list=s' => \$list,
	'outfile=s' => \$outfile,
	'quiet' => \$quiet,
	'type=s' => \$type,
	);
	$type = lc($type);
	if($help)
	{
	printf(STDERR $usage);
	exit();
	}
	if($outfile ne '')
	{
	close(STDOUT);
	open(STDOUT, ">", $outfile) or die("[xtractore] Error: unable to write results to '$outfile' $!");
	}
	if( scalar(@ARGV) < 2 )
	{
	printf(STDERR "[xtractore] Error: must provide corresponding GFF3 and Fasta files %s", $usage);
	die();
	}

	# Create input streams
	my $gff3_infile = shift(@ARGV);
	my $fasta_infile = shift(@ARGV);
	my $gff3 = Bio::Tools::GFF->new( -file => $gff3_infile, -gff_version => 3 );
	my $fasta = Bio::SeqIO->new( -file => $fasta_infile, -format => 'Fasta' );
	my $outstream = Bio::SeqIO->new( -fh => \*STDOUT, -format => 'Fasta' );

	my $genes_to_extract = {};
	if($list ne '')
	{
	open(my $LIST, "<", $list) or die("[xtractore] Error: unable to open list file '$list'\n");
	while(<$LIST>)
	{
	chomp();
	$genes_to_extract->{ $_ } = 1 if($_ ne "");
	}
	close($LIST);
	}

	=pod
	Subsequences to be extracted will be stored in memory with the following data structure
	$subseq_locations =
	{
	seq_id =>
	{
	subseq_id => $bio_location_split_obj_1,
	subseq_id => $bio_location_split_obj_2,
	subseq_id => $bio_location_split_obj_3,
	},
	};
	=cut
	my $subseq_locations = {};
	my $minus = {};
	while( my $feature = $gff3->next_feature )
	{
	if($type eq "cds")
	{
	if( $feature->primary_tag eq "mRNA" )
	{
	my($gene_id) = $feature->get_tag_values("Parent");
	my($mRNA_id) = $feature->get_tag_values("ID");
	next unless( $list eq '' or $genes_to_extract->{$gene_id} );
	$subseq_locations->{ $feature->seq_id }->{ $mRNA_id } = Bio::Location::Split->new();
	}
	elsif( $feature->primary_tag eq "CDS" )
	{
	my($mRNA_id) = $feature->get_tag_values("Parent");
	if( $subseq_locations->{ $feature->seq_id }->{ $mRNA_id } )
	{
	my $loc = $feature->location;
	$minus->{ $mRNA_id } = 1 if($loc->strand == -1);
	$loc->strand(1);
	$subseq_locations->{ $feature->seq_id }->{ $mRNA_id }->add_sub_Location( $loc );
	}
	}
	}
	else
	{
	if( $feature->primary_tag eq $type )
	{
	my $feat_id;
	if($feature->has_tag("ID"))
	{
	($feat_id) = $feature->get_tag_values("ID");
	}
	else
	{
	$feat_id = sprintf("%s:%lu-%lu", $feature->seq_id, $feature->location->start, $feature->location->end);
	}
	next unless($list eq "" or $genes_to_extract->{$feat_id});

	$subseq_locations->{ $feature->seq_id }->{ $feat_id } = Bio::Location::Split->new();
	my $loc = $feature->location;
	$minus->{ $feat_id } = 1 if($loc->strand == -1);
	$loc->strand(1);
	$subseq_locations->{ $feature->seq_id }->{ $feat_id }->add_sub_Location( $loc );
	}
	}
	}

	# Iterate through all sequences and print out extracted subsequences
	while( my $seq = $fasta->next_seq )
	{
	foreach my $subseq_id( sort(keys(%{$subseq_locations->{$seq->id}})) )
	{
	my $location = $subseq_locations->{$seq->id}->{$subseq_id};
	my $subseq = $seq->subseq($location);
	my $outseq = Bio::PrimarySeq->new( -id => $subseq_id, -seq => $subseq );
	$outseq = $outseq->revcom if($minus->{$subseq_id});
	if( $type eq "cds" and uc($outseq->subseq(1,3)) ne "ATG" and not $quiet )
	{
	printf(STDERR "[xtractore] Warning: subsequence '%s' does not begin with 'ATG'\n", $subseq_id);
	}
	$outstream->write_seq($outseq);
	}
	}

	close(STDOUT);