standage/select-random.pl

## select-random.pl
#!/usr/bin/env perl

# Copyright (c) 2013, Daniel S. Standage <daniel.standage@gmail.com>
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

use strict;
use Bio::SeqIO;
use Getopt::Long;
use List::Util 'shuffle';

sub print_usage
{
  my $OUT = shift(@_);
  print $OUT "Usage: perl select-random.pl [options] seqs.fa
  Options:
    -h|--help               print this help message and exit
    -l|--seqlength: INT     randomly trim the sequence(s) to the given number of
                            nucleotides; 0 will not trim the sequence; default
                            is 0
    -n|--numseqs: INT       the number of sequences to select at random from the
                            input data; default is 1
";
}

my $numseqs   = 1;
my $seqlength = 0;
GetOptions
(
  "h|help"        => sub{ print_usage(\*STDOUT); exit(0) },
  "l|seqlength=i" => \$seqlength,
  "n|numseqs=i"   => \$numseqs,
);

my $seqfile = shift(@ARGV);
unless(-r $seqfile)
{
  printf(STDERR "error: input file '%s' unreadable", $seqfile);
  print_usage($seqfile);
  die();
}

my %seqs;
my $inseqs = Bio::SeqIO->new( -file => $seqfile, -format => "Fasta" );
while(my $seq = $inseqs->next_seq)
{
  my $id = $seq->id;
  $seqs{$id} = $seq
}

my @seqids = shuffle(keys(%seqs));
for my $seqid(@seqids)
{
  my $seq = $seqs{$seqid};
  if($seq->length >= $seqlength)
  {
    my $start = int(rand($seq->length - $seqlength)) + 1;
    my $end = $start + $seqlength - 1;
    my $subseq = Bio::PrimarySeq->new( -id => $seqid, -desc => "$start..$end", -seq => $seq->subseq($start, $end) );
    my $seqout = Bio::SeqIO->new( -fh => \*STDOUT, -format => "Fasta" );
    $seqout->write_seq($subseq);
    last;
  }
}
	#!/usr/bin/env perl

	# Copyright (c) 2013, Daniel S. Standage <daniel.standage@gmail.com>
	#
	# Permission to use, copy, modify, and/or distribute this software for any
	# purpose with or without fee is hereby granted, provided that the above
	# copyright notice and this permission notice appear in all copies.
	#
	# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
	# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
	# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
	# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
	# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
	# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
	# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

	use strict;
	use Bio::SeqIO;
	use Getopt::Long;
	use List::Util 'shuffle';

	sub print_usage
	{
	my $OUT = shift(@_);
	print $OUT "Usage: perl select-random.pl [options] seqs.fa
	Options:
	-h\|--help print this help message and exit
	-l\|--seqlength: INT randomly trim the sequence(s) to the given number of
	nucleotides; 0 will not trim the sequence; default
	is 0
	-n\|--numseqs: INT the number of sequences to select at random from the
	input data; default is 1
	";
	}

	my $numseqs = 1;
	my $seqlength = 0;
	GetOptions
	(
	"h\|help" => sub{ print_usage(\*STDOUT); exit(0) },
	"l\|seqlength=i" => \$seqlength,
	"n\|numseqs=i" => \$numseqs,
	);

	my $seqfile = shift(@ARGV);
	unless(-r $seqfile)
	{
	printf(STDERR "error: input file '%s' unreadable", $seqfile);
	print_usage($seqfile);
	die();
	}

	my %seqs;
	my $inseqs = Bio::SeqIO->new( -file => $seqfile, -format => "Fasta" );
	while(my $seq = $inseqs->next_seq)
	{
	my $id = $seq->id;
	$seqs{$id} = $seq
	}

	my @seqids = shuffle(keys(%seqs));
	for my $seqid(@seqids)
	{
	my $seq = $seqs{$seqid};
	if($seq->length >= $seqlength)
	{
	my $start = int(rand($seq->length - $seqlength)) + 1;
	my $end = $start + $seqlength - 1;
	my $subseq = Bio::PrimarySeq->new( -id => $seqid, -desc => "$start..$end", -seq => $seq->subseq($start, $end) );
	my $seqout = Bio::SeqIO->new( -fh => \*STDOUT, -format => "Fasta" );
	$seqout->write_seq($subseq);
	last;
	}
	}