Skip to content

Instantly share code, notes, and snippets.

@standage
Last active December 16, 2015 05:59
Show Gist options
  • Save standage/5387932 to your computer and use it in GitHub Desktop.
Save standage/5387932 to your computer and use it in GitHub Desktop.
Select sequence(s) at random from a Fasta file.
#!/usr/bin/env perl
# Copyright (c) 2013, Daniel S. Standage <daniel.standage@gmail.com>
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
use strict;
use Bio::SeqIO;
use Getopt::Long;
use List::Util 'shuffle';
sub print_usage
{
my $OUT = shift(@_);
print $OUT "Usage: perl select-random.pl [options] seqs.fa
Options:
-h|--help print this help message and exit
-l|--seqlength: INT randomly trim the sequence(s) to the given number of
nucleotides; 0 will not trim the sequence; default
is 0
-n|--numseqs: INT the number of sequences to select at random from the
input data; default is 1
";
}
my $numseqs = 1;
my $seqlength = 0;
GetOptions
(
"h|help" => sub{ print_usage(\*STDOUT); exit(0) },
"l|seqlength=i" => \$seqlength,
"n|numseqs=i" => \$numseqs,
);
my $seqfile = shift(@ARGV);
unless(-r $seqfile)
{
printf(STDERR "error: input file '%s' unreadable", $seqfile);
print_usage($seqfile);
die();
}
my %seqs;
my $inseqs = Bio::SeqIO->new( -file => $seqfile, -format => "Fasta" );
while(my $seq = $inseqs->next_seq)
{
my $id = $seq->id;
$seqs{$id} = $seq
}
my @seqids = shuffle(keys(%seqs));
for my $seqid(@seqids)
{
my $seq = $seqs{$seqid};
if($seq->length >= $seqlength)
{
my $start = int(rand($seq->length - $seqlength)) + 1;
my $end = $start + $seqlength - 1;
my $subseq = Bio::PrimarySeq->new( -id => $seqid, -desc => "$start..$end", -seq => $seq->subseq($start, $end) );
my $seqout = Bio::SeqIO->new( -fh => \*STDOUT, -format => "Fasta" );
$seqout->write_seq($subseq);
last;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment