Skip to content

Instantly share code, notes, and snippets.

@tsibley
Created June 27, 2014 21:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tsibley/6ec050b8ac7e54d6a8a7 to your computer and use it in GitHub Desktop.
Save tsibley/6ec050b8ac7e54d6a8a7 to your computer and use it in GitHub Desktop.
use strict;
use warnings;
use DDP;
use Bio::Regexp;
use Bio::Tools::IUPAC;
use Bio::PrimarySeq;
my $query = "GTGYCAGCMGCCGCGGTAA";
my $input = "TAACTCCGNGCCAGCAGCCNCGGTAATACGGAGG";
print "Query = $query\n";
print "Input = $input\n\n";
print "With standard use\n";
p match_normal($query, $input);
print "With inverted query/input\n";
p match_inverted($query, $input);
sub match_normal {
my ($query, $input) = @_;
my $probe = Bio::Regexp->new->dna->add($query);
return [ $probe->match($input) ];
}
sub match_inverted {
my ($query, $input) = @_;
# Generate all possible substrings of the input sequence with length equal
# to the query sequence. This could be ephemeral within a loop instead of
# stored in an array if memory constraints are an issue.
my %input;
my $start = 0;
my $qlen = length $query;
while (($qlen + $start) <= length $input) {
push @{ $input{ substr $input, $start, $qlen } ||= [] }, $start;
$start++;
}
# Match all inputs of the same length to the query
my $probe = Bio::Regexp->new->dna;
$probe->add($_) for keys %input;
# Decompose IUPAC bases in the query sequence, of which there should be
# much less than in the input.
my $queries = Bio::Tools::IUPAC->new(
-seq => Bio::PrimarySeq->new( -seq => $query, -alphabet => 'dna' )
);
my @matches;
while (my $unambiguous_query = $queries->next_seq) {
push @matches, $probe->match( $unambiguous_query->seq );
}
# Adjust indexes of match start/stop
my @real_matches;
while (my $match = splice @matches, 0, 1) {
my $input_fragment = $match->{regexp};
($match->{match}, $match->{regexp})
= ($match->{regexp}, $match->{match});
# XXX TODO: Doesn't handle matches on strand 2 (RC)
$match->{match} = $input;
for my $offset (@{ $input{ $input_fragment } }) {
my %adjusted_match = %$match;
$adjusted_match{start} += $offset;
$adjusted_match{end} += $offset;
push @real_matches, \%adjusted_match;
}
}
return \@real_matches;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment