Created
June 27, 2014 21:22
-
-
Save tsibley/6ec050b8ac7e54d6a8a7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use strict; | |
use warnings; | |
use DDP; | |
use Bio::Regexp; | |
use Bio::Tools::IUPAC; | |
use Bio::PrimarySeq; | |
my $query = "GTGYCAGCMGCCGCGGTAA"; | |
my $input = "TAACTCCGNGCCAGCAGCCNCGGTAATACGGAGG"; | |
print "Query = $query\n"; | |
print "Input = $input\n\n"; | |
print "With standard use\n"; | |
p match_normal($query, $input); | |
print "With inverted query/input\n"; | |
p match_inverted($query, $input); | |
sub match_normal { | |
my ($query, $input) = @_; | |
my $probe = Bio::Regexp->new->dna->add($query); | |
return [ $probe->match($input) ]; | |
} | |
sub match_inverted { | |
my ($query, $input) = @_; | |
# Generate all possible substrings of the input sequence with length equal | |
# to the query sequence. This could be ephemeral within a loop instead of | |
# stored in an array if memory constraints are an issue. | |
my %input; | |
my $start = 0; | |
my $qlen = length $query; | |
while (($qlen + $start) <= length $input) { | |
push @{ $input{ substr $input, $start, $qlen } ||= [] }, $start; | |
$start++; | |
} | |
# Match all inputs of the same length to the query | |
my $probe = Bio::Regexp->new->dna; | |
$probe->add($_) for keys %input; | |
# Decompose IUPAC bases in the query sequence, of which there should be | |
# much less than in the input. | |
my $queries = Bio::Tools::IUPAC->new( | |
-seq => Bio::PrimarySeq->new( -seq => $query, -alphabet => 'dna' ) | |
); | |
my @matches; | |
while (my $unambiguous_query = $queries->next_seq) { | |
push @matches, $probe->match( $unambiguous_query->seq ); | |
} | |
# Adjust indexes of match start/stop | |
my @real_matches; | |
while (my $match = splice @matches, 0, 1) { | |
my $input_fragment = $match->{regexp}; | |
($match->{match}, $match->{regexp}) | |
= ($match->{regexp}, $match->{match}); | |
# XXX TODO: Doesn't handle matches on strand 2 (RC) | |
$match->{match} = $input; | |
for my $offset (@{ $input{ $input_fragment } }) { | |
my %adjusted_match = %$match; | |
$adjusted_match{start} += $offset; | |
$adjusted_match{end} += $offset; | |
push @real_matches, \%adjusted_match; | |
} | |
} | |
return \@real_matches; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment