tsibley/Bio-Regexp-inverted-match

## Bio-Regexp-inverted-match
use strict;
use warnings;

use DDP;
use Bio::Regexp;
use Bio::Tools::IUPAC;
use Bio::PrimarySeq;

my $query = "GTGYCAGCMGCCGCGGTAA";
my $input = "TAACTCCGNGCCAGCAGCCNCGGTAATACGGAGG";

print "Query = $query\n";
print "Input = $input\n\n";

print "With standard use\n";
p match_normal($query, $input);

print "With inverted query/input\n";
p match_inverted($query, $input);

sub match_normal {
    my ($query, $input) = @_;

    my $probe = Bio::Regexp->new->dna->add($query);
    return [ $probe->match($input) ];
}

sub match_inverted {
    my ($query, $input) = @_;

    # Generate all possible substrings of the input sequence with length equal
    # to the query sequence.  This could be ephemeral within a loop instead of
    # stored in an array if memory constraints are an issue.
    my %input;
    my $start = 0;
    my $qlen  = length $query;

    while (($qlen + $start) <= length $input) {
        push @{ $input{ substr $input, $start, $qlen } ||= [] }, $start;
        $start++;
    }

    # Match all inputs of the same length to the query
    my $probe = Bio::Regexp->new->dna;
    $probe->add($_) for keys %input;

    # Decompose IUPAC bases in the query sequence, of which there should be
    # much less than in the input.
    my $queries = Bio::Tools::IUPAC->new(
        -seq => Bio::PrimarySeq->new( -seq => $query, -alphabet => 'dna' )
    );
    my @matches;
    while (my $unambiguous_query = $queries->next_seq) {
        push @matches, $probe->match( $unambiguous_query->seq );
    }

    # Adjust indexes of match start/stop
    my @real_matches;
    while (my $match = splice @matches, 0, 1) {
        my $input_fragment = $match->{regexp};

          ($match->{match},  $match->{regexp})
        = ($match->{regexp}, $match->{match});

        # XXX TODO: Doesn't handle matches on strand 2 (RC)
        $match->{match} = $input;

        for my $offset (@{ $input{ $input_fragment } }) {
            my %adjusted_match = %$match;
            $adjusted_match{start} += $offset;
            $adjusted_match{end}   += $offset;
            push @real_matches, \%adjusted_match;
        }
    }
    return \@real_matches;
}
	use strict;
	use warnings;

	use DDP;
	use Bio::Regexp;
	use Bio::Tools::IUPAC;
	use Bio::PrimarySeq;

	my $query = "GTGYCAGCMGCCGCGGTAA";
	my $input = "TAACTCCGNGCCAGCAGCCNCGGTAATACGGAGG";

	print "Query = $query\n";
	print "Input = $input\n\n";

	print "With standard use\n";
	p match_normal($query, $input);

	print "With inverted query/input\n";
	p match_inverted($query, $input);

	sub match_normal {
	my ($query, $input) = @_;

	my $probe = Bio::Regexp->new->dna->add($query);
	return [ $probe->match($input) ];
	}

	sub match_inverted {
	my ($query, $input) = @_;

	# Generate all possible substrings of the input sequence with length equal
	# to the query sequence. This could be ephemeral within a loop instead of
	# stored in an array if memory constraints are an issue.
	my %input;
	my $start = 0;
	my $qlen = length $query;

	while (($qlen + $start) <= length $input) {
	push @{ $input{ substr $input, $start, $qlen } \|\|= [] }, $start;
	$start++;
	}

	# Match all inputs of the same length to the query
	my $probe = Bio::Regexp->new->dna;
	$probe->add($_) for keys %input;

	# Decompose IUPAC bases in the query sequence, of which there should be
	# much less than in the input.
	my $queries = Bio::Tools::IUPAC->new(
	-seq => Bio::PrimarySeq->new( -seq => $query, -alphabet => 'dna' )
	);
	my @matches;
	while (my $unambiguous_query = $queries->next_seq) {
	push @matches, $probe->match( $unambiguous_query->seq );
	}

	# Adjust indexes of match start/stop
	my @real_matches;
	while (my $match = splice @matches, 0, 1) {
	my $input_fragment = $match->{regexp};

	($match->{match}, $match->{regexp})
	= ($match->{regexp}, $match->{match});

	# XXX TODO: Doesn't handle matches on strand 2 (RC)
	$match->{match} = $input;

	for my $offset (@{ $input{ $input_fragment } }) {
	my %adjusted_match = %$match;
	$adjusted_match{start} += $offset;
	$adjusted_match{end} += $offset;
	push @real_matches, \%adjusted_match;
	}
	}
	return \@real_matches;
	}