Created
January 23, 2016 01:51
-
-
Save BenGoldberg1/538e6aaf78433b8bd632 to your computer and use it in GitHub Desktop.
Modified version of http://perl.plover.com/Regex/Regex.pm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use strict; | |
use warnings; | |
# First, I started with http://perl.plover.com/Regex/Regex.pm | |
# Then, I beat it with an OO shaped stick until it looked like this. | |
# Note: Only the parser and executer are in this file; a compiler | |
# based vaguely on the original is in PloverHash.pl, and a compiler | |
# which uses an array instead of a hash is in PloverArray.pl | |
# -*- mode: perl; perl-indent-level: 2 -*- | |
# | |
# Compile and evaluate regular expressions in Perl without | |
# using built-in regular expressions. | |
# | |
# Original Author: Mark-Jason Dominus (mjd-tpj-regex@plover.com) | |
# Owner of the stick which beat MJD's program into this shape: | |
# Ben Goldberg (ben-goldberg@hotmail.com) | |
# Like the original this program is in the PUBLIC DOMAIN. | |
# | |
# | |
# Note to readers: | |
# The code is in three parts. | |
# 1. A parser for regexes | |
# 2. A compiler that trnaslates regexes into machines | |
# 3. An executer that executes a machine on a specified input | |
# | |
# The parser is really turgid and hard to understand, because it isn't | |
# the point of the package. I recommend that you skip to part 2 or 3 | |
# immediately, then work backwards. Parts 2 and 3 are much clearer. | |
package Regex; | |
# Regexps are handled in three phases. First, they're parsed from a string | |
# form into an internal parse tree form. | |
# | |
# Originally it was like so: | |
# ABC => [ "CONCAT" => [ A, B, C ] ] | |
# A* => [ "STAR" => A ] | |
# A+ => [ "PLUS" => A ] | |
# A|B|C => [ "ALTERN" => [ A, B, C ] ] | |
# literal character x => [ "LITERAL" => x ] | |
# | |
# After applying my OO-shaped-stick, the parse tree looks like: | |
# ABC => bless [A, B, C], "CONCAT" | |
# A* => bless [A], "STAR" | |
# , etc.. | |
# | |
# `parse' does this. | |
use vars qw,$PARSE_ERROR @seen,; | |
sub parse { | |
my @t = split(//, $_[1]); | |
local @seen = (); | |
$PARSE_ERROR = "Something is wrong"; | |
my $tree = parse_altern(@t); | |
die $PARSE_ERROR unless $tree; | |
$tree; | |
} | |
sub parse_altern { | |
my @alterns; | |
my @terms; | |
my $c; | |
while (defined($c = shift @_)) { | |
next if $c eq ''; | |
push @seen, $c; | |
if ($c eq '(') { | |
my $next_term = &parse_altern; | |
push @terms, $next_term; | |
} elsif ($c eq ')') { | |
#push @alterns, &joinup(CONCAT => @terms) if @terms; | |
#return &joinup(ALTERN => @alterns); | |
last; | |
} elsif ($c eq '|') { | |
push @alterns, joinup(CONCAT => @terms) if @terms; | |
@terms = (); | |
} elsif ($c eq '*' || $c eq '+') { | |
if (@terms) { | |
$terms[-1] = bless [ $terms[-1] ], ($c eq '*' ? 'STAR' : 'PLUS'); | |
} else { | |
$PARSE_ERROR = "Did not expect $c!\n\t@seen\n\t*\n\t@_\n"; | |
return undef; | |
} | |
} elsif ($c eq '\\') { | |
if( $CLASS::{$_[0]. q{::}} ) { | |
push @terms, bless [], "CLASS::".shift @_; | |
} else { | |
push @terms, bless [shift @_], 'LITERAL'; | |
} | |
} else { | |
push @terms, bless [$c], 'LITERAL'; | |
} | |
} # While there are tokens... | |
push @alterns, joinup(CONCAT => @terms) if @terms; | |
return joinup(ALTERN => @alterns) if @alterns; | |
return undef; | |
} | |
sub joinup { | |
my $tag = shift; | |
if (@_ == 1) { | |
$_[0]; | |
} else { | |
bless [@_], $tag; | |
} | |
} | |
################################################################ | |
# | |
# Execute NFA on a given string | |
# | |
################################################################ | |
package NFA::Exec; | |
sub match { | |
my $pack = shift; | |
my $nfa = shift; | |
my $string = shift; | |
my $machine = $pack->init($nfa, $string); | |
$machine->run(); | |
$machine->final_state(); | |
} | |
sub new { | |
&init(@_); | |
} | |
# | |
# Create a new execution of the specified NFS, and feed it | |
# the specified string as its input. | |
# | |
sub init { | |
my $pack = shift; | |
my $nfa = shift; | |
my $string = shift; | |
my $self = {}; | |
$self->{nfa} = $nfa; | |
$self->{input} = $string; | |
$self->{pos} = 0; | |
$self->{states} = [ $self->{nfa}->start_state ]; | |
bless $self => $pack; | |
$self->epsilon_transit(); | |
$self; | |
} | |
# | |
# Run an execution to the end of the input | |
# | |
sub run { | |
my $self = shift; | |
until ($self->end_of_input() || $self->states() == 0) { | |
$self->step; | |
} | |
} | |
# | |
# Is this execution object at the end of its input? | |
# | |
sub end_of_input { | |
my $self = shift; | |
$self->{pos} >= length($self->{input}); | |
} | |
# | |
# Advance an execution by one step. | |
# | |
sub step { | |
my $self = shift; | |
my $next_symbol = substr($self->{input}, $self->{pos}, 1); | |
if ($next_symbol eq '') { | |
# error | |
} else { | |
$self->transit($next_symbol); | |
$self->epsilon_transit(); | |
} | |
$self->{pos}++; | |
} | |
# | |
# Perform e-transitions in an execution | |
# | |
sub epsilon_transit { | |
my $self = shift; | |
my @newstates = $self->states; | |
my @result = @newstates; | |
my %seen = map {($_ => 1)} @newstates; | |
for (;;) { | |
my $s; | |
my @nextstates; | |
foreach my $s (@newstates) { | |
my @n = $self->{nfa}->transitions_for_symbol($s, '') or next; | |
push @nextstates, @n; | |
} | |
@newstates = grep {! $seen{$_}++} @nextstates; | |
last unless @newstates; | |
push @result, @newstates; | |
} | |
$self->{states} = \@result; | |
} | |
# | |
# Perform a transition | |
# | |
sub transit { | |
my $self = shift; | |
my $symbol = shift; | |
$self->{states} = [ $self->transitions_for_symbol($symbol) ]; | |
} | |
# | |
# Current states | |
# | |
sub states { | |
my $self = shift; | |
@{$self->{states}}; | |
} | |
# | |
# Should we accept? | |
# | |
sub final_state { | |
my $self = shift; | |
my $s; | |
foreach my $s ($self->states) { | |
return 1 if $self->{nfa}->is_end_state($s); | |
} | |
0; | |
} | |
# | |
# Get current transition table | |
# This is interesting because we have to merge the transition | |
# tables for several states. | |
sub transitions_for_symbol { | |
my ($self, $symbol) = @_; | |
my %ttab; | |
foreach my $s ($self->states) { | |
my @n = $self->{nfa}->transitions_for_symbol($s, $symbol) or next; | |
@ttab{ @n } = (); | |
} | |
keys %ttab; | |
} | |
sub NFA::self_test { | |
require Time::HiRes; | |
require Data::Dumper; | |
my @times; | |
push @times, Time::HiRes::time(); | |
#my $e = Regex->parse(q{((0|1|2|3|4|5|6|7|8|9)+|::)*}); | |
my $e = Regex->parse(q{(\d+|::)*}); | |
push @times, Time::HiRes::time(); | |
my $m = NFA->new($e); | |
#print Data::Dumper->new([$m], ['m'])->Terse(1)->Sortkeys(1)->Dump; | |
push @times, Time::HiRes::time(); | |
my $s = '::1234::5678901234567890::::1235467890::888:'; | |
my $r; | |
$r = NFA::Exec->match($m, $s) for 1..100; | |
push @times, Time::HiRes::time(); | |
print "String `$s' ", $r ? 'matched' : 'did not match', "\n"; | |
$r = ($s =~ /^(?:\d+|::)*$/) for 1..100; | |
push @times, Time::HiRes::time(); | |
print "String `$s' ", $r ? 'matched' : 'did not match', "\n"; | |
my $j = 0; | |
for my $t (@times) { | |
my $ot = $t; | |
$t -= $j; | |
$j = $ot; | |
} | |
shift @times; | |
print "Elapsed times, in seconds:\n"; | |
print $_, "\n" for @times; | |
chop $s; | |
$r = ($s =~ /^(?:\d+|::)*$/); | |
print "String `$s' ", $r ? 'matched' : 'did not match', "\n"; | |
$r = NFA::Exec->match($m, $s); | |
print "String `$s' ", $r ? 'matched' : 'did not match', "\n"; | |
} | |
sub NFA::self_test_two { | |
require Data::Dumper; | |
my $s = '::1234::5678901234567890::::1235467890::888::'; | |
my $e = Regex->parse(q{((0|1|2|3|4|5|6|7|8|9)+|::)*}); | |
my $m = NFA->new($e); | |
my $x = NFA::Exec->init($m, $s); | |
print Data::Dumper->new([$m], ['m'])->Terse(1)->Sortkeys(1)->Dump; | |
my $sort = sub { | |
my ($a0, $a1, $a2) = $a =~ /(\D*)(\d*)(.*)/; | |
my ($b0, $b1, $b2) = $b =~ /(\D*)(\d*)(.*)/; | |
$a1 <=> $b1 or $a0 cmp $b0 or $a2 cmp $b2; | |
}; | |
{ | |
print "[", join(' ', sort { $sort->() } $x->states), "]"; | |
print " => ", substr($x->{input}, $x->{pos}, 1), "\n"; | |
(print "End of input\n"), last if $x->end_of_input; | |
(print "No states\n"), last if !$x->states; | |
$x->step; | |
redo; | |
}; | |
print $x->final_state ? "Matched\n" : "Did not match\n"; | |
} | |
1; | |
__END__ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment