Skip to content

Instantly share code, notes, and snippets.

@BenGoldberg1
Created January 23, 2016 01:51
Show Gist options
  • Save BenGoldberg1/538e6aaf78433b8bd632 to your computer and use it in GitHub Desktop.
Save BenGoldberg1/538e6aaf78433b8bd632 to your computer and use it in GitHub Desktop.
use strict;
use warnings;
# First, I started with http://perl.plover.com/Regex/Regex.pm
# Then, I beat it with an OO shaped stick until it looked like this.
# Note: Only the parser and executer are in this file; a compiler
# based vaguely on the original is in PloverHash.pl, and a compiler
# which uses an array instead of a hash is in PloverArray.pl
# -*- mode: perl; perl-indent-level: 2 -*-
#
# Compile and evaluate regular expressions in Perl without
# using built-in regular expressions.
#
# Original Author: Mark-Jason Dominus (mjd-tpj-regex@plover.com)
# Owner of the stick which beat MJD's program into this shape:
# Ben Goldberg (ben-goldberg@hotmail.com)
# Like the original this program is in the PUBLIC DOMAIN.
#
#
# Note to readers:
# The code is in three parts.
# 1. A parser for regexes
# 2. A compiler that trnaslates regexes into machines
# 3. An executer that executes a machine on a specified input
#
# The parser is really turgid and hard to understand, because it isn't
# the point of the package. I recommend that you skip to part 2 or 3
# immediately, then work backwards. Parts 2 and 3 are much clearer.
package Regex;
# Regexps are handled in three phases. First, they're parsed from a string
# form into an internal parse tree form.
#
# Originally it was like so:
# ABC => [ "CONCAT" => [ A, B, C ] ]
# A* => [ "STAR" => A ]
# A+ => [ "PLUS" => A ]
# A|B|C => [ "ALTERN" => [ A, B, C ] ]
# literal character x => [ "LITERAL" => x ]
#
# After applying my OO-shaped-stick, the parse tree looks like:
# ABC => bless [A, B, C], "CONCAT"
# A* => bless [A], "STAR"
# , etc..
#
# `parse' does this.
use vars qw,$PARSE_ERROR @seen,;
sub parse {
my @t = split(//, $_[1]);
local @seen = ();
$PARSE_ERROR = "Something is wrong";
my $tree = parse_altern(@t);
die $PARSE_ERROR unless $tree;
$tree;
}
sub parse_altern {
my @alterns;
my @terms;
my $c;
while (defined($c = shift @_)) {
next if $c eq '';
push @seen, $c;
if ($c eq '(') {
my $next_term = &parse_altern;
push @terms, $next_term;
} elsif ($c eq ')') {
#push @alterns, &joinup(CONCAT => @terms) if @terms;
#return &joinup(ALTERN => @alterns);
last;
} elsif ($c eq '|') {
push @alterns, joinup(CONCAT => @terms) if @terms;
@terms = ();
} elsif ($c eq '*' || $c eq '+') {
if (@terms) {
$terms[-1] = bless [ $terms[-1] ], ($c eq '*' ? 'STAR' : 'PLUS');
} else {
$PARSE_ERROR = "Did not expect $c!\n\t@seen\n\t*\n\t@_\n";
return undef;
}
} elsif ($c eq '\\') {
if( $CLASS::{$_[0]. q{::}} ) {
push @terms, bless [], "CLASS::".shift @_;
} else {
push @terms, bless [shift @_], 'LITERAL';
}
} else {
push @terms, bless [$c], 'LITERAL';
}
} # While there are tokens...
push @alterns, joinup(CONCAT => @terms) if @terms;
return joinup(ALTERN => @alterns) if @alterns;
return undef;
}
sub joinup {
my $tag = shift;
if (@_ == 1) {
$_[0];
} else {
bless [@_], $tag;
}
}
################################################################
#
# Execute NFA on a given string
#
################################################################
package NFA::Exec;
sub match {
my $pack = shift;
my $nfa = shift;
my $string = shift;
my $machine = $pack->init($nfa, $string);
$machine->run();
$machine->final_state();
}
sub new {
&init(@_);
}
#
# Create a new execution of the specified NFS, and feed it
# the specified string as its input.
#
sub init {
my $pack = shift;
my $nfa = shift;
my $string = shift;
my $self = {};
$self->{nfa} = $nfa;
$self->{input} = $string;
$self->{pos} = 0;
$self->{states} = [ $self->{nfa}->start_state ];
bless $self => $pack;
$self->epsilon_transit();
$self;
}
#
# Run an execution to the end of the input
#
sub run {
my $self = shift;
until ($self->end_of_input() || $self->states() == 0) {
$self->step;
}
}
#
# Is this execution object at the end of its input?
#
sub end_of_input {
my $self = shift;
$self->{pos} >= length($self->{input});
}
#
# Advance an execution by one step.
#
sub step {
my $self = shift;
my $next_symbol = substr($self->{input}, $self->{pos}, 1);
if ($next_symbol eq '') {
# error
} else {
$self->transit($next_symbol);
$self->epsilon_transit();
}
$self->{pos}++;
}
#
# Perform e-transitions in an execution
#
sub epsilon_transit {
my $self = shift;
my @newstates = $self->states;
my @result = @newstates;
my %seen = map {($_ => 1)} @newstates;
for (;;) {
my $s;
my @nextstates;
foreach my $s (@newstates) {
my @n = $self->{nfa}->transitions_for_symbol($s, '') or next;
push @nextstates, @n;
}
@newstates = grep {! $seen{$_}++} @nextstates;
last unless @newstates;
push @result, @newstates;
}
$self->{states} = \@result;
}
#
# Perform a transition
#
sub transit {
my $self = shift;
my $symbol = shift;
$self->{states} = [ $self->transitions_for_symbol($symbol) ];
}
#
# Current states
#
sub states {
my $self = shift;
@{$self->{states}};
}
#
# Should we accept?
#
sub final_state {
my $self = shift;
my $s;
foreach my $s ($self->states) {
return 1 if $self->{nfa}->is_end_state($s);
}
0;
}
#
# Get current transition table
# This is interesting because we have to merge the transition
# tables for several states.
sub transitions_for_symbol {
my ($self, $symbol) = @_;
my %ttab;
foreach my $s ($self->states) {
my @n = $self->{nfa}->transitions_for_symbol($s, $symbol) or next;
@ttab{ @n } = ();
}
keys %ttab;
}
sub NFA::self_test {
require Time::HiRes;
require Data::Dumper;
my @times;
push @times, Time::HiRes::time();
#my $e = Regex->parse(q{((0|1|2|3|4|5|6|7|8|9)+|::)*});
my $e = Regex->parse(q{(\d+|::)*});
push @times, Time::HiRes::time();
my $m = NFA->new($e);
#print Data::Dumper->new([$m], ['m'])->Terse(1)->Sortkeys(1)->Dump;
push @times, Time::HiRes::time();
my $s = '::1234::5678901234567890::::1235467890::888:';
my $r;
$r = NFA::Exec->match($m, $s) for 1..100;
push @times, Time::HiRes::time();
print "String `$s' ", $r ? 'matched' : 'did not match', "\n";
$r = ($s =~ /^(?:\d+|::)*$/) for 1..100;
push @times, Time::HiRes::time();
print "String `$s' ", $r ? 'matched' : 'did not match', "\n";
my $j = 0;
for my $t (@times) {
my $ot = $t;
$t -= $j;
$j = $ot;
}
shift @times;
print "Elapsed times, in seconds:\n";
print $_, "\n" for @times;
chop $s;
$r = ($s =~ /^(?:\d+|::)*$/);
print "String `$s' ", $r ? 'matched' : 'did not match', "\n";
$r = NFA::Exec->match($m, $s);
print "String `$s' ", $r ? 'matched' : 'did not match', "\n";
}
sub NFA::self_test_two {
require Data::Dumper;
my $s = '::1234::5678901234567890::::1235467890::888::';
my $e = Regex->parse(q{((0|1|2|3|4|5|6|7|8|9)+|::)*});
my $m = NFA->new($e);
my $x = NFA::Exec->init($m, $s);
print Data::Dumper->new([$m], ['m'])->Terse(1)->Sortkeys(1)->Dump;
my $sort = sub {
my ($a0, $a1, $a2) = $a =~ /(\D*)(\d*)(.*)/;
my ($b0, $b1, $b2) = $b =~ /(\D*)(\d*)(.*)/;
$a1 <=> $b1 or $a0 cmp $b0 or $a2 cmp $b2;
};
{
print "[", join(' ', sort { $sort->() } $x->states), "]";
print " => ", substr($x->{input}, $x->{pos}, 1), "\n";
(print "End of input\n"), last if $x->end_of_input;
(print "No states\n"), last if !$x->states;
$x->step;
redo;
};
print $x->final_state ? "Matched\n" : "Did not match\n";
}
1;
__END__
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment