BenGoldberg1/PloverOO.pl

## PloverOO.pl
use strict;
use warnings;

# First, I started with http://perl.plover.com/Regex/Regex.pm
# Then, I beat it with an OO shaped stick until it looked like this.

# Note: Only the parser and executer are in this file; a compiler
# based vaguely on the original is in PloverHash.pl, and a compiler
# which uses an array instead of a hash is in PloverArray.pl

# -*- mode: perl; perl-indent-level: 2 -*-
#
# Compile and evaluate regular expressions in Perl without
# using built-in regular expressions.
#
# Original Author: Mark-Jason Dominus (mjd-tpj-regex@plover.com)
# Owner of the stick which beat MJD's program into this shape:
# Ben Goldberg (ben-goldberg@hotmail.com)
# Like the original this program is in the PUBLIC DOMAIN.
#

#
# Note to readers:
#  The code is in three parts.
#  1. A parser for regexes
#  2. A compiler that trnaslates regexes into machines
#  3. An executer that executes a machine on a specified input
#
# The parser is really turgid and hard to understand, because it isn't
# the point of the package.  I recommend that you skip to part 2 or 3
# immediately, then work backwards.  Parts 2 and 3 are much clearer.

package Regex;

# Regexps are handled in three phases.  First, they're parsed from a string
# form into an internal parse tree form.
#
# Originally it was like so:
#  ABC => [ "CONCAT" => [ A, B, C ] ]
#  A* => [ "STAR" => A ]
#  A+ => [ "PLUS" => A ]
#  A|B|C => [ "ALTERN" => [ A, B, C ] ]
#  literal character x => [ "LITERAL" => x ]
#
# After applying my OO-shaped-stick, the parse tree looks like:
#  ABC => bless [A, B, C], "CONCAT"
#  A* => bless [A], "STAR"
# , etc..
#
# `parse' does this.

use vars qw,$PARSE_ERROR @seen,;
sub parse {
	my @t = split(//, $_[1]);
	local @seen = ();
	$PARSE_ERROR = "Something is wrong";
	my $tree = parse_altern(@t);
	die $PARSE_ERROR unless $tree;
	$tree;
}

sub parse_altern {
	my @alterns;
	my @terms;
	my $c;
	while (defined($c = shift @_)) {
		next if $c eq '';
		push @seen, $c;
		if ($c eq '(') {
			my $next_term = &parse_altern;
			push @terms, $next_term;
		} elsif ($c eq ')') {
			#push @alterns, &joinup(CONCAT => @terms) if @terms;
			#return &joinup(ALTERN => @alterns);
			last;
		} elsif ($c eq '|') {
			push @alterns, joinup(CONCAT => @terms) if @terms;
			@terms = ();
		} elsif ($c eq '*' || $c eq '+') {
			if (@terms) {
				$terms[-1] = bless [ $terms[-1] ], ($c eq '*' ? 'STAR' : 'PLUS');
			} else {
				$PARSE_ERROR = "Did not expect $c!\n\t@seen\n\t*\n\t@_\n";
				return undef;
			}
		} elsif ($c eq '\\') {
			if( $CLASS::{$_[0]. q{::}} ) {
				push @terms, bless [], "CLASS::".shift @_;
			} else {
				push @terms, bless [shift @_], 'LITERAL';
			}
		} else {
			push @terms, bless [$c], 'LITERAL';
		}
	}				# While there are tokens...
	push @alterns, joinup(CONCAT => @terms) if @terms;
	return joinup(ALTERN => @alterns) if @alterns;
	return undef;
}

sub joinup {
	my $tag = shift;
	if (@_ == 1) {
		$_[0];
	} else {
		bless [@_], $tag;
	}
}


################################################################
#
# Execute NFA on a given string
#
################################################################

package NFA::Exec;


sub match {
	my $pack = shift;
	my $nfa = shift;
	my $string = shift;
	my $machine = $pack->init($nfa, $string);
	$machine->run();
	$machine->final_state();
}

sub new {
	&init(@_);
}

#
# Create a new execution of the specified NFS, and feed it
# the specified string as its input.
#
sub init {
	my $pack = shift;
	my $nfa = shift;
	my $string = shift;
	my $self = {};

	$self->{nfa} = $nfa;
	$self->{input} = $string;
	$self->{pos} = 0;
	$self->{states} = [ $self->{nfa}->start_state ];

	bless $self => $pack;

	$self->epsilon_transit();

	$self;
}

#
# Run an execution to the end of the input
#
sub run {
	my $self = shift;
	until ($self->end_of_input() || $self->states() == 0) {
		$self->step;
	}
}

#
# Is this execution object at the end of its input?
#
sub end_of_input {
	my $self = shift;
	$self->{pos} >= length($self->{input});
}

#
# Advance an execution by one step.
#
sub step {
	my $self = shift;
	my $next_symbol = substr($self->{input}, $self->{pos}, 1);
	if ($next_symbol eq '') {
		# error
	} else {
		$self->transit($next_symbol);
		$self->epsilon_transit();
	}
	$self->{pos}++;
}

#
# Perform e-transitions in an execution
#
sub epsilon_transit {
	my $self = shift;
	my @newstates = $self->states;
	my @result = @newstates;
	my %seen = map {($_ => 1)} @newstates;

	for (;;) {
		my $s;
		my @nextstates;
		foreach my $s (@newstates) {
			my @n = $self->{nfa}->transitions_for_symbol($s, '') or next;
			push @nextstates, @n;
		}
		@newstates = grep {! $seen{$_}++} @nextstates;
		last unless @newstates;
		push @result, @newstates;
	}

	$self->{states} = \@result;
}

#
# Perform a transition
#
sub transit {
	my $self = shift;
	my $symbol = shift;

	$self->{states} = [ $self->transitions_for_symbol($symbol) ];
}

#
# Current states
#
sub states {
	my $self = shift;
	@{$self->{states}};
}


#
# Should we accept?
#
sub final_state {
	my $self = shift;
	my $s;
	foreach my $s ($self->states) {
		return 1 if $self->{nfa}->is_end_state($s);
	}
	0;
}

#
# Get current transition table
# This is interesting because we have to merge the transition
# tables for several states.
sub transitions_for_symbol {
	my ($self, $symbol) = @_;
	my %ttab;
	foreach my $s ($self->states) {
		my @n = $self->{nfa}->transitions_for_symbol($s, $symbol) or next;
		@ttab{ @n } = ();
	}
	keys %ttab;
}

sub NFA::self_test {
	require Time::HiRes;
	require Data::Dumper;
	my @times;
	push @times, Time::HiRes::time();
	#my $e = Regex->parse(q{((0|1|2|3|4|5|6|7|8|9)+|::)*});
	my $e = Regex->parse(q{(\d+|::)*});
	push @times, Time::HiRes::time();
	my $m = NFA->new($e);
	#print Data::Dumper->new([$m], ['m'])->Terse(1)->Sortkeys(1)->Dump;

	push @times, Time::HiRes::time();

	my $s = '::1234::5678901234567890::::1235467890::888:';
	my $r;
	$r = NFA::Exec->match($m, $s) for 1..100;
	push @times, Time::HiRes::time();
	print "String `$s' ", $r ? 'matched' : 'did not match', "\n";


	$r = ($s =~ /^(?:\d+|::)*$/) for 1..100;
	push @times, Time::HiRes::time();
	print "String `$s' ", $r ? 'matched' : 'did not match', "\n";

	my $j = 0;
	for my $t (@times) {
		my $ot = $t;
		$t -= $j;
		$j = $ot;
	}
	shift @times;

	print "Elapsed times, in seconds:\n";
	print $_, "\n" for @times;

	chop $s;
	$r = ($s =~ /^(?:\d+|::)*$/);
	print "String `$s' ", $r ? 'matched' : 'did not match', "\n";
	$r = NFA::Exec->match($m, $s);
	print "String `$s' ", $r ? 'matched' : 'did not match', "\n";
}

sub NFA::self_test_two {
	require Data::Dumper;
	my $s = '::1234::5678901234567890::::1235467890::888::';
	my $e = Regex->parse(q{((0|1|2|3|4|5|6|7|8|9)+|::)*});
	my $m = NFA->new($e);
	my $x = NFA::Exec->init($m, $s);

	print Data::Dumper->new([$m], ['m'])->Terse(1)->Sortkeys(1)->Dump;

	my $sort = sub {
		my ($a0, $a1, $a2) = $a =~ /(\D*)(\d*)(.*)/;
		my ($b0, $b1, $b2) = $b =~ /(\D*)(\d*)(.*)/;
		$a1 <=> $b1 or $a0 cmp $b0 or $a2 cmp $b2;
	};
	{
		print "[", join(' ', sort { $sort->() } $x->states), "]";
		print " => ", substr($x->{input}, $x->{pos}, 1), "\n";
		(print "End of input\n"), last if $x->end_of_input;
		(print "No states\n"), last if !$x->states;
		$x->step;
		redo;
	};
	print $x->final_state ? "Matched\n" : "Did not match\n";
}


1;
__END__
	use strict;
	use warnings;

	# First, I started with http://perl.plover.com/Regex/Regex.pm
	# Then, I beat it with an OO shaped stick until it looked like this.

	# Note: Only the parser and executer are in this file; a compiler
	# based vaguely on the original is in PloverHash.pl, and a compiler
	# which uses an array instead of a hash is in PloverArray.pl

	# -- mode: perl; perl-indent-level: 2 --
	#
	# Compile and evaluate regular expressions in Perl without
	# using built-in regular expressions.
	#
	# Original Author: Mark-Jason Dominus (mjd-tpj-regex@plover.com)
	# Owner of the stick which beat MJD's program into this shape:
	# Ben Goldberg (ben-goldberg@hotmail.com)
	# Like the original this program is in the PUBLIC DOMAIN.
	#

	#
	# Note to readers:
	# The code is in three parts.
	# 1. A parser for regexes
	# 2. A compiler that trnaslates regexes into machines
	# 3. An executer that executes a machine on a specified input
	#
	# The parser is really turgid and hard to understand, because it isn't
	# the point of the package. I recommend that you skip to part 2 or 3
	# immediately, then work backwards. Parts 2 and 3 are much clearer.

	package Regex;

	# Regexps are handled in three phases. First, they're parsed from a string
	# form into an internal parse tree form.
	#
	# Originally it was like so:
	# ABC => [ "CONCAT" => [ A, B, C ] ]
	# A* => [ "STAR" => A ]
	# A+ => [ "PLUS" => A ]
	# A\|B\|C => [ "ALTERN" => [ A, B, C ] ]
	# literal character x => [ "LITERAL" => x ]
	#
	# After applying my OO-shaped-stick, the parse tree looks like:
	# ABC => bless [A, B, C], "CONCAT"
	# A* => bless [A], "STAR"
	# , etc..
	#
	# `parse' does this.

	use vars qw,$PARSE_ERROR @seen,;
	sub parse {
	my @t = split(//, $_[1]);
	local @seen = ();
	$PARSE_ERROR = "Something is wrong";
	my $tree = parse_altern(@t);
	die $PARSE_ERROR unless $tree;
	$tree;
	}

	sub parse_altern {
	my @alterns;
	my @terms;
	my $c;
	while (defined($c = shift @_)) {
	next if $c eq '';
	push @seen, $c;
	if ($c eq '(') {
	my $next_term = &parse_altern;
	push @terms, $next_term;
	} elsif ($c eq ')') {
	#push @alterns, &joinup(CONCAT => @terms) if @terms;
	#return &joinup(ALTERN => @alterns);
	last;
	} elsif ($c eq '\|') {
	push @alterns, joinup(CONCAT => @terms) if @terms;
	@terms = ();
	} elsif ($c eq '*' \|\| $c eq '+') {
	if (@terms) {
	$terms[-1] = bless [ $terms[-1] ], ($c eq '*' ? 'STAR' : 'PLUS');
	} else {
	$PARSE_ERROR = "Did not expect $c!\n\t@seen\n\t*\n\t@_\n";
	return undef;
	}
	} elsif ($c eq '\\') {
	if( $CLASS::{$_[0]. q{::}} ) {
	push @terms, bless [], "CLASS::".shift @_;
	} else {
	push @terms, bless [shift @_], 'LITERAL';
	}
	} else {
	push @terms, bless [$c], 'LITERAL';
	}
	} # While there are tokens...
	push @alterns, joinup(CONCAT => @terms) if @terms;
	return joinup(ALTERN => @alterns) if @alterns;
	return undef;
	}

	sub joinup {
	my $tag = shift;
	if (@_ == 1) {
	$_[0];
	} else {
	bless [@_], $tag;
	}
	}


	################################################################
	#
	# Execute NFA on a given string
	#
	################################################################

	package NFA::Exec;


	sub match {
	my $pack = shift;
	my $nfa = shift;
	my $string = shift;
	my $machine = $pack->init($nfa, $string);
	$machine->run();
	$machine->final_state();
	}

	sub new {
	&init(@_);
	}

	#
	# Create a new execution of the specified NFS, and feed it
	# the specified string as its input.
	#
	sub init {
	my $pack = shift;
	my $nfa = shift;
	my $string = shift;
	my $self = {};

	$self->{nfa} = $nfa;
	$self->{input} = $string;
	$self->{pos} = 0;
	$self->{states} = [ $self->{nfa}->start_state ];

	bless $self => $pack;

	$self->epsilon_transit();

	$self;
	}

	#
	# Run an execution to the end of the input
	#
	sub run {
	my $self = shift;
	until ($self->end_of_input() \|\| $self->states() == 0) {
	$self->step;
	}
	}

	#
	# Is this execution object at the end of its input?
	#
	sub end_of_input {
	my $self = shift;
	$self->{pos} >= length($self->{input});
	}

	#
	# Advance an execution by one step.
	#
	sub step {
	my $self = shift;
	my $next_symbol = substr($self->{input}, $self->{pos}, 1);
	if ($next_symbol eq '') {
	# error
	} else {
	$self->transit($next_symbol);
	$self->epsilon_transit();
	}
	$self->{pos}++;
	}

	#
	# Perform e-transitions in an execution
	#
	sub epsilon_transit {
	my $self = shift;
	my @newstates = $self->states;
	my @result = @newstates;
	my %seen = map {($_ => 1)} @newstates;

	for (;;) {
	my $s;
	my @nextstates;
	foreach my $s (@newstates) {
	my @n = $self->{nfa}->transitions_for_symbol($s, '') or next;
	push @nextstates, @n;
	}
	@newstates = grep {! $seen{$_}++} @nextstates;
	last unless @newstates;
	push @result, @newstates;
	}

	$self->{states} = \@result;
	}

	#
	# Perform a transition
	#
	sub transit {
	my $self = shift;
	my $symbol = shift;

	$self->{states} = [ $self->transitions_for_symbol($symbol) ];
	}

	#
	# Current states
	#
	sub states {
	my $self = shift;
	@{$self->{states}};
	}


	#
	# Should we accept?
	#
	sub final_state {
	my $self = shift;
	my $s;
	foreach my $s ($self->states) {
	return 1 if $self->{nfa}->is_end_state($s);
	}
	0;
	}

	#
	# Get current transition table
	# This is interesting because we have to merge the transition
	# tables for several states.
	sub transitions_for_symbol {
	my ($self, $symbol) = @_;
	my %ttab;
	foreach my $s ($self->states) {
	my @n = $self->{nfa}->transitions_for_symbol($s, $symbol) or next;
	@ttab{ @n } = ();
	}
	keys %ttab;
	}

	sub NFA::self_test {
	require Time::HiRes;
	require Data::Dumper;
	my @times;
	push @times, Time::HiRes::time();
	#my $e = Regex->parse(q{((0\|1\|2\|3\|4\|5\|6\|7\|8\|9)+\|::)*});
	my $e = Regex->parse(q{(\d+\|::)*});
	push @times, Time::HiRes::time();
	my $m = NFA->new($e);
	#print Data::Dumper->new([$m], ['m'])->Terse(1)->Sortkeys(1)->Dump;

	push @times, Time::HiRes::time();

	my $s = '::1234::5678901234567890::::1235467890::888:';
	my $r;
	$r = NFA::Exec->match($m, $s) for 1..100;
	push @times, Time::HiRes::time();
	print "String `$s' ", $r ? 'matched' : 'did not match', "\n";


	$r = ($s =~ /^(?:\d+\|::)*$/) for 1..100;
	push @times, Time::HiRes::time();
	print "String `$s' ", $r ? 'matched' : 'did not match', "\n";

	my $j = 0;
	for my $t (@times) {
	my $ot = $t;
	$t -= $j;
	$j = $ot;
	}
	shift @times;

	print "Elapsed times, in seconds:\n";
	print $_, "\n" for @times;

	chop $s;
	$r = ($s =~ /^(?:\d+\|::)*$/);
	print "String `$s' ", $r ? 'matched' : 'did not match', "\n";
	$r = NFA::Exec->match($m, $s);
	print "String `$s' ", $r ? 'matched' : 'did not match', "\n";
	}

	sub NFA::self_test_two {
	require Data::Dumper;
	my $s = '::1234::5678901234567890::::1235467890::888::';
	my $e = Regex->parse(q{((0\|1\|2\|3\|4\|5\|6\|7\|8\|9)+\|::)*});
	my $m = NFA->new($e);
	my $x = NFA::Exec->init($m, $s);

	print Data::Dumper->new([$m], ['m'])->Terse(1)->Sortkeys(1)->Dump;

	my $sort = sub {
	my ($a0, $a1, $a2) = $a =~ /(\D)(\d)(.*)/;
	my ($b0, $b1, $b2) = $b =~ /(\D)(\d)(.*)/;
	$a1 <=> $b1 or $a0 cmp $b0 or $a2 cmp $b2;
	};
	{
	print "[", join(' ', sort { $sort->() } $x->states), "]";
	print " => ", substr($x->{input}, $x->{pos}, 1), "\n";
	(print "End of input\n"), last if $x->end_of_input;
	(print "No states\n"), last if !$x->states;
	$x->step;
	redo;
	};
	print $x->final_state ? "Matched\n" : "Did not match\n";
	}


	1;
	__END__