Last active
December 16, 2015 11:59
-
-
Save jeffreykegler/5431739 to your computer and use it in GitHub Desktop.
Example of Perl-style heredoc parsing, for blog post.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!perl | |
# Copyright 2013 Jeffrey Kegler | |
# This file is part of Marpa::R2. Marpa::R2 is free software: you can | |
# redistribute it and/or modify it under the terms of the GNU Lesser | |
# General Public License as published by the Free Software Foundation, | |
# either version 3 of the License, or (at your option) any later version. | |
# | |
# Marpa::R2 is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
# Lesser General Public License for more details. | |
# | |
# You should have received a copy of the GNU Lesser | |
# General Public License along with Marpa::R2. If not, see | |
# http://www.gnu.org/licenses/. | |
# Example of Perl-style here-document parsing by Jeffrey Kegler | |
# Based on code by Peter Stuifzand | |
# With bugfix by mauke | |
use 5.010; | |
use strict; | |
use warnings; | |
use Test::More tests => 2; | |
use Marpa::R2 3.000000; | |
my $p = Demo::Heredoc::Parser->new; | |
my $v = $p->parse(<<"INPUT"); | |
say <<ENDA, <<ENDB, <<ENDC; say <<ENDD; | |
a | |
ENDA | |
b | |
ENDB | |
c | |
ENDC | |
d | |
ENDD | |
INPUT | |
my $expected = | |
[ [ [ 'say', [ "a\n", "b\n", "c\n", ], ], ], [ [ 'say', [ "d\n", ], ], ] ]; | |
is_deeply( $v, $expected ); | |
$v = $p->parse(<<'INPUT'); | |
<<A, <<B, <<A; | |
a1 | |
A | |
b | |
B | |
a2 | |
A | |
<<WHEN; | |
when | |
WHEN | |
INPUT | |
$expected = [ [ "a1\n", "b\n", "a2\n" ], ["when\n"], ]; | |
is_deeply( $v, $expected ); | |
exit; | |
package Demo::Heredoc::Parser; | |
sub new { | |
my $class = shift; | |
my $grammar = Marpa::R2::Scanless::G->new( | |
{ | |
default_action => '::array', | |
source => \<<'GRAMMAR', | |
:start ::= statements | |
statements ::= statement+ | |
# Statement should handle their own semicolons | |
statement ::= expressions semicolon action => ::first | |
expressions ::= expression+ separator => comma | |
expression ::= <heredoc declaration> action => ::first | |
| 'say' expressions | |
# The heredoc rule is different from how the source code actually looks The | |
# pause adverb allows to send only the parts the are useful | |
<heredoc declaration> ::= (<heredoc op>) <heredoc> action => ::first | |
# Pause at <heredoc> | |
:lexeme ~ <heredoc> pause => before event => 'heredoc declaration' | |
<heredoc op> ~ '<<' | |
semicolon ~ ';' | |
comma ~ ',' | |
# The syntax here is actually for the heredoc's terminator. | |
# The actual value of the <heredoc> lexeme will | |
# the heredoc, which will be provided by the external heredoc scanner. | |
<heredoc> ~ [\w]+ | |
:discard ~ ws | |
ws ~ [ \t\n]+ | |
GRAMMAR | |
} | |
); | |
my $self = { grammar => $grammar, }; | |
return bless $self, $class; | |
} ## end sub new | |
sub parse { | |
my ( $self, $input ) = @_; | |
my $recce = Marpa::R2::Scanless::R->new( { grammar => $self->{grammar} } ); | |
# Start the parse | |
my $pos = $recce->read( \$input ); | |
die "error" if $pos < 0; | |
my $last_heredoc_end; | |
# In this grammar, reading can stop or pause for one of three | |
# reasons: | |
# | |
# 1.) An event, which is always a heredoc declaration | |
# | |
# 2.) Reaching the end of physical input | |
# | |
# 3.) Reaching the end of a line which declares heredocs | |
# The loop condition detects the end of physical input | |
PARSE_SEGMENT: while ( $pos < length $input ) { | |
# We do not need to loop through events, although in a more | |
# complicated parser we would. In this parser, there is only | |
# one kind of event. | |
my $events = $recce->events(); | |
# We handle the non-event case here. Since we are not | |
# at end of physical input, and there has been no event, | |
# we are at the end of a line which declared heredocs. | |
# We resume reading from the end of the last heredoc | |
if ( not @{$events} ) { | |
$pos = $recce->resume($last_heredoc_end); | |
$last_heredoc_end = undef; | |
next PARSE_SEGMENT; | |
} | |
# If we are here, we paused before a heredoc terminator. | |
# The start and length of pause lexeme | |
# indicate the start and length of the heredoc terminator. | |
my ( $start_of_heredoc_terminator, $length_of_heredoc_terminator ) = | |
$recce->pause_span(); | |
my $end_of_heredoc_terminator = | |
$start_of_heredoc_terminator + $length_of_heredoc_terminator; | |
# Using the pause lexeme's start and length, find the terminator | |
my $terminator = | |
$recce->literal( $start_of_heredoc_terminator, | |
$length_of_heredoc_terminator ); | |
# We are in a line declaring heredoc's. We need to know its EOL. | |
my $eol_at_start_of_heredocs = index( $input, "\n", $pos ) + 1; | |
# The next heredoc starts after the previous one or, | |
# if this is the first heredoc in a line declaring them, | |
# after this heredoc declaring line's EOL. | |
my $heredoc_start = $last_heredoc_end // $eol_at_start_of_heredocs; | |
# We've found the start of the current heredoc. | |
pos($input) = $heredoc_start; | |
# The heredoc ends after a terminator on a line by itself. | |
$input =~ m/^\Q$terminator\E\n/gms | |
or die "Heredoc terminator $terminator not found before end of input"; | |
# Slurp the heredoc into a variable. | |
my $heredoc_body = substr $input, $heredoc_start, | |
$-[0] - $heredoc_start; | |
# Pass the heredoc body to the parser as the value of <heredoc> | |
$recce->lexeme_read( 'heredoc', $heredoc_start, | |
length($heredoc_body), $heredoc_body ) // die $recce->show_progress; | |
# If this line declares another heredoc, it will start after | |
# the last one, so we save the location. | |
$last_heredoc_end = pos $input; | |
# Resume parsing the line declaring heredocs. | |
# It runs from the end of this heredoc's terminator, to the next EOL. | |
$pos = | |
$recce->resume( $end_of_heredoc_terminator, | |
$eol_at_start_of_heredocs - $end_of_heredoc_terminator ); | |
} ## end PARSE_SEGMENT: while ( $pos < length $input ) | |
my $v = $recce->value; | |
return $$v; | |
} ## end sub parse | |
# vim: expandtab shiftwidth=4: |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment