-
-
Save mauke/f8749ca2245d3bbbbc2f to your computer and use it in GitHub Desktop.
Example of Perl-style heredoc parsing, for blog post.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!perl | |
# Copyright 2013 Jeffrey Kegler | |
# This file is part of Marpa::R2. Marpa::R2 is free software: you can | |
# redistribute it and/or modify it under the terms of the GNU Lesser | |
# General Public License as published by the Free Software Foundation, | |
# either version 3 of the License, or (at your option) any later version. | |
# | |
# Marpa::R2 is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
# Lesser General Public License for more details. | |
# | |
# You should have received a copy of the GNU Lesser | |
# General Public License along with Marpa::R2. If not, see | |
# http://www.gnu.org/licenses/. | |
# Example of Perl-style here-document parsing, based | |
# on code by Peter Stuifzand | |
use strict; | |
use warnings; | |
use Test::More tests => 2; | |
use Marpa::R2 2.052000; | |
my $p = Demo::Heredoc::Parser->new; | |
my $v = $p->parse(<<"INPUT"); | |
say <<ENDA, <<ENDB, <<ENDC; say <<ENDD; | |
a | |
ENDA | |
b | |
ENDB | |
c | |
ENDC | |
d | |
ENDD | |
INPUT | |
my $expected = [ | |
[ [ 'say', [ "a\n", "b\n", "c\n", ], ], ], | |
[ [ 'say', [ "d\n", ], ], ] | |
]; | |
is_deeply( $v, $expected ); | |
$v = $p->parse(<<'INPUT'); | |
<<A, <<B, <<A; | |
a1 | |
A | |
b | |
B | |
a2 | |
A | |
INPUT | |
$expected = [ | |
[ "a1\n", "b\n", "a2\n" ], | |
]; | |
is_deeply( $v, $expected ); | |
exit; | |
package Demo::Heredoc::Parser; | |
sub new { | |
my $class = shift; | |
my $grammar = Marpa::R2::Scanless::G->new( | |
{ default_action => '::array', | |
source => \<<'GRAMMAR', | |
:start ::= statements | |
statements ::= statement+ | |
# Statement should handle their own semicolons | |
statement ::= expressions semicolon action => ::first | |
| newline | |
expressions ::= expression+ separator => comma | |
expression ::= heredoc action => ::first | |
| 'say' expressions | |
# The heredoc rule is different from how the source code actually looks The | |
# pause adverb allows to send only the parts the are useful | |
heredoc ::= (<heredoc op>) <heredoc terminator> action => ::first | |
# Pause at <heredoc terminator> and at newlines. | |
:lexeme ~ <heredoc terminator> pause => before | |
:lexeme ~ newline pause => before | |
<heredoc op> ~ '<<' | |
semicolon ~ ';' | |
comma ~ ',' | |
newline ~ [\n] | |
# The syntax here is for the terminator itself. | |
# The actual value of the <heredoc terminator> lexeme will | |
# the heredoc, which will be provided by the external heredoc scanner. | |
<heredoc terminator> ~ [\w]+ | |
# Only discard horizontal whitespace. If "\n" is included the parser won't | |
# pause at the end of line. | |
:discard ~ ws | |
ws ~ [ \t]+ | |
GRAMMAR | |
} | |
); | |
my $self = { grammar => $grammar, }; | |
return bless $self, $class; | |
} ## end sub new | |
sub parse { | |
my ( $self, $input ) = @_; | |
my $re = Marpa::R2::Scanless::R->new( { grammar => $self->{grammar} } ); | |
# Start the parse | |
my $pos = $re->read( \$input ); | |
die "error" if $pos < 0; | |
my $last_heredoc_end; | |
# Loop while the parse has't moved past the end | |
PARSE_SEGMENT: while ( $pos < length $input ) { | |
my $lexeme = $re->pause_lexeme(); | |
my ( $start_of_pause_lexeme, $length_of_pause_lexeme ) = | |
$re->pause_span(); | |
my $end_of_pause_lexeme = | |
$start_of_pause_lexeme + $length_of_pause_lexeme; | |
if ( $re->pause_lexeme() eq 'newline' ) { | |
# Resume from the end of the last heredoc, if there | |
# was one. Otherwise just resume at the start of the | |
# next line. | |
$pos = $re->resume( $last_heredoc_end // $end_of_pause_lexeme ); | |
$last_heredoc_end = undef; | |
next PARSE_SEGMENT; | |
} ## end if ( $re->pause_lexeme() eq 'newline' ) | |
# If we are here, the pause lexeme was <heredoc terminator> | |
# Find the <heredoc terminator> | |
my $terminator = | |
$re->literal( $start_of_pause_lexeme, $length_of_pause_lexeme ); | |
my $heredoc_start = $last_heredoc_end | |
// ( index( $input, "\n", $pos ) + 1 ); | |
# Find the heredoc body -- | |
# the literal text between the end of the last heredoc | |
# and the heredoc terminator for this heredoc | |
pos($input) = $heredoc_start; | |
$input =~ m/^\Q$terminator\E\n/gms | |
or die "Heredoc terminator $terminator not found before end of input"; | |
my $heredoc_body = substr $input, $heredoc_start, $-[0] - $heredoc_start; | |
# Pass the heredoc to the parser as the value of <heredoc terminator> | |
$re->lexeme_read( | |
'heredoc terminator', $heredoc_start, | |
length($heredoc_body), $heredoc_body | |
) // die $re->show_progress; | |
# Save of the position of the end of the match | |
# The next heredoc body starts there if there is one | |
$last_heredoc_end = pos $input; | |
# Resume parsing from the end of the <heredoc terminator> lexeme | |
$pos = $re->resume($end_of_pause_lexeme); | |
} ## end PARSE_SEGMENT: while ( $pos < length $input ) | |
my $v = $re->value; | |
return $$v; | |
} ## end sub parse |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment