Skip to content

Instantly share code, notes, and snippets.

@mauke
Forked from jeffreykegler/heredoc.t
Last active August 29, 2015 14:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mauke/f8749ca2245d3bbbbc2f to your computer and use it in GitHub Desktop.
Save mauke/f8749ca2245d3bbbbc2f to your computer and use it in GitHub Desktop.
Example of Perl-style heredoc parsing, for blog post.
#!perl
# Copyright 2013 Jeffrey Kegler
# This file is part of Marpa::R2. Marpa::R2 is free software: you can
# redistribute it and/or modify it under the terms of the GNU Lesser
# General Public License as published by the Free Software Foundation,
# either version 3 of the License, or (at your option) any later version.
#
# Marpa::R2 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser
# General Public License along with Marpa::R2. If not, see
# http://www.gnu.org/licenses/.
# Example of Perl-style here-document parsing, based
# on code by Peter Stuifzand
use strict;
use warnings;
use Test::More tests => 2;
use Marpa::R2 2.052000;
my $p = Demo::Heredoc::Parser->new;
my $v = $p->parse(<<"INPUT");
say <<ENDA, <<ENDB, <<ENDC; say <<ENDD;
a
ENDA
b
ENDB
c
ENDC
d
ENDD
INPUT
my $expected = [
[ [ 'say', [ "a\n", "b\n", "c\n", ], ], ],
[ [ 'say', [ "d\n", ], ], ]
];
is_deeply( $v, $expected );
$v = $p->parse(<<'INPUT');
<<A, <<B, <<A;
a1
A
b
B
a2
A
INPUT
$expected = [
[ "a1\n", "b\n", "a2\n" ],
];
is_deeply( $v, $expected );
exit;
package Demo::Heredoc::Parser;
sub new {
my $class = shift;
my $grammar = Marpa::R2::Scanless::G->new(
{ default_action => '::array',
source => \<<'GRAMMAR',
:start ::= statements
statements ::= statement+
# Statement should handle their own semicolons
statement ::= expressions semicolon action => ::first
| newline
expressions ::= expression+ separator => comma
expression ::= heredoc action => ::first
| 'say' expressions
# The heredoc rule is different from how the source code actually looks The
# pause adverb allows to send only the parts the are useful
heredoc ::= (<heredoc op>) <heredoc terminator> action => ::first
# Pause at <heredoc terminator> and at newlines.
:lexeme ~ <heredoc terminator> pause => before
:lexeme ~ newline pause => before
<heredoc op> ~ '<<'
semicolon ~ ';'
comma ~ ','
newline ~ [\n]
# The syntax here is for the terminator itself.
# The actual value of the <heredoc terminator> lexeme will
# the heredoc, which will be provided by the external heredoc scanner.
<heredoc terminator> ~ [\w]+
# Only discard horizontal whitespace. If "\n" is included the parser won't
# pause at the end of line.
:discard ~ ws
ws ~ [ \t]+
GRAMMAR
}
);
my $self = { grammar => $grammar, };
return bless $self, $class;
} ## end sub new
sub parse {
my ( $self, $input ) = @_;
my $re = Marpa::R2::Scanless::R->new( { grammar => $self->{grammar} } );
# Start the parse
my $pos = $re->read( \$input );
die "error" if $pos < 0;
my $last_heredoc_end;
# Loop while the parse has't moved past the end
PARSE_SEGMENT: while ( $pos < length $input ) {
my $lexeme = $re->pause_lexeme();
my ( $start_of_pause_lexeme, $length_of_pause_lexeme ) =
$re->pause_span();
my $end_of_pause_lexeme =
$start_of_pause_lexeme + $length_of_pause_lexeme;
if ( $re->pause_lexeme() eq 'newline' ) {
# Resume from the end of the last heredoc, if there
# was one. Otherwise just resume at the start of the
# next line.
$pos = $re->resume( $last_heredoc_end // $end_of_pause_lexeme );
$last_heredoc_end = undef;
next PARSE_SEGMENT;
} ## end if ( $re->pause_lexeme() eq 'newline' )
# If we are here, the pause lexeme was <heredoc terminator>
# Find the <heredoc terminator>
my $terminator =
$re->literal( $start_of_pause_lexeme, $length_of_pause_lexeme );
my $heredoc_start = $last_heredoc_end
// ( index( $input, "\n", $pos ) + 1 );
# Find the heredoc body --
# the literal text between the end of the last heredoc
# and the heredoc terminator for this heredoc
pos($input) = $heredoc_start;
$input =~ m/^\Q$terminator\E\n/gms
or die "Heredoc terminator $terminator not found before end of input";
my $heredoc_body = substr $input, $heredoc_start, $-[0] - $heredoc_start;
# Pass the heredoc to the parser as the value of <heredoc terminator>
$re->lexeme_read(
'heredoc terminator', $heredoc_start,
length($heredoc_body), $heredoc_body
) // die $re->show_progress;
# Save of the position of the end of the match
# The next heredoc body starts there if there is one
$last_heredoc_end = pos $input;
# Resume parsing from the end of the <heredoc terminator> lexeme
$pos = $re->resume($end_of_pause_lexeme);
} ## end PARSE_SEGMENT: while ( $pos < length $input )
my $v = $re->value;
return $$v;
} ## end sub parse
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment