public
Created

Example of Perl-style heredoc parsing, for blog post.

  • Download Gist
heredoc.t
Perl
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185
#!perl
# Copyright 2013 Jeffrey Kegler
# This file is part of Marpa::R2. Marpa::R2 is free software: you can
# redistribute it and/or modify it under the terms of the GNU Lesser
# General Public License as published by the Free Software Foundation,
# either version 3 of the License, or (at your option) any later version.
#
# Marpa::R2 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser
# General Public License along with Marpa::R2. If not, see
# http://www.gnu.org/licenses/.
 
# Example of Perl-style here-document parsing, based
# on code by Peter Stuifzand
 
use 5.010;
use strict;
use warnings;
use English qw( -no_match_vars );
use Data::Dumper;
use GetOpt::Long;
use autodie;
use Test::More tests => 1;
 
use Marpa::R2 2.052000;
 
sub usage {
 
die <<"END_OF_USAGE_MESSAGE";
$PROGRAM_NAME
END_OF_USAGE_MESSAGE
} ## end sub usage
 
my $getopt_result = Getopt::Long::GetOptions();
usage() if not $getopt_result;
 
my $p = Demo::Heredoc::Parser->new;
 
my $v = $p->parse(<<"INPUT");
say <<ENDA, <<ENDB, <<ENDC; say <<ENDD;
a
ENDA
b
ENDB
c
ENDC
d
ENDD
 
INPUT
 
my $expected = [
[ [ 'say', [ "a\n", "b\n", "c\n", ], ], ],
[ [ 'say', [ "d\n", ], ], ]
];
 
is_deeply( $v, $expected );
 
package Demo::Heredoc::Parser;
 
sub new {
my $class = shift;
 
my $grammar = Marpa::R2::Scanless::G->new(
{ default_action => '::array',
 
source => \<<'GRAMMAR',
 
:start ::= statements
 
statements ::= statement+
 
# Statement should handle their own semicolons
 
statement ::= expressions semicolon action => ::first
| newline
 
expressions ::= expression+ separator => comma
 
expression ::= heredoc action => ::first
| 'say' expressions
 
# The heredoc rule is different from how the source code actually looks The
# pause adverb allows to send only the parts the are useful
 
heredoc ::= (<heredoc op>) <heredoc terminator> action => ::first
 
# Pause at <heredoc terminator> and at newlines.
:lexeme ~ <heredoc terminator> pause => before
:lexeme ~ newline pause => before
 
<heredoc op> ~ '<<'
semicolon ~ ';'
comma ~ ','
newline ~ [\n]
 
# The syntax here is for the terminator itself.
# The actual value of the <heredoc terminator> lexeme will
# the heredoc, which will be provided by the external heredoc scanner.
<heredoc terminator> ~ [\w]+
 
# Only discard horizontal whitespace. If "\n" is included the parser won't
# pause at the end of line.
 
:discard ~ ws
ws ~ [ \t]+
 
GRAMMAR
}
);
 
my $self = { grammar => $grammar, };
 
return bless $self, $class;
} ## end sub new
 
sub parse {
my ( $self, $input ) = @_;
 
my $re = Marpa::R2::Scanless::R->new( { grammar => $self->{grammar} } );
 
# Start the parse
my $pos = $re->read( \$input );
die "error" if $pos < 0;
 
my $last_heredoc_end;
 
# Loop while the parse has't moved past the end
PARSE_SEGMENT: while ( $pos < length $input ) {
 
my $lexeme = $re->pause_lexeme();
my ( $start_of_pause_lexeme, $length_of_pause_lexeme ) =
$re->pause_span();
my $end_of_pause_lexeme =
$start_of_pause_lexeme + $length_of_pause_lexeme;
 
if ( $re->pause_lexeme() eq 'newline' ) {
 
# Resume from the end of the last heredoc, if there
# was one. Otherwise just resume at the start of the
# next line.
$pos = $re->resume( $last_heredoc_end // $end_of_pause_lexeme );
$last_heredoc_end = undef;
next PARSE_SEGMENT;
} ## end if ( $re->pause_lexeme() eq 'newline' )
 
# If we are here, the pause lexeme was <heredoc terminator>
 
# Find the <heredoc terminator>
my $terminator =
$re->literal( $start_of_pause_lexeme, $length_of_pause_lexeme );
 
my $heredoc_start = $last_heredoc_end
// ( index( $input, "\n", $pos ) + 1 );
 
# Find the heredoc body --
# the literal text between the end of the last heredoc
# and the heredoc terminator for this heredoc
pos $input = $heredoc_start;
my ($heredoc_body) = ( $input =~ m/\G(.*)^$terminator\n/gmsc );
die "Heredoc terminator $terminator not found before end of input"
if not defined $heredoc_body;
 
# Pass the heredoc to the parser as the value of <heredoc terminator>
$re->lexeme_read(
'heredoc terminator', $heredoc_start,
length($heredoc_body), $heredoc_body
) // die $re->show_progress;
 
# Save of the position of the end of the match
# The next heredoc body starts there if there is one
$last_heredoc_end = pos $input;
 
# Resume parsing from the end of the <heredoc terminator> lexeme
$pos = $re->resume($end_of_pause_lexeme);
 
} ## end PARSE_SEGMENT: while ( $pos < length $input )
 
my $v = $re->value;
return $$v;
} ## end sub parse

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.