Skip to content

Instantly share code, notes, and snippets.

@jddurand
Created June 2, 2014 20:22
Show Gist options
  • Save jddurand/a16c463d25fb10a148b5 to your computer and use it in GitHub Desktop.
Save jddurand/a16c463d25fb10a148b5 to your computer and use it in GitHub Desktop.
W3C EBNF to Marpa using Marpa
inaccessible is ok by default
:start ::= document
AttDef ::= S Name S AttType S DefaultDecl
AttDef_any ::= AttDef *
AttType ::= StringType
AttType ::= TokenizedType
AttType ::= EnumeratedType
AttValue ::= _Lex030 _Gen060_any _Lex030
AttValue ::= _Lex032 _Gen063_any _Lex032
AttlistDecl ::= _Lex079 S Name AttDef_any S_maybe _Lex060
Attribute ::= Name Eq AttValue
CDEnd ::= _Lex041
CDSect ::= CDStart CData CDEnd
CDStart ::= _Lex052
CData ::= _Gen118
Char ::= _Lex001
Char ::= _Lex002
Char ::= _Lex003
Char ::= _Lex004
Char ::= _Lex005
Char ::= _Lex006
CharData ::= _Gen087
CharData_maybe ::= CharData
CharRef ::= _Lex095 _Lex026_many _Lex096
CharRef ::= _Lex097 _Lex098_many _Lex096
Char_any ::= Char *
Comment ::= _Lex042 _Gen094_any _Lex043
DeclSep ::= PEReference
DeclSep ::= S
DefaultDecl ::= _Lex089
DefaultDecl ::= _Lex090
DefaultDecl ::= _Gen244
ETag ::= _Lex065 Name S_maybe _Lex060
EmptyElemTag ::= _Lex064 Name _Gen187_any S_maybe _Lex066
EncName ::= _Lex106 _Gen287_any
EncodingDecl ::= S _Lex105 Eq _Gen284
EncodingDecl_maybe ::= EncodingDecl
EntityDecl ::= GEDecl
EntityDecl ::= PEDecl
EntityDef ::= EntityValue
EntityDef ::= _Gen273
EntityRef ::= _Lex099 Name _Lex096
EntityValue ::= _Lex030 _Gen050_any _Lex030
EntityValue ::= _Lex032 _Gen054_any _Lex032
EnumeratedType ::= NotationType
EnumeratedType ::= Enumeration
Enumeration ::= _Lex073 S_maybe Nmtoken _Gen239_any S_maybe _Lex075
Eq ::= S_maybe _Lex055 S_maybe
ExternalID ::= _Lex102 S SystemLiteral
ExternalID ::= _Lex103 S PubidLiteral S SystemLiteral
GEDecl ::= _Lex101 S Name S EntityDef S_maybe _Lex060
Ignore ::= _Gen259
Misc ::= Comment
Misc ::= PI
Misc ::= S
Misc_any ::= Misc *
Mixed ::= _Lex073 S_maybe _Lex077 _Gen216_any S_maybe _Lex078
Mixed ::= _Lex073 S_maybe _Lex077 S_maybe _Lex075
NDataDecl ::= S _Lex104 S Name
NDataDecl_maybe ::= NDataDecl
Name ::= NameStartChar _Gen038_any
NameChar ::= NameStartChar
NameChar ::= _Lex024
NameChar ::= _Lex025
NameChar ::= _Lex026
NameChar ::= _Lex027
NameChar ::= _Lex028
NameChar ::= _Lex029
NameStartChar ::= _Lex008
NameStartChar ::= _Lex009
NameStartChar ::= _Lex010
NameStartChar ::= _Lex011
NameStartChar ::= _Lex012
NameStartChar ::= _Lex013
NameStartChar ::= _Lex014
NameStartChar ::= _Lex015
NameStartChar ::= _Lex016
NameStartChar ::= _Lex017
NameStartChar ::= _Lex018
NameStartChar ::= _Lex019
NameStartChar ::= _Lex020
NameStartChar ::= _Lex021
NameStartChar ::= _Lex022
NameStartChar ::= _Lex023
Names ::= Name _Gen041_any
Nmtoken ::= _Gen044_many
Nmtokens ::= Nmtoken _Gen047_any
NotationDecl ::= _Lex108 S Name S _Gen291 S_maybe _Lex060
NotationType ::= _Lex088 S _Lex073 S_maybe Name _Gen236_any S_maybe _Lex075
PEDecl ::= _Lex101 S _Lex100 S Name S PEDef S_maybe _Lex060
PEDef ::= EntityValue
PEDef ::= ExternalID
PEReference ::= _Lex100 Name _Lex096
PI ::= _Lex044 PITarget _Gen102_maybe _Lex045
PITarget ::= _Gen112
PubidChar ::= _Lex007
PubidChar ::= _Lex003
PubidChar ::= _Lex002
PubidChar ::= _Lex038
PubidChar ::= _Lex039
PubidChar_any ::= PubidChar *
PubidLiteral ::= _Lex030 PubidChar_any _Lex030
PubidLiteral ::= _Lex032 _Gen076_any _Lex032
PublicID ::= _Lex103 S PubidLiteral
Reference ::= EntityRef
Reference ::= CharRef
S ::= _Gen009_many
SDDecl ::= S _Lex061 Eq _Gen168
SDDecl_maybe ::= SDDecl
STag ::= _Lex064 Name _Gen173_any S_maybe _Lex060
S_maybe ::= S
StringType ::= _Lex080
SystemLiteral ::= _Gen069
SystemLiteral ::= _Gen071
TextDecl ::= _Lex053 VersionInfo_maybe EncodingDecl S_maybe _Lex045
TextDecl_maybe ::= TextDecl
TokenizedType ::= _Lex081
TokenizedType ::= _Lex082
TokenizedType ::= _Lex083
TokenizedType ::= _Lex084
TokenizedType ::= _Lex085
TokenizedType ::= _Lex086
TokenizedType ::= _Lex087
VersionInfo ::= S _Lex054 Eq _Gen129
VersionInfo_maybe ::= VersionInfo
VersionNum ::= _Lex056 _Lex026_many
XMLDecl ::= _Lex053 VersionInfo EncodingDecl_maybe SDDecl_maybe S_maybe _Lex045
XMLDecl_maybe ::= XMLDecl
_Gen009 ::= _Lex007
_Gen009 ::= _Lex001
_Gen009 ::= _Lex003
_Gen009 ::= _Lex002
_Gen009_many ::= _Gen009 +
_Gen038 ::= NameChar
_Gen038_any ::= _Gen038 *
_Gen041 ::= _Lex007 Name
_Gen041_any ::= _Gen041 *
_Gen044 ::= NameChar
_Gen044_many ::= _Gen044 +
_Gen047 ::= _Lex007 Nmtoken
_Gen047_any ::= _Gen047 *
_Gen050 ::= _Lex031
_Gen050 ::= PEReference
_Gen050 ::= Reference
_Gen050_any ::= _Gen050 *
_Gen054 ::= _Lex033
_Gen054 ::= PEReference
_Gen054 ::= Reference
_Gen054_any ::= _Gen054 *
_Gen060 ::= _Lex034
_Gen060 ::= Reference
_Gen060_any ::= _Gen060 *
_Gen063 ::= _Lex035
_Gen063 ::= Reference
_Gen063_any ::= _Gen063 *
_Gen069 ::= _Lex030 _Lex036_any _Lex030
_Gen071 ::= _Lex032 _Lex037_any _Lex032
_Gen075 ::= _Exception001
_Gen076 ::= _Gen075
_Gen076_any ::= _Gen076 *
_Gen086 ::= _Lex040_any _Lex041 _Lex040_any
_Gen087 ::= _Exception002
_Gen089 ::= _Exception003
_Gen090 ::= _Gen089
_Gen091 ::= _Exception003
_Gen092 ::= _Gen091
_Gen093 ::= _Lex024 _Gen092
_Gen094 ::= _Gen090
_Gen094 ::= _Gen093
_Gen094_any ::= _Gen094 *
_Gen099 ::= Char_any _Lex045 Char_any
_Gen100 ::= _Exception004
_Gen101 ::= _Gen100
_Gen102 ::= S _Gen101
_Gen102_maybe ::= _Gen102
_Gen105 ::= _Lex046
_Gen105 ::= _Lex047
_Gen107 ::= _Lex048
_Gen107 ::= _Lex049
_Gen109 ::= _Lex050
_Gen109 ::= _Lex051
_Gen111 ::= _Gen105 _Gen107 _Gen109
_Gen112 ::= _Exception005
_Gen116 ::= Char_any _Lex041 Char_any
_Gen117 ::= _Exception006
_Gen118 ::= _Gen117
_Gen122 ::= doctypedecl Misc_any
_Gen122_maybe ::= _Gen122
_Gen129 ::= _Lex032 VersionNum _Lex032
_Gen129 ::= _Lex030 VersionNum _Lex030
_Gen138 ::= S ExternalID
_Gen138_maybe ::= _Gen138
_Gen140 ::= _Lex058 intSubset _Lex059 S_maybe
_Gen140_maybe ::= _Gen140
_Gen145 ::= markupdecl
_Gen145 ::= DeclSep
_Gen145_any ::= _Gen145 *
_Gen157 ::= markupdecl
_Gen157 ::= conditionalSect
_Gen157 ::= DeclSep
_Gen157_any ::= _Gen157 *
_Gen162 ::= _Lex062
_Gen162 ::= _Lex063
_Gen164 ::= _Lex032 _Gen162 _Lex032
_Gen165 ::= _Lex062
_Gen165 ::= _Lex063
_Gen167 ::= _Lex030 _Gen165 _Lex030
_Gen168 ::= _Gen164
_Gen168 ::= _Gen167
_Gen173 ::= S Attribute
_Gen173_any ::= _Gen173 *
_Gen179 ::= element
_Gen179 ::= Reference
_Gen179 ::= CDSect
_Gen179 ::= PI
_Gen179 ::= Comment
_Gen184 ::= _Gen179 CharData_maybe
_Gen184_any ::= _Gen184 *
_Gen187 ::= S Attribute
_Gen187_any ::= _Gen187 *
_Gen195 ::= choice
_Gen195 ::= seq
_Gen197 ::= _Lex070
_Gen197 ::= _Lex071
_Gen197 ::= _Lex072
_Gen197_maybe ::= _Gen197
_Gen202 ::= Name
_Gen202 ::= choice
_Gen202 ::= seq
_Gen205 ::= _Lex070
_Gen205 ::= _Lex071
_Gen205 ::= _Lex072
_Gen205_maybe ::= _Gen205
_Gen210 ::= S_maybe _Lex074 S_maybe cp
_Gen210_many ::= _Gen210 +
_Gen213 ::= S_maybe _Lex076 S_maybe cp
_Gen213_any ::= _Gen213 *
_Gen216 ::= S_maybe _Lex074 S_maybe Name
_Gen216_any ::= _Gen216 *
_Gen236 ::= S_maybe _Lex074 S_maybe Name
_Gen236_any ::= _Gen236 *
_Gen239 ::= S_maybe _Lex074 S_maybe Nmtoken
_Gen239_any ::= _Gen239 *
_Gen242 ::= _Lex091 S
_Gen242_maybe ::= _Gen242
_Gen244 ::= _Gen242_maybe AttValue
_Gen253 ::= _Lex092 ignoreSectContents _Lex041 Ignore
_Gen253_any ::= _Gen253 *
_Gen256 ::= _Lex092
_Gen256 ::= _Lex041
_Gen258 ::= Char_any _Gen256 Char_any
_Gen259 ::= _Exception007
_Gen273 ::= ExternalID NDataDecl_maybe
_Gen284 ::= _Lex030 EncName _Lex030
_Gen284 ::= _Lex032 EncName _Lex032
_Gen287 ::= _Lex107
_Gen287 ::= _Lex024
_Gen287_any ::= _Gen287 *
_Gen291 ::= ExternalID
_Gen291 ::= PublicID
_Lex026_many ::= _Lex026 +
_Lex036_any ::= _Lex036 *
_Lex037_any ::= _Lex037 *
_Lex040_any ::= _Lex040 *
_Lex098_many ::= _Lex098 +
children ::= _Gen195 _Gen197_maybe
choice ::= _Lex073 S_maybe cp _Gen210_many S_maybe _Lex075
conditionalSect ::= includeSect
conditionalSect ::= ignoreSect
content ::= CharData_maybe _Gen184_any
contentspec ::= _Lex068
contentspec ::= _Lex069
contentspec ::= Mixed
contentspec ::= children
cp ::= _Gen202 _Gen205_maybe
doctypedecl ::= _Lex057 S Name _Gen138_maybe S_maybe _Gen140_maybe _Lex060
document ::= prolog element Misc_any
element ::= EmptyElemTag
element ::= STag content ETag
elementdecl ::= _Lex067 S Name S contentspec S_maybe _Lex060
extParsedEnt ::= TextDecl_maybe content
extSubset ::= TextDecl_maybe extSubsetDecl
extSubsetDecl ::= _Gen157_any
ignoreSect ::= _Lex092 S_maybe _Lex094 S_maybe _Lex058 ignoreSectContents_any _Lex041
ignoreSectContents ::= Ignore _Gen253_any
ignoreSectContents_any ::= ignoreSectContents *
includeSect ::= _Lex092 S_maybe _Lex093 S_maybe _Lex058 extSubsetDecl _Lex041
intSubset ::= _Gen145_any
markupdecl ::= elementdecl
markupdecl ::= AttlistDecl
markupdecl ::= EntityDecl
markupdecl ::= NotationDecl
markupdecl ::= PI
markupdecl ::= Comment
prolog ::= XMLDecl_maybe Misc_any _Gen122_maybe
seq ::= _Lex073 S_maybe cp _Gen213_any S_maybe _Lex075
_Lex001 ~ [\x{9}]
_Lex002 ~ [\x{a}]
_Lex003 ~ [\x{d}]
_Lex004 ~ [\x{20}-\x{d7ff}]
_Lex005 ~ [\x{e000}-\x{fffd}]
_Lex006 ~ [\x{10000}-\x{10ffff}]
_Lex007 ~ [\x{20}]
_Lex008 ~ ':'
_Lex009 ~ [A-Z]
_Lex010 ~ '_'
_Lex011 ~ [a-z]
_Lex012 ~ [\x{c0}-\x{d6}]
_Lex013 ~ [\x{d8}-\x{f6}]
_Lex014 ~ [\x{f8}-\x{2ff}]
_Lex015 ~ [\x{370}-\x{37d}]
_Lex016 ~ [\x{37f}-\x{1fff}]
_Lex017 ~ [\x{200c}-\x{200d}]
_Lex018 ~ [\x{2070}-\x{218f}]
_Lex019 ~ [\x{2c00}-\x{2fef}]
_Lex020 ~ [\x{3001}-\x{d7ff}]
_Lex021 ~ [\x{f900}-\x{fdcf}]
_Lex022 ~ [\x{fdf0}-\x{fffd}]
_Lex023 ~ [\x{10000}-\x{effff}]
_Lex024 ~ '-'
_Lex025 ~ '.'
_Lex026 ~ [0-9]
_Lex027 ~ [\x{b7}]
_Lex028 ~ [\x{300}-\x{36f}]
_Lex029 ~ [\x{203f}-\x{2040}]
_Lex030 ~ '"'
_Lex031 ~ [^%&"]
_Lex032 ~ [']
_Lex033 ~ [^%&']
_Lex034 ~ [^<&"]
_Lex035 ~ [^<&']
_Lex036 ~ [^"]
_Lex037 ~ [^']
_Lex038 ~ [a-zA-Z0-9]
_Lex039 ~ [\-'()+,./:=?;!*#@$_%]
_Lex040 ~ [^<&]
_Lex041 ~ ']]>'
_Lex042 ~ '<!--'
_Lex043 ~ '-->'
_Lex044 ~ '<?'
_Lex045 ~ '?>'
_Lex046 ~ 'X'
_Lex047 ~ 'x'
_Lex048 ~ 'M'
_Lex049 ~ 'm'
_Lex050 ~ 'L'
_Lex051 ~ 'l'
_Lex052 ~ '<![CDATA['
_Lex053 ~ '<?xml'
_Lex054 ~ 'version'
_Lex055 ~ '='
_Lex056 ~ '1.'
_Lex057 ~ '<!DOCTYPE'
_Lex058 ~ '['
_Lex059 ~ ']'
_Lex060 ~ '>'
_Lex061 ~ 'standalone'
_Lex062 ~ 'yes'
_Lex063 ~ 'no'
_Lex064 ~ '<'
_Lex065 ~ '</'
_Lex066 ~ '/>'
_Lex067 ~ '<!ELEMENT'
_Lex068 ~ 'EMPTY'
_Lex069 ~ 'ANY'
_Lex070 ~ '?'
_Lex071 ~ '*'
_Lex072 ~ '+'
_Lex073 ~ '('
_Lex074 ~ '|'
_Lex075 ~ ')'
_Lex076 ~ ','
_Lex077 ~ '#PCDATA'
_Lex078 ~ ')*'
_Lex079 ~ '<!ATTLIST'
_Lex080 ~ 'CDATA'
_Lex081 ~ 'ID'
_Lex082 ~ 'IDREF'
_Lex083 ~ 'IDREFS'
_Lex084 ~ 'ENTITY'
_Lex085 ~ 'ENTITIES'
_Lex086 ~ 'NMTOKEN'
_Lex087 ~ 'NMTOKENS'
_Lex088 ~ 'NOTATION'
_Lex089 ~ '#REQUIRED'
_Lex090 ~ '#IMPLIED'
_Lex091 ~ '#FIXED'
_Lex092 ~ '<!['
_Lex093 ~ 'INCLUDE'
_Lex094 ~ 'IGNORE'
_Lex095 ~ '&#'
_Lex096 ~ ';'
_Lex097 ~ '&#x'
_Lex098 ~ [0-9a-fA-F]
_Lex099 ~ '&'
_Lex100 ~ '%'
_Lex101 ~ '<!ENTITY'
_Lex102 ~ 'SYSTEM'
_Lex103 ~ 'PUBLIC'
_Lex104 ~ 'NDATA'
_Lex105 ~ 'encoding'
_Lex106 ~ [A-Za-z]
_Lex107 ~ [A-Za-z0-9._]
_Lex108 ~ '<!NOTATION'
_Exception001 ~ 'PubidChar - _Lex032'
_Exception002 ~ '_Lex040_any - _Gen086'
_Exception003 ~ 'Char - _Lex024'
_Exception004 ~ 'Char_any - _Gen099'
_Exception005 ~ 'Name - _Gen111'
_Exception006 ~ 'Char_any - _Gen116'
_Exception007 ~ 'Char_any - _Gen258'
#!env perl
#
# This program creates a Marpa BNF of a W3C BNF.
#
# The negation form "term - term" is restricted to cases where
# first and second term consist only of lexemes.
#
use strict;
use warnings FATAL => 'all';
package Actions;
# Constructor for the semantic-action object.
#
# Called as a class method (Marpa is configured with action_object => 'Actions').
# Returns a blessed hash holding every accumulator the action methods fill in:
#   quantifiers          - quantified symbols (_any/_many/_maybe) already generated
#   rules                - list of {lhs, rhs, quantifier} hashes
#   lexemes              - generated lexeme name => lexeme body
#   lexemesWithExclusion - generated name => "term - term" text
#   constraints          - RHS constraint names seen
#   symbols              - every symbol name written to the output
#   start                - lowest-numbered rule seen so far
#   grammar              - final generated text
#
# The former empty prototype "()" was dropped: prototypes are ignored on
# method calls, and it wrongly advertised a sub that takes no argument.
sub new {
    my ($class) = @_;
    my $self = {
        quantifiers          => {},
        rules                => [],
        lexemes              => {},
        lexemesWithExclusion => {},
        constraints          => {},
        symbols              => {},
        start                => {number => undef, rule => ''},
        grammar              => ''
    };
    return bless $self, $class;
}
# Append one "name ~ body" line per lexeme stored under $self->{$key} to the
# array referenced by $rcp, in name order, and count each name in
# $self->{symbols}.
#
# Rendering of the body:
#   - a lone single quote is rewritten (in place) to the class [']
#   - a body starting with "[" followed by anything is emitted as is
#   - a body starting with \x{ is wrapped in [...]
#   - anything else is wrapped in single quotes
sub _pushLexemes {
    my ($self, $rcp, $key) = @_;
    for my $lexeme (sort keys %{$self->{$key}}) {
        my $slot = \$self->{$key}->{$lexeme};
        ${$slot} = '[\']' if ${$slot} eq '\'';
        my $rendered;
        if (${$slot} =~ /^\[.+/) {
            $rendered = ${$slot};
        } elsif (${$slot} =~ /^\\x\{/) {
            $rendered = '[' . ${$slot} . ']';
        } else {
            $rendered = '\'' . ${$slot} . '\'';
        }
        push(@{$rcp}, join(' ', $lexeme, '~', $rendered));
        $self->{symbols}->{$lexeme}++;
    }
}
# Append one "lhs ::= rhs quantifier" line per collected rule to the array
# referenced by $rcp, ordered by LHS name, and count each LHS in
# $self->{symbols}. The quantifier field is always joined in, so rules
# without a quantifier end with a trailing space.
sub _pushG1 {
    my ($self, $rcp) = @_;
    my @ordered = sort {$a->{lhs} cmp $b->{lhs}} @{$self->{rules}};
    for my $rule (@ordered) {
        my @fields = ($rule->{lhs}, '::=', $rule->{rhs}, $rule->{quantifier});
        push(@{$rcp}, join(' ', @fields));
        $self->{symbols}->{$rule->{lhs}}++;
    }
}
# Action for the top-level "rules" symbol: assemble the generated grammar
# text into $self->{grammar} and return $self.
#
# When a start rule has been recorded, the output begins with the
# "inaccessible is ok" default, the :start declaration and an empty line.
# The G1 rules follow, then the plain lexemes, then the lexemes with
# exclusion. The per-rule values passed in @rules are not used.
sub _rules {
    my ($self, @rules) = @_;
    my @lines = defined($self->{start}->{number})
        ? ('inaccessible is ok by default',
           ':start ::= ' . $self->{start}->{rule},
           '')
        : ();
    $self->_pushG1(\@lines);
    for my $section ('lexemes', 'lexemesWithExclusion') {
        $self->_pushLexemes(\@lines, $section);
    }
    $self->{grammar} = join("\n", @lines) . "\n";
    return $self;
}
# Record one rule per alternative in @{$expressions} under LHS $symbol and
# keep track of the start rule. Returns $self.
#
# Parameters:
#   $rulenumber     - text such as "[4a]", or undef for generated rules
#   $symbol         - LHS name
#   $rulesep        - the "::=" token (unused)
#   $expressions    - array ref of RHS strings, one rule pushed for each
#   $lhsconstraints - LHS constraints (unused)
#   $quantifier     - optional "*" or "+"; stored as '' when false
#
# The start rule is the numbered rule with the lowest number. The number is
# taken with an explicit capture instead of $&, so a rule number without
# digits no longer picks up a stale match; such a rule is simply not
# considered as start rule.
sub _rule {
    my ($self, $rulenumber, $symbol, $rulesep, $expressions, $lhsconstraints, $quantifier) = @_;
    foreach my $rhs (@{$expressions}) {
        push(@{$self->{rules}}, {lhs => $symbol, rhs => $rhs, quantifier => $quantifier || ''});
    }
    #
    # Generated rules have number undefined
    #
    if (defined($rulenumber)) {
        my ($number) = $rulenumber =~ /(\d+)/;
        if (defined($number) &&
            (! defined($self->{start}->{number}) ||
             $number < $self->{start}->{number})) {
            $self->{start} = {number => $number, rule => $symbol};
        }
    }
    return $self;
}
# Action for a concatenation: make sure every RHS constraint name has an
# entry in $self->{constraints} (value left undefined) and return the
# exception terms joined with single spaces.
sub _concatenation {
    my ($self, $exceptions, $RHSconstraints) = @_;
    for my $constraint (@{$RHSconstraints}) {
        next if defined($self->{constraints}->{$constraint});
        $self->{constraints}->{$constraint} = undef;
    }
    return join(' ', @{$exceptions});
}
# Build an identifier for a constraint such as "[VC: Root Element Type]".
#
# Parameters:
#   $type - opening token, e.g. "[VC:" or "[rhs_wfc:"
#   $name - constraint body; every non-word character becomes "_"
#   $end  - closing bracket (unused)
#
# Returns "<first word of $type>_<sanitised name>".
#
# The prefix is captured explicitly. The previous code read $& after an
# unchecked match, so a $type without any word character silently reused the
# last match of the substitution performed on $name just before.
sub _constraint {
    my ($self, $type, $name, $end) = @_;
    $name =~ s/[^\w]/_/g;
    my ($prefix) = $type =~ /(\w+)/;
    $prefix //= '';
    return join('_', $prefix, $name);
}
# Decode one character token.
#
# A token is either a _HEX ("#x" followed by hexadecimal digits), which is
# turned into the corresponding character, or a _CHAR_RANGE, which is
# returned unchanged.
sub _char {
    my ($self, $char) = @_;
    return ($char =~ /^\#x(.*)/) ? chr(hex($1)) : $char;
}
# Return $chr unchanged when it contains no whitespace, contains at least one
# ASCII character and contains no control character; otherwise return the
# \x{...} escape of its first character.
sub _printable {
    my ($self, $chr) = @_;
    my $keep = $chr !~ /[\s]/
            && $chr =~ /[[:ascii:]]/
            && $chr !~ /[[:cntrl:]]/;
    return $keep ? $chr : sprintf('\\x{%x}', ord($chr));
}
# Action for "[^ ranges ]": register the negated character class as a lexeme
# and return the lexeme name given by _factor.
sub _factorCaretRange {
    my ($self, undef, undef, $ranges) = @_;
    my $class = '[^' . $ranges . ']';
    return $self->_factor($class);
}
# Action for "[ ranges ]": register the character class as a lexeme and
# return the lexeme name given by _factor.
sub _factorRange {
    my ($self, undef, $ranges) = @_;
    my $class = '[' . $ranges . ']';
    return $self->_factor($class);
}
# Action for "ranges": glue the already rendered ranges together, without
# separator, into the body of a character class.
sub _ranges {
    my ($self, @pieces) = @_;
    my $body = '';
    $body .= $_ for @pieces;
    return $body;
}
# Render a range for use inside a character class: "first-last" when $char2
# is defined, otherwise just the single character. Both ends go through
# _printable.
sub _range {
    my ($self, $char1, $char2) = @_;
    my $first = $self->_printable($char1);
    return $first unless defined($char2);
    my $last = $self->_printable($char2);
    return "$first-$last";
}
# Action for a range made of a single CHAR: decode it with _char and render
# it with _range.
sub _range1 {
    my ($self, $char) = @_;
    my $decoded = $self->_char($char);
    return $self->_range($decoded);
}
# Action for "CHAR - CHAR": decode both ends with _char (first end first) and
# render the pair with _range. The MINUS token itself is ignored.
sub _range2 {
    my ($self, $char1, undef, $char2) = @_;
    my $low  = $self->_char($char1);
    my $high = $self->_char($char2);
    return $self->_range($low, $high);
}
# Action for a parenthesised group: create a generated symbol _GenNNN, where
# NNN is one more than the number of rules recorded so far, register the
# group's alternatives under it via _rule, and return the symbol name.
sub _factorExpressions {
    my ($self, undef, $expressions) = @_;
    my $next = 1 + @{$self->{rules}};
    my $symbol = sprintf('_Gen%03d', $next);
    $self->_rule(undef, $symbol, undef, $expressions, []);
    return $symbol;
}
# Return the lexeme name associated with $value in $self->{lexemes}.
#
# An existing entry with the same body is reused; otherwise a new name
# _LexNNN is created, NNN being one more than the number of lexemes known so
# far, and the body is stored under it.
sub _factor {
    my ($self, $value) = @_;
    for my $known (keys %{$self->{lexemes}}) {
        return $known if $self->{lexemes}->{$known} eq $value;
    }
    my $fresh = sprintf('_Lex%03d', 1 + scalar(keys %{$self->{lexemes}}));
    $self->{lexemes}->{$fresh} = $value;
    return $fresh;
}
# Action for a quoted string: the surrounding quote tokens are dropped and
# the string content is registered as a lexeme through _factor.
sub _factorString {
    my ($self, undef, $content) = @_;
    return $self->_factor($content);
}
# Action for a run of HEX tokens: each one is decoded with _char, made
# printable with _printable, and the concatenation is registered as a lexeme
# through _factor.
sub _hexMany {
    my ($self, @hex) = @_;
    my $body = '';
    for my $token (@hex) {
        $body .= $self->_printable($self->_char($token));
    }
    return $self->_factor($body);
}
# Action for "factor QUANTIFIER": return the name of a helper symbol that
# stands for the quantified factor, creating its rules on first use.
#
#   factor*  ->  <factor>_any   ::= factor *
#   factor+  ->  <factor>_many  ::= factor +
#   factor?  ->  <factor>_maybe ::= factor
#                <factor>_maybe ::=            (empty alternative)
#
# Dies on any other quantifier.
#
# Fix: the empty alternative of "?" used to be requested with an empty list
# of expressions, and _rule pushes one rule per expression, so nothing was
# recorded and every optional symbol was in fact mandatory. It is now
# requested with a single empty RHS.
# Already generated symbols are tracked under the "quantifiers" key, the one
# the constructor initialises.
sub _termFactorQuantifier {
    my ($self, $factor, $quantifier) = @_;
    my %suffix = ('*' => 'any', '+' => 'many', '?' => 'maybe');
    if (! exists($suffix{$quantifier})) {
        die "Unsupported quantifier '$quantifier'";
    }
    my $symbol = sprintf('%s_%s', $factor, $suffix{$quantifier});
    if (! exists($self->{quantifiers}->{$symbol})) {
        if ($quantifier eq '?') {
            $self->_rule(undef, $symbol, undef, [ "$factor" ], []);
            # One empty RHS, so that an empty rule really is recorded
            $self->_rule(undef, $symbol, undef, [ '' ], []);
        } else {
            $self->_rule(undef, $symbol, undef, [ $factor ], [], $quantifier);
        }
        $self->{quantifiers}->{$symbol}++;
    }
    return $symbol;
}
# Action for "term - term".
#
# The text "term1 - term2" is stored as a lexeme with exclusion: an existing
# entry with the same text is reused, otherwise a new _ExceptionNNN name is
# created and a warning is printed on STDERR. A generated symbol _GenNNN
# (NNN being one more than the number of rules recorded so far) is then
# declared with that lexeme as its only RHS, and its name is returned.
sub _exceptionTermMinusTerm {
    my ($self, $term1, $minus, $term2) = @_;
    my $value = join(' ', $term1, $minus, $term2);
    my $name;
    for my $candidate (keys %{$self->{lexemesWithExclusion}}) {
        next unless $self->{lexemesWithExclusion}->{$candidate} eq $value;
        $name = $candidate;
        last;
    }
    unless (defined($name)) {
        $name = sprintf('_Exception%03d', 1 + scalar(keys %{$self->{lexemesWithExclusion}}));
        $self->{lexemesWithExclusion}->{$name} = $value;
        print STDERR "[WARN] Lexeme with exclusion: $name ::= $value\n";
    }
    my $symbol = sprintf('_Gen%03d', 1 + @{$self->{rules}});
    $self->_rule(undef, $symbol, undef, [ $name ], []);
    return $symbol;
}
package main;
use Marpa::R2;
use Data::Section -setup;
use Data::Dumper;
use File::Spec;
our $DATA = __PACKAGE__->local_section_data;
# Grammar and test suite are in __DATA__
# --------------------------------------
#
# Driver:
#   1. parse the W3C EBNF held in the 'xml_1_0.bnf' section with the
#      meta-grammar held in the 'grammar_source' section;
#   2. fetch at most two parse values and dump the grammar generated by each
#      one into jddNNN.txt in the system temporary directory;
#   3. with exactly one value (unambiguous parse), print the generated
#      grammar on STDOUT and compile it with Marpa to validate it.
#
# Exit status is 0 only when there is exactly one parse value.
my $grammar = Marpa::R2::Scanless::G->new( { source => $DATA->{'grammar_source'}, action_object => 'Actions' });
my $recce = Marpa::R2::Scanless::R->new( {grammar => $grammar
# , trace_terminals => 1
});
eval {$recce->read($DATA->{'xml_1_0.bnf'})} || do {print STDERR "$@\n" . $recce->show_progress(); exit(1)};
my $nbvalue = 0;
my @filenames = ();
my @values = ();
while (defined(my $value = $recce->value)) {
    push(@values, $value);
    ++$nbvalue;
    # Portable temporary location instead of a hard-coded Windows path
    my $filename = File::Spec->catfile(File::Spec->tmpdir(), sprintf('jdd%03d.txt', $nbvalue));
    push(@filenames, $filename);
    open(my $fh, '>', $filename) || die "Cannot open $filename, $!\n";
    # The value is a reference to the Actions object: dump the grammar it
    # generated, not the stringified reference, so that files can be compared
    print {$fh} ${$value}->{grammar};
    close($fh) || warn "Cannot close $filename, $!\n";
    # Two values are enough to know that the parse is ambiguous
    last if ($nbvalue >= 2);
}
my $status = 0;
if ($nbvalue != 1) {
    print STDERR "Oups, \$nbvalue != 1\n";
    if (@filenames) {
        print STDERR "Please compare @filenames\n";
    }
    $status = 1;
} else {
    print STDERR "Good, \$nbvalue=$nbvalue, c.f. $filenames[-1]\n";
    print ${$values[0]}->{grammar};
    # Compiling the generated grammar validates it; Marpa dies on error
    Marpa::R2::Scanless::G->new( { source => \${$values[0]}->{grammar} });
    print STDERR "Symbols: " . join(' ', sort keys %{${$values[0]}->{symbols}}) . "\n";
}
exit($status);
__DATA__
__[ grammar_source ]__
# Marpa SLIF meta-grammar describing the W3C EBNF notation used by the XML
# specification. Actions whose name starts with "_" are methods of the
# Actions package; "::first" and "[values]" are Marpa built-ins.
:start ::= rules
:default ::= action => ::first
lexeme default = latm => 1
#
# Inspired by ebnf-ebx.el from Emacs's ebnf2ps package
#
# The XML Spec seems to apply [WFC:] and [VC:] constraints to all rules that are on the
# same line.
# This would have complicated the grammar too much, given that almost all constraints
# apply to all RHSs of a given LHS.
# The exceptions are marked with [RHS_WFC:] and [RHS_VC:], i.e. when the constraint is on a
# specific RHS member.
#
# Structural rules
#
rules ::= rule+ action => _rules
rule ::= RULENUMBER SYMBOL RULESEP expressions LHSconstraints action => _rule
expressions ::= concatenation+ separator => PIPE action => [values]
concatenation ::= exceptions RHSconstraints action => _concatenation
exceptions ::= exception+ action => [values]
exception ::= term
| term MINUS term action => _exceptionTermMinusTerm
term ::= factor
| factor QUANTIFIER action => _termFactorQuantifier
hexMany ::= HEX+ action => _hexMany
factor ::= hexMany
| LBRACKET ranges RBRACKET action => _factorRange
| LBRACKET CARET ranges RBRACKET action => _factorCaretRange
| DQUOTE STRINGDQUOTE DQUOTE action => _factorString
| SQUOTE STRINGSQUOTE SQUOTE action => _factorString
| LPAREN expressions RPAREN action => _factorExpressions
| SYMBOL
ranges ::= range+ action => _ranges
range ::= CHAR action => _range1
| CHAR MINUS CHAR action => _range2
#
# Constraints: those following a whole rule (LHS) and those attached to a
# single alternative (RHS)
#
LHSconstraint ::= LHSWfcConstraint
| LHSVcConstraint
LHSconstraints ::= LHSconstraint* action => [values]
LHSWfcConstraint ::= LHSWFCSTART CONSTRAINTBODY RBRACKET action => _constraint
LHSVcConstraint ::= LHSVCSTART CONSTRAINTBODY RBRACKET action => _constraint
RHSconstraint ::= RHSWfcConstraint
| RHSVcConstraint
RHSconstraints ::= RHSconstraint* action => [values]
RHSWfcConstraint ::= RHSWFCSTART CONSTRAINTBODY RBRACKET action => _constraint
RHSVcConstraint ::= RHSVCSTART CONSTRAINTBODY RBRACKET action => _constraint
#
# Lexemes
#
RULESEP ~ '::='
PIPE ~ '|'
MINUS ~ '-'
QUANTIFIER ~ '*' | '+' | '?'
HEX ~ _HEX
CHAR ~ _CHAR
LBRACKET ~ '['
RBRACKET ~ ']'
LPAREN ~ '('
RPAREN ~ ')'
CARET ~ '^'
DQUOTE ~ '"'
SQUOTE ~ [']
STRINGDQUOTE ~ _STRING_DQUOTE_UNIT*
STRINGSQUOTE ~ _STRING_SQUOTE_UNIT*
SYMBOL ~ _SYMBOL_START _SYMBOL_END
# Constraint openers are matched case-insensitively (":i")
LHSWFCSTART ~ '[wfc:':i
LHSVCSTART ~ '[vc:':i
RHSWFCSTART ~ '[rhs_wfc:':i
RHSVCSTART ~ '[rhs_vc:':i
# Rule number such as [4] or [4a]
RULENUMBER ~ '[' _RULE_NUMBER_START _RULE_NUMBER_END ']'
_STRING_DQUOTE_UNIT ~ [^"] | '\"'
_STRING_SQUOTE_UNIT ~ [^'] | '\' [']
_HEX ~ __HEX_START __HEX_END
# Inside a character class: any character except line breaks, tabs and the
# class metacharacters, or one of the metacharacters preceded by a backslash
_CHAR_RANGE ~ [^\r\n\t\v\f\]\[\-\^]
| '\['
| '\]'
| '\-'
| '\^'
_CHAR ~ _HEX | _CHAR_RANGE
CONSTRAINTBODY ~ [^\]]*
# A symbol starts with a letter, followed by letters, "-" or "_" (no digits)
_SYMBOL_START ~ [a-zA-Z]
_SYMBOL_END ~ [-_a-zA-Z]*
_RULE_NUMBER_START ~ [\d]+
_RULE_NUMBER_END ~ [a-zA-Z]*
__HEX_START ~ '#x'
__HEX_END ~ [0-9A-Fa-f]+
############################################################################
# Discard of a C comment, c.f. https://gist.github.com/jeffreykegler/5015057
############################################################################
<C style comment> ~ '/*' <comment interior> '*/'
<comment interior> ~
<optional non stars>
<optional star prefixed segments>
<optional pre final stars>
<optional non stars> ~ [^*]*
<optional star prefixed segments> ~ <star prefixed segment>*
<star prefixed segment> ~ <stars> [^/*] <optional star free text>
<stars> ~ [*]+
<optional star free text> ~ [^*]*
<optional pre final stars> ~ [*]*
:discard ~ <C style comment>
#################
# Generic discard
#################
__SPACE_ANY ~ [\s]+
:discard ~ __SPACE_ANY
__[ xml_1_0.bnf ]__
[1] document ::= prolog element Misc*
[2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */
[3] S ::= (#x20 | #x9 | #xD | #xA)+
[4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
[4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
[5] Name ::= NameStartChar (NameChar)*
[6] Names ::= Name (#x20 Name)*
[7] Nmtoken ::= (NameChar)+
[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
[9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
| "'" ([^%&'] | PEReference | Reference)* "'"
[10] AttValue ::= '"' ([^<&"] | Reference)* '"'
| "'" ([^<&'] | Reference)* "'"
[11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
[12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
[13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [\-'()+,./:=?;!*#@$_%]
[14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
[15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
[16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
[17] PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
[18] CDSect ::= CDStart CData CDEnd
[19] CDStart ::= '<![CDATA['
[20] CData ::= (Char* - (Char* ']]>' Char*))
[21] CDEnd ::= ']]>'
[22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
[23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
[24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
[25] Eq ::= S? '=' S?
[26] VersionNum ::= '1.' [0-9]+
[27] Misc ::= Comment | PI | S
[28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>' [VC: Root Element Type]
[WFC: External Subset]
[28a] DeclSep ::= PEReference | S [WFC: PE Between Declarations]
[28b] intSubset ::= (markupdecl | DeclSep)*
[29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment [VC: Proper Declaration/PE Nesting]
[WFC: PEs in Internal Subset]
[30] extSubset ::= TextDecl? extSubsetDecl
[31] extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)*
[32] SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"')) [VC: Standalone Document Declaration]
[39] element ::= EmptyElemTag
| STag content ETag [WFC: Element Type Match]
[VC: Element Valid]
[40] STag ::= '<' Name (S Attribute)* S? '>' [WFC: Unique Att Spec]
[41] Attribute ::= Name Eq AttValue [VC: Attribute Value Type]
[WFC: No External Entity References]
[WFC: No < in Attribute Values]
[42] ETag ::= '</' Name S? '>'
[43] content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
[44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' [WFC: Unique Att Spec]
[45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' [VC: Unique Element Type Declaration]
[46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | children
[47] children ::= (choice | seq) ('?' | '*' | '+')?
[48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
[49] choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')' [VC: Proper Group/PE Nesting]
[50] seq ::= '(' S? cp ( S? ',' S? cp )* S? ')' [VC: Proper Group/PE Nesting]
[51] Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*'
| '(' S? '#PCDATA' S? ')' [VC: Proper Group/PE Nesting]
[VC: No Duplicate Types]
[52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
[53] AttDef ::= S Name S AttType S DefaultDecl
[54] AttType ::= StringType | TokenizedType | EnumeratedType
[55] StringType ::= 'CDATA'
[56] TokenizedType ::= 'ID' [RHS_VC: ID]
[RHS_VC: One ID per Element Type]
[RHS_VC: ID Attribute Default]
| 'IDREF' [RHS_VC: IDREF]
| 'IDREFS' [RHS_VC: IDREF]
| 'ENTITY' [RHS_VC: Entity Name]
| 'ENTITIES' [RHS_VC: Entity Name]
| 'NMTOKEN' [RHS_VC: Name Token]
| 'NMTOKENS' [RHS_VC: Name Token]
[57] EnumeratedType ::= NotationType | Enumeration
[58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' [VC: Notation Attributes]
[VC: One Notation Per Element Type]
[VC: No Notation on Empty Element]
[VC: No Duplicate Tokens]
[59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' [VC: Enumeration]
[VC: No Duplicate Tokens]
[60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
| (('#FIXED' S)? AttValue) [VC: Required Attribute]
[VC: Attribute Default Value Syntactically Correct]
[WFC: No < in Attribute Values]
[VC: Fixed Attribute Default]
[WFC: No External Entity References]
[61] conditionalSect ::= includeSect | ignoreSect
[62] includeSect ::= '<![' S? 'INCLUDE' S? '[' extSubsetDecl ']]>' [VC: Proper Conditional Section/PE Nesting]
[63] ignoreSect ::= '<![' S? 'IGNORE' S? '[' ignoreSectContents* ']]>' [VC: Proper Conditional Section/PE Nesting]
[64] ignoreSectContents ::= Ignore ('<![' ignoreSectContents ']]>' Ignore)*
[65] Ignore ::= Char* - (Char* ('<![' | ']]>') Char*)
[66] CharRef ::= '&#' [0-9]+ ';'
| '&#x' [0-9a-fA-F]+ ';' [WFC: Legal Character]
[67] Reference ::= EntityRef | CharRef
[68] EntityRef ::= '&' Name ';' [WFC: Entity Declared]
[VC: Entity Declared]
[WFC: Parsed Entity]
[WFC: No Recursion]
[69] PEReference ::= '%' Name ';' [VC: Entity Declared]
[WFC: No Recursion]
[WFC: In DTD]
[70] EntityDecl ::= GEDecl | PEDecl
[71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
[72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
[73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
[74] PEDef ::= EntityValue | ExternalID
[75] ExternalID ::= 'SYSTEM' S SystemLiteral
| 'PUBLIC' S PubidLiteral S SystemLiteral
[76] NDataDecl ::= S 'NDATA' S Name [VC: Notation Declared]
[77] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
[78] extParsedEnt ::= TextDecl? content
[80] EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
[81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* /* Encoding name contains only Latin characters */
[82] NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>' [VC: Unique Notation Name]
[83] PublicID ::= 'PUBLIC' S PubidLiteral
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment