Created
June 2, 2014 20:22
-
-
Save jddurand/a16c463d25fb10a148b5 to your computer and use it in GitHub Desktop.
W3C EBNF to Marpa using Marpa
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
inaccessible is ok by default | |
:start ::= document | |
AttDef ::= S Name S AttType S DefaultDecl | |
AttDef_any ::= AttDef * | |
AttType ::= StringType | |
AttType ::= TokenizedType | |
AttType ::= EnumeratedType | |
AttValue ::= _Lex030 _Gen060_any _Lex030 | |
AttValue ::= _Lex032 _Gen063_any _Lex032 | |
AttlistDecl ::= _Lex079 S Name AttDef_any S_maybe _Lex060 | |
Attribute ::= Name Eq AttValue | |
CDEnd ::= _Lex041 | |
CDSect ::= CDStart CData CDEnd | |
CDStart ::= _Lex052 | |
CData ::= _Gen118 | |
Char ::= _Lex001 | |
Char ::= _Lex002 | |
Char ::= _Lex003 | |
Char ::= _Lex004 | |
Char ::= _Lex005 | |
Char ::= _Lex006 | |
CharData ::= _Gen087 | |
CharData_maybe ::= CharData | |
CharRef ::= _Lex095 _Lex026_many _Lex096 | |
CharRef ::= _Lex097 _Lex098_many _Lex096 | |
Char_any ::= Char * | |
Comment ::= _Lex042 _Gen094_any _Lex043 | |
DeclSep ::= PEReference | |
DeclSep ::= S | |
DefaultDecl ::= _Lex089 | |
DefaultDecl ::= _Lex090 | |
DefaultDecl ::= _Gen244 | |
ETag ::= _Lex065 Name S_maybe _Lex060 | |
EmptyElemTag ::= _Lex064 Name _Gen187_any S_maybe _Lex066 | |
EncName ::= _Lex106 _Gen287_any | |
EncodingDecl ::= S _Lex105 Eq _Gen284 | |
EncodingDecl_maybe ::= EncodingDecl | |
EntityDecl ::= GEDecl | |
EntityDecl ::= PEDecl | |
EntityDef ::= EntityValue | |
EntityDef ::= _Gen273 | |
EntityRef ::= _Lex099 Name _Lex096 | |
EntityValue ::= _Lex030 _Gen050_any _Lex030 | |
EntityValue ::= _Lex032 _Gen054_any _Lex032 | |
EnumeratedType ::= NotationType | |
EnumeratedType ::= Enumeration | |
Enumeration ::= _Lex073 S_maybe Nmtoken _Gen239_any S_maybe _Lex075 | |
Eq ::= S_maybe _Lex055 S_maybe | |
ExternalID ::= _Lex102 S SystemLiteral | |
ExternalID ::= _Lex103 S PubidLiteral S SystemLiteral | |
GEDecl ::= _Lex101 S Name S EntityDef S_maybe _Lex060 | |
Ignore ::= _Gen259 | |
Misc ::= Comment | |
Misc ::= PI | |
Misc ::= S | |
Misc_any ::= Misc * | |
Mixed ::= _Lex073 S_maybe _Lex077 _Gen216_any S_maybe _Lex078 | |
Mixed ::= _Lex073 S_maybe _Lex077 S_maybe _Lex075 | |
NDataDecl ::= S _Lex104 S Name | |
NDataDecl_maybe ::= NDataDecl | |
Name ::= NameStartChar _Gen038_any | |
NameChar ::= NameStartChar | |
NameChar ::= _Lex024 | |
NameChar ::= _Lex025 | |
NameChar ::= _Lex026 | |
NameChar ::= _Lex027 | |
NameChar ::= _Lex028 | |
NameChar ::= _Lex029 | |
NameStartChar ::= _Lex008 | |
NameStartChar ::= _Lex009 | |
NameStartChar ::= _Lex010 | |
NameStartChar ::= _Lex011 | |
NameStartChar ::= _Lex012 | |
NameStartChar ::= _Lex013 | |
NameStartChar ::= _Lex014 | |
NameStartChar ::= _Lex015 | |
NameStartChar ::= _Lex016 | |
NameStartChar ::= _Lex017 | |
NameStartChar ::= _Lex018 | |
NameStartChar ::= _Lex019 | |
NameStartChar ::= _Lex020 | |
NameStartChar ::= _Lex021 | |
NameStartChar ::= _Lex022 | |
NameStartChar ::= _Lex023 | |
Names ::= Name _Gen041_any | |
Nmtoken ::= _Gen044_many | |
Nmtokens ::= Nmtoken _Gen047_any | |
NotationDecl ::= _Lex108 S Name S _Gen291 S_maybe _Lex060 | |
NotationType ::= _Lex088 S _Lex073 S_maybe Name _Gen236_any S_maybe _Lex075 | |
PEDecl ::= _Lex101 S _Lex100 S Name S PEDef S_maybe _Lex060 | |
PEDef ::= EntityValue | |
PEDef ::= ExternalID | |
PEReference ::= _Lex100 Name _Lex096 | |
PI ::= _Lex044 PITarget _Gen102_maybe _Lex045 | |
PITarget ::= _Gen112 | |
PubidChar ::= _Lex007 | |
PubidChar ::= _Lex003 | |
PubidChar ::= _Lex002 | |
PubidChar ::= _Lex038 | |
PubidChar ::= _Lex039 | |
PubidChar_any ::= PubidChar * | |
PubidLiteral ::= _Lex030 PubidChar_any _Lex030 | |
PubidLiteral ::= _Lex032 _Gen076_any _Lex032 | |
PublicID ::= _Lex103 S PubidLiteral | |
Reference ::= EntityRef | |
Reference ::= CharRef | |
S ::= _Gen009_many | |
SDDecl ::= S _Lex061 Eq _Gen168 | |
SDDecl_maybe ::= SDDecl | |
STag ::= _Lex064 Name _Gen173_any S_maybe _Lex060 | |
S_maybe ::= S | |
StringType ::= _Lex080 | |
SystemLiteral ::= _Gen069 | |
SystemLiteral ::= _Gen071 | |
TextDecl ::= _Lex053 VersionInfo_maybe EncodingDecl S_maybe _Lex045 | |
TextDecl_maybe ::= TextDecl | |
TokenizedType ::= _Lex081 | |
TokenizedType ::= _Lex082 | |
TokenizedType ::= _Lex083 | |
TokenizedType ::= _Lex084 | |
TokenizedType ::= _Lex085 | |
TokenizedType ::= _Lex086 | |
TokenizedType ::= _Lex087 | |
VersionInfo ::= S _Lex054 Eq _Gen129 | |
VersionInfo_maybe ::= VersionInfo | |
VersionNum ::= _Lex056 _Lex026_many | |
XMLDecl ::= _Lex053 VersionInfo EncodingDecl_maybe SDDecl_maybe S_maybe _Lex045 | |
XMLDecl_maybe ::= XMLDecl | |
_Gen009 ::= _Lex007 | |
_Gen009 ::= _Lex001 | |
_Gen009 ::= _Lex003 | |
_Gen009 ::= _Lex002 | |
_Gen009_many ::= _Gen009 + | |
_Gen038 ::= NameChar | |
_Gen038_any ::= _Gen038 * | |
_Gen041 ::= _Lex007 Name | |
_Gen041_any ::= _Gen041 * | |
_Gen044 ::= NameChar | |
_Gen044_many ::= _Gen044 + | |
_Gen047 ::= _Lex007 Nmtoken | |
_Gen047_any ::= _Gen047 * | |
_Gen050 ::= _Lex031 | |
_Gen050 ::= PEReference | |
_Gen050 ::= Reference | |
_Gen050_any ::= _Gen050 * | |
_Gen054 ::= _Lex033 | |
_Gen054 ::= PEReference | |
_Gen054 ::= Reference | |
_Gen054_any ::= _Gen054 * | |
_Gen060 ::= _Lex034 | |
_Gen060 ::= Reference | |
_Gen060_any ::= _Gen060 * | |
_Gen063 ::= _Lex035 | |
_Gen063 ::= Reference | |
_Gen063_any ::= _Gen063 * | |
_Gen069 ::= _Lex030 _Lex036_any _Lex030 | |
_Gen071 ::= _Lex032 _Lex037_any _Lex032 | |
_Gen075 ::= _Exception001 | |
_Gen076 ::= _Gen075 | |
_Gen076_any ::= _Gen076 * | |
_Gen086 ::= _Lex040_any _Lex041 _Lex040_any | |
_Gen087 ::= _Exception002 | |
_Gen089 ::= _Exception003 | |
_Gen090 ::= _Gen089 | |
_Gen091 ::= _Exception003 | |
_Gen092 ::= _Gen091 | |
_Gen093 ::= _Lex024 _Gen092 | |
_Gen094 ::= _Gen090 | |
_Gen094 ::= _Gen093 | |
_Gen094_any ::= _Gen094 * | |
_Gen099 ::= Char_any _Lex045 Char_any | |
_Gen100 ::= _Exception004 | |
_Gen101 ::= _Gen100 | |
_Gen102 ::= S _Gen101 | |
_Gen102_maybe ::= _Gen102 | |
_Gen105 ::= _Lex046 | |
_Gen105 ::= _Lex047 | |
_Gen107 ::= _Lex048 | |
_Gen107 ::= _Lex049 | |
_Gen109 ::= _Lex050 | |
_Gen109 ::= _Lex051 | |
_Gen111 ::= _Gen105 _Gen107 _Gen109 | |
_Gen112 ::= _Exception005 | |
_Gen116 ::= Char_any _Lex041 Char_any | |
_Gen117 ::= _Exception006 | |
_Gen118 ::= _Gen117 | |
_Gen122 ::= doctypedecl Misc_any | |
_Gen122_maybe ::= _Gen122 | |
_Gen129 ::= _Lex032 VersionNum _Lex032 | |
_Gen129 ::= _Lex030 VersionNum _Lex030 | |
_Gen138 ::= S ExternalID | |
_Gen138_maybe ::= _Gen138 | |
_Gen140 ::= _Lex058 intSubset _Lex059 S_maybe | |
_Gen140_maybe ::= _Gen140 | |
_Gen145 ::= markupdecl | |
_Gen145 ::= DeclSep | |
_Gen145_any ::= _Gen145 * | |
_Gen157 ::= markupdecl | |
_Gen157 ::= conditionalSect | |
_Gen157 ::= DeclSep | |
_Gen157_any ::= _Gen157 * | |
_Gen162 ::= _Lex062 | |
_Gen162 ::= _Lex063 | |
_Gen164 ::= _Lex032 _Gen162 _Lex032 | |
_Gen165 ::= _Lex062 | |
_Gen165 ::= _Lex063 | |
_Gen167 ::= _Lex030 _Gen165 _Lex030 | |
_Gen168 ::= _Gen164 | |
_Gen168 ::= _Gen167 | |
_Gen173 ::= S Attribute | |
_Gen173_any ::= _Gen173 * | |
_Gen179 ::= element | |
_Gen179 ::= Reference | |
_Gen179 ::= CDSect | |
_Gen179 ::= PI | |
_Gen179 ::= Comment | |
_Gen184 ::= _Gen179 CharData_maybe | |
_Gen184_any ::= _Gen184 * | |
_Gen187 ::= S Attribute | |
_Gen187_any ::= _Gen187 * | |
_Gen195 ::= choice | |
_Gen195 ::= seq | |
_Gen197 ::= _Lex070 | |
_Gen197 ::= _Lex071 | |
_Gen197 ::= _Lex072 | |
_Gen197_maybe ::= _Gen197 | |
_Gen202 ::= Name | |
_Gen202 ::= choice | |
_Gen202 ::= seq | |
_Gen205 ::= _Lex070 | |
_Gen205 ::= _Lex071 | |
_Gen205 ::= _Lex072 | |
_Gen205_maybe ::= _Gen205 | |
_Gen210 ::= S_maybe _Lex074 S_maybe cp | |
_Gen210_many ::= _Gen210 + | |
_Gen213 ::= S_maybe _Lex076 S_maybe cp | |
_Gen213_any ::= _Gen213 * | |
_Gen216 ::= S_maybe _Lex074 S_maybe Name | |
_Gen216_any ::= _Gen216 * | |
_Gen236 ::= S_maybe _Lex074 S_maybe Name | |
_Gen236_any ::= _Gen236 * | |
_Gen239 ::= S_maybe _Lex074 S_maybe Nmtoken | |
_Gen239_any ::= _Gen239 * | |
_Gen242 ::= _Lex091 S | |
_Gen242_maybe ::= _Gen242 | |
_Gen244 ::= _Gen242_maybe AttValue | |
_Gen253 ::= _Lex092 ignoreSectContents _Lex041 Ignore | |
_Gen253_any ::= _Gen253 * | |
_Gen256 ::= _Lex092 | |
_Gen256 ::= _Lex041 | |
_Gen258 ::= Char_any _Gen256 Char_any | |
_Gen259 ::= _Exception007 | |
_Gen273 ::= ExternalID NDataDecl_maybe | |
_Gen284 ::= _Lex030 EncName _Lex030 | |
_Gen284 ::= _Lex032 EncName _Lex032 | |
_Gen287 ::= _Lex107 | |
_Gen287 ::= _Lex024 | |
_Gen287_any ::= _Gen287 * | |
_Gen291 ::= ExternalID | |
_Gen291 ::= PublicID | |
_Lex026_many ::= _Lex026 + | |
_Lex036_any ::= _Lex036 * | |
_Lex037_any ::= _Lex037 * | |
_Lex040_any ::= _Lex040 * | |
_Lex098_many ::= _Lex098 + | |
children ::= _Gen195 _Gen197_maybe | |
choice ::= _Lex073 S_maybe cp _Gen210_many S_maybe _Lex075 | |
conditionalSect ::= includeSect | |
conditionalSect ::= ignoreSect | |
content ::= CharData_maybe _Gen184_any | |
contentspec ::= _Lex068 | |
contentspec ::= _Lex069 | |
contentspec ::= Mixed | |
contentspec ::= children | |
cp ::= _Gen202 _Gen205_maybe | |
doctypedecl ::= _Lex057 S Name _Gen138_maybe S_maybe _Gen140_maybe _Lex060 | |
document ::= prolog element Misc_any | |
element ::= EmptyElemTag | |
element ::= STag content ETag | |
elementdecl ::= _Lex067 S Name S contentspec S_maybe _Lex060 | |
extParsedEnt ::= TextDecl_maybe content | |
extSubset ::= TextDecl_maybe extSubsetDecl | |
extSubsetDecl ::= _Gen157_any | |
ignoreSect ::= _Lex092 S_maybe _Lex094 S_maybe _Lex058 ignoreSectContents_any _Lex041 | |
ignoreSectContents ::= Ignore _Gen253_any | |
ignoreSectContents_any ::= ignoreSectContents * | |
includeSect ::= _Lex092 S_maybe _Lex093 S_maybe _Lex058 extSubsetDecl _Lex041 | |
intSubset ::= _Gen145_any | |
markupdecl ::= elementdecl | |
markupdecl ::= AttlistDecl | |
markupdecl ::= EntityDecl | |
markupdecl ::= NotationDecl | |
markupdecl ::= PI | |
markupdecl ::= Comment | |
prolog ::= XMLDecl_maybe Misc_any _Gen122_maybe | |
seq ::= _Lex073 S_maybe cp _Gen213_any S_maybe _Lex075 | |
_Lex001 ~ [\x{9}] | |
_Lex002 ~ [\x{a}] | |
_Lex003 ~ [\x{d}] | |
_Lex004 ~ [\x{20}-\x{d7ff}] | |
_Lex005 ~ [\x{e000}-\x{fffd}] | |
_Lex006 ~ [\x{10000}-\x{10ffff}] | |
_Lex007 ~ [\x{20}] | |
_Lex008 ~ ':' | |
_Lex009 ~ [A-Z] | |
_Lex010 ~ '_' | |
_Lex011 ~ [a-z] | |
_Lex012 ~ [\x{c0}-\x{d6}] | |
_Lex013 ~ [\x{d8}-\x{f6}] | |
_Lex014 ~ [\x{f8}-\x{2ff}] | |
_Lex015 ~ [\x{370}-\x{37d}] | |
_Lex016 ~ [\x{37f}-\x{1fff}] | |
_Lex017 ~ [\x{200c}-\x{200d}] | |
_Lex018 ~ [\x{2070}-\x{218f}] | |
_Lex019 ~ [\x{2c00}-\x{2fef}] | |
_Lex020 ~ [\x{3001}-\x{d7ff}] | |
_Lex021 ~ [\x{f900}-\x{fdcf}] | |
_Lex022 ~ [\x{fdf0}-\x{fffd}] | |
_Lex023 ~ [\x{10000}-\x{effff}] | |
_Lex024 ~ '-' | |
_Lex025 ~ '.' | |
_Lex026 ~ [0-9] | |
_Lex027 ~ [\x{b7}] | |
_Lex028 ~ [\x{300}-\x{36f}] | |
_Lex029 ~ [\x{203f}-\x{2040}] | |
_Lex030 ~ '"' | |
_Lex031 ~ [^%&"] | |
_Lex032 ~ ['] | |
_Lex033 ~ [^%&'] | |
_Lex034 ~ [^<&"] | |
_Lex035 ~ [^<&'] | |
_Lex036 ~ [^"] | |
_Lex037 ~ [^'] | |
_Lex038 ~ [a-zA-Z0-9] | |
_Lex039 ~ [\-'()+,./:=?;!*#@$_%] | |
_Lex040 ~ [^<&] | |
_Lex041 ~ ']]>' | |
_Lex042 ~ '<!--' | |
_Lex043 ~ '-->' | |
_Lex044 ~ '<?' | |
_Lex045 ~ '?>' | |
_Lex046 ~ 'X' | |
_Lex047 ~ 'x' | |
_Lex048 ~ 'M' | |
_Lex049 ~ 'm' | |
_Lex050 ~ 'L' | |
_Lex051 ~ 'l' | |
_Lex052 ~ '<![CDATA[' | |
_Lex053 ~ '<?xml' | |
_Lex054 ~ 'version' | |
_Lex055 ~ '=' | |
_Lex056 ~ '1.' | |
_Lex057 ~ '<!DOCTYPE' | |
_Lex058 ~ '[' | |
_Lex059 ~ ']' | |
_Lex060 ~ '>' | |
_Lex061 ~ 'standalone' | |
_Lex062 ~ 'yes' | |
_Lex063 ~ 'no' | |
_Lex064 ~ '<' | |
_Lex065 ~ '</' | |
_Lex066 ~ '/>' | |
_Lex067 ~ '<!ELEMENT' | |
_Lex068 ~ 'EMPTY' | |
_Lex069 ~ 'ANY' | |
_Lex070 ~ '?' | |
_Lex071 ~ '*' | |
_Lex072 ~ '+' | |
_Lex073 ~ '(' | |
_Lex074 ~ '|' | |
_Lex075 ~ ')' | |
_Lex076 ~ ',' | |
_Lex077 ~ '#PCDATA' | |
_Lex078 ~ ')*' | |
_Lex079 ~ '<!ATTLIST' | |
_Lex080 ~ 'CDATA' | |
_Lex081 ~ 'ID' | |
_Lex082 ~ 'IDREF' | |
_Lex083 ~ 'IDREFS' | |
_Lex084 ~ 'ENTITY' | |
_Lex085 ~ 'ENTITIES' | |
_Lex086 ~ 'NMTOKEN' | |
_Lex087 ~ 'NMTOKENS' | |
_Lex088 ~ 'NOTATION' | |
_Lex089 ~ '#REQUIRED' | |
_Lex090 ~ '#IMPLIED' | |
_Lex091 ~ '#FIXED' | |
_Lex092 ~ '<![' | |
_Lex093 ~ 'INCLUDE' | |
_Lex094 ~ 'IGNORE' | |
_Lex095 ~ '&#' | |
_Lex096 ~ ';' | |
_Lex097 ~ '&#x' | |
_Lex098 ~ [0-9a-fA-F] | |
_Lex099 ~ '&' | |
_Lex100 ~ '%' | |
_Lex101 ~ '<!ENTITY' | |
_Lex102 ~ 'SYSTEM' | |
_Lex103 ~ 'PUBLIC' | |
_Lex104 ~ 'NDATA' | |
_Lex105 ~ 'encoding' | |
_Lex106 ~ [A-Za-z] | |
_Lex107 ~ [A-Za-z0-9._] | |
_Lex108 ~ '<!NOTATION' | |
_Exception001 ~ 'PubidChar - _Lex032' | |
_Exception002 ~ '_Lex040_any - _Gen086' | |
_Exception003 ~ 'Char - _Lex024' | |
_Exception004 ~ 'Char_any - _Gen099' | |
_Exception005 ~ 'Name - _Gen111' | |
_Exception006 ~ 'Char_any - _Gen116' | |
_Exception007 ~ 'Char_any - _Gen258' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!env perl | |
# | |
# This program creates a Marpa BNF of a W3C BNF. | |
# | |
# The negation form "term - term" is restricted to cases where | |
# first and second term consist only of lexemes. | |
# | |
use strict; | |
use warnings FATAL => 'all'; | |
package Actions;

#
# Semantic actions used while parsing a W3C EBNF with the meta-grammar stored
# in the 'grammar_source' data section. The action object accumulates G1 rules
# and lexemes, and _rules() finally renders them as Marpa SLIF text into
# $self->{grammar}.
#

# Constructor: returns a new, empty accumulator blessed into the given class.
sub new {
    my ($class) = @_;
    my $self = {
        quantifiers          => {},    # generated *_any / *_many / *_maybe symbols already emitted
        rules                => [],    # list of {lhs, rhs, quantifier} hashes
        lexemes              => {},    # lexeme name => literal string or character class
        lexemesWithExclusion => {},    # lexeme name => "term - term" text
        constraints          => {},    # RHS constraint names seen so far
        symbols              => {},    # every symbol name written to the output
        start                => { number => undef, rule => '' },
        grammar              => ''
    };
    return bless $self, $class;
}

# Look up $value in the table $self->{$key}; when absent, register it under a
# fresh name built with $format and the next sequence number.
# Returns (name, is_new).
sub _internLexeme {
    my ($self, $key, $format, $value) = @_;
    my $table = $self->{$key};
    foreach my $name (keys %{$table}) {
        return ($name, 0) if $table->{$name} eq $value;
    }
    my $name = sprintf($format, 1 + scalar(keys %{$table}));
    $table->{$name} = $value;
    return ($name, 1);
}

# Name of the next generated G1 symbol, numbered after the current rule count.
sub _nextGenSymbol {
    my ($self) = @_;
    return sprintf('_Gen%03d', 1 + scalar(@{$self->{rules}}));
}

# Append one "name ~ value" line per entry of $self->{$key} to @{$rcp},
# sorted by lexeme name, and count each name in $self->{symbols}.
sub _pushLexemes {
    my ($self, $rcp, $key) = @_;
    my $table = $self->{$key};
    foreach my $name (sort keys %{$table}) {
        # A lone single quote cannot be written as '...': store it as a character class.
        $table->{$name} = '[\']' if $table->{$name} eq '\'';
        my $value = $table->{$name};
        my $rhs;
        if ($value =~ /^\[.+/) {
            $rhs = $value;                 # already a character class
        } elsif ($value =~ /^\\x\{/) {
            $rhs = '[' . $value . ']';     # escaped code point(s): wrap in a class
        } else {
            $rhs = '\'' . $value . '\'';   # plain literal string
        }
        push(@{$rcp}, join(' ', $name, '~', $rhs));
        $self->{symbols}->{$name}++;
    }
    return;
}

# Append one "lhs ::= rhs quantifier" line per accumulated rule to @{$rcp},
# sorted by LHS, and count each LHS in $self->{symbols}.
sub _pushG1 {
    my ($self, $rcp) = @_;
    foreach my $rule (sort { $a->{lhs} cmp $b->{lhs} } @{$self->{rules}}) {
        push(@{$rcp}, join(' ', $rule->{lhs}, '::=', $rule->{rhs}, $rule->{quantifier}));
        $self->{symbols}->{$rule->{lhs}}++;
    }
    return;
}

# Action for the top-level 'rules' symbol: renders the whole grammar text into
# $self->{grammar} and returns $self.
sub _rules {
    my ($self) = @_;
    my @rc = ();
    if (defined($self->{start}->{number})) {
        push(@rc, 'inaccessible is ok by default');
        push(@rc, ':start ::= ' . $self->{start}->{rule});
        push(@rc, '');
    }
    $self->_pushG1(\@rc);
    $self->_pushLexemes(\@rc, 'lexemes');
    $self->_pushLexemes(\@rc, 'lexemesWithExclusion');
    $self->{grammar} = join("\n", @rc) . "\n";
    return $self;
}

# Record one rule per alternative in @{$expressions} for LHS $symbol.
# When $rulenumber is defined (a rule coming from the input, e.g. "[4a]"),
# the rule with the smallest number becomes the start rule.
# An empty-string alternative records an empty (nulling) rule.
sub _rule {
    my ($self, $rulenumber, $symbol, $rulesep, $expressions, $lhscontraints, $quantifier) = @_;
    foreach my $rhs (@{$expressions}) {
        push(@{$self->{rules}}, {lhs => $symbol, rhs => $rhs, quantifier => $quantifier || ''});
    }
    #
    # Generated rules have number undefined
    #
    if (defined($rulenumber)) {
        # Capture explicitly instead of reading $& after an unchecked match.
        my ($number) = $rulenumber =~ /(\d+)/;
        if (defined($number) &&
            (! defined($self->{start}->{number}) ||
             $number < $self->{start}->{number})) {
            $self->{start} = {number => $number, rule => $symbol};
        }
    }
    return $self;
}

# Action for 'concatenation': registers the RHS constraint names and returns
# the members joined with single spaces.
sub _concatenation {
    my ($self, $exceptions, $RHSconstraints) = @_;
    foreach my $constraint (@{$RHSconstraints}) {
        $self->{constraints}->{$constraint} //= undef;
    }
    return join(' ', @{$exceptions});
}

# Action for the constraint rules: builds "<TYPE>_<name>" where <TYPE> is the
# first word of the opening lexeme (e.g. "[VC:" gives "VC") and every
# non-word character of the body is replaced by '_'.
sub _constraint {
    my ($self, $type, $name, $end) = @_;
    $name =~ s/[^\w]/_/g;
    my ($kind) = $type =~ /(\w+)/;
    $kind //= '';
    return $kind . '_' . $name;
}

# Decode a CHAR lexeme.
# A char is either a _HEX ("#x" followed by hex digits, decoded to the
# character) or a _CHAR_RANGE (returned unchanged).
sub _char {
    my ($self, $char) = @_;
    if ($char =~ /^\#x(.*)/) {
        return chr(hex($1));
    }
    return $char;
}

# Return $chr unchanged when it is a visible ASCII character, otherwise its
# \x{...} escape (whitespace, control and non-ASCII characters).
sub _printable {
    my ($self, $chr) = @_;
    if ($chr =~ /[\s]/ || $chr !~ /[[:ascii:]]/ || $chr =~ /[[:cntrl:]]/) {
        $chr = sprintf('\\x{%x}', ord($chr));
    }
    return $chr;
}

# Action for a negated character class: interns "[^...]" as a lexeme.
sub _factorCaretRange {
    my ($self, $lbracket, $caret, $ranges, $rbracket) = @_;
    return $self->_factor("[^$ranges]");
}

# Action for a character class: interns "[...]" as a lexeme.
sub _factorRange {
    my ($self, $lbracket, $ranges, $rbracket) = @_;
    return $self->_factor("[$ranges]");
}

# Action for 'ranges': concatenates the already rendered ranges.
sub _ranges {
    my ($self, @ranges) = @_;
    return join('', @ranges);
}

# Render a single character, or a "first-last" range when $char2 is defined,
# with both ends passed through _printable().
sub _range {
    my ($self, $char1, $char2) = @_;
    $char1 = $self->_printable($char1);
    return $char1 if ! defined($char2);
    $char2 = $self->_printable($char2);
    return "$char1-$char2";
}

# Action for 'range ::= CHAR'.
sub _range1 {
    my ($self, $char) = @_;
    return $self->_range($self->_char($char));
}

# Action for 'range ::= CHAR MINUS CHAR'.
sub _range2 {
    my ($self, $char1, $minus, $char2) = @_;
    return $self->_range($self->_char($char1), $self->_char($char2));
}

# Action for a parenthesised group: creates a generated symbol having one
# rule per alternative and returns its name.
sub _factorExpressions {
    my ($self, $lparen, $expressions, $rparen) = @_;
    my $symbol = $self->_nextGenSymbol();
    $self->_rule(undef, $symbol, undef, $expressions, []);
    return $symbol;
}

# Return the lexeme name associated with $value, creating "_LexNNN" on first use.
sub _factor {
    my ($self, $value) = @_;
    my ($name) = $self->_internLexeme('lexemes', '_Lex%03d', $value);
    return $name;
}

# Action for a quoted string: interns the string content as a lexeme.
sub _factorString {
    my ($self, $quote1, $string, $quote2) = @_;
    return $self->_factor($string);
}

# Action for a run of #xN code points: interns their printable form as one lexeme.
sub _hexMany {
    my ($self, @hex) = @_;
    return $self->_factor(join('', map {$self->_printable($self->_char($_))} @hex));
}

# Action for 'factor QUANTIFIER': returns the name of a generated symbol
# "<factor>_any" (*), "<factor>_many" (+) or "<factor>_maybe" (?), emitting
# its rule(s) the first time the symbol is requested.
# Dies on any other quantifier.
sub _termFactorQuantifier {
    my ($self, $factor, $quantifier) = @_;
    my %suffix = ('*' => 'any', '+' => 'many', '?' => 'maybe');
    if (! exists($suffix{$quantifier})) {
        die "Unsupported quantifier '$quantifier'";
    }
    my $symbol = sprintf('%s_%s', $factor, $suffix{$quantifier});
    if (! exists($self->{quantifiers}->{$symbol})) {
        if ($quantifier eq '?') {
            $self->_rule(undef, $symbol, undef, [ "$factor" ], []);
            # The empty alternative must be given as an empty string: an empty
            # expression list records no rule at all, which would make the
            # "optional" symbol mandatory.
            $self->_rule(undef, $symbol, undef, [ '' ], []);
        } else {
            $self->_rule(undef, $symbol, undef, [ $factor ], [], $quantifier);
        }
        $self->{quantifiers}->{$symbol}++;
    }
    return $symbol;
}

# Action for 'term MINUS term': registers the textual exclusion as an
# "_ExceptionNNN" lexeme (warning on STDERR the first time it is seen) and
# returns a generated G1 symbol deriving that lexeme.
sub _exceptionTermMinusTerm {
    my ($self, $term1, $minus, $term2) = @_;
    my $value = "$term1 $minus $term2";
    my ($name, $isNew) = $self->_internLexeme('lexemesWithExclusion', '_Exception%03d', $value);
    if ($isNew) {
        print STDERR "[WARN] Lexeme with exclusion: $name ::= $value\n";
    }
    my $symbol = $self->_nextGenSymbol();
    $self->_rule(undef, $symbol, undef, [ $name ], []);
    return $symbol;
}
package main;
use Marpa::R2;
use Data::Section -setup;
use Data::Dumper;
use File::Spec;

#
# Driver: parse the W3C EBNF stored in the 'xml_1_0.bnf' data section with the
# meta-grammar stored in 'grammar_source', print the generated Marpa grammar
# on STDOUT, and check that Marpa accepts it.
# The grammar text of each parse value is also written to jddNNN.txt in the
# system temporary directory, so that two values can be compared when the
# parse turns out to be ambiguous.
#
our $DATA = __PACKAGE__->local_section_data;

# Grammar and test suite are in __DATA__
# --------------------------------------
my $grammar = Marpa::R2::Scanless::G->new( { source => $DATA->{'grammar_source'}, action_object => 'Actions' });
my $recce = Marpa::R2::Scanless::R->new( {grammar => $grammar
                                          # , trace_terminals => 1
                                         });
# The trailing "1" makes success independent of what read() returns.
eval { $recce->read($DATA->{'xml_1_0.bnf'}); 1 } || do {
    print STDERR "$@\n" . $recce->show_progress();
    exit(1);
};

my @filenames = ();
my @values = ();
while (defined(my $valueRef = $recce->value)) {
    push(@values, $valueRef);
    my $filename = File::Spec->catfile(File::Spec->tmpdir(), sprintf('jdd%03d.txt', scalar(@values)));
    push(@filenames, $filename);
    open(my $fh, '>', $filename) || die "Cannot open $filename, $!\n";
    # Write the generated grammar text: the value itself is a reference to the
    # action object, which would only print as "Actions=HASH(0x...)".
    print {$fh} ${$valueRef}->{grammar};
    close($fh) || warn "Cannot close $filename, $!\n";
    # Two values are enough to know that the parse is ambiguous
    last if (scalar(@values) >= 2);
}
my $nbvalue = scalar(@values);

if ($nbvalue != 1) {
    print STDERR "Oups, \$nbvalue != 1\n";
    if (@filenames) {
        print STDERR "Please compare @filenames\n";
    }
} else {
    print STDERR "Good, \$nbvalue=$nbvalue, c.f. $filenames[-1]\n";
    print ${$values[0]}->{grammar};
    # Sanity check: the generated text must itself be a valid SLIF grammar
    Marpa::R2::Scanless::G->new( { source => \${$values[0]}->{grammar} });
    print STDERR "Symbols: " . join(' ', sort keys %{${$values[0]}->{symbols}}) . "\n";
}
exit(0);
__DATA__ | |
__[ grammar_source ]__ | |
:start ::= rules | |
:default ::= action => ::first | |
lexeme default = latm => 1 | |
# | |
# Inspired by ebnf-ebx.el from Emacs's ebnf2ps package | |
# | |
# The XML spec seems to apply [WFC:] and [VC:] constraints to all rules that are on the
# same line.
# Modelling this exactly would have complicated the grammar too much, given that almost
# all constraints apply to all RHSs of a given LHS.
# The exceptions are marked with [RHS_WFC:] and [RHS_VC:], i.e. when the constraint is on a
# specific RHS member.
rules ::= rule+ action => _rules | |
rule ::= RULENUMBER SYMBOL RULESEP expressions LHSconstraints action => _rule | |
expressions ::= concatenation+ separator => PIPE action => [values] | |
concatenation ::= exceptions RHSconstraints action => _concatenation | |
exceptions ::= exception+ action => [values] | |
exception ::= term | |
| term MINUS term action => _exceptionTermMinusTerm | |
term ::= factor | |
| factor QUANTIFIER action => _termFactorQuantifier | |
hexMany ::= HEX+ action => _hexMany | |
factor ::= hexMany | |
| LBRACKET ranges RBRACKET action => _factorRange | |
| LBRACKET CARET ranges RBRACKET action => _factorCaretRange | |
| DQUOTE STRINGDQUOTE DQUOTE action => _factorString | |
| SQUOTE STRINGSQUOTE SQUOTE action => _factorString | |
| LPAREN expressions RPAREN action => _factorExpressions | |
| SYMBOL | |
ranges ::= range+ action => _ranges | |
range ::= CHAR action => _range1 | |
| CHAR MINUS CHAR action => _range2 | |
LHSconstraint ::= LHSWfcConstraint | |
| LHSVcConstraint | |
LHSconstraints ::= LHSconstraint* action => [values] | |
LHSWfcConstraint ::= LHSWFCSTART CONSTRAINTBODY RBRACKET action => _constraint | |
LHSVcConstraint ::= LHSVCSTART CONSTRAINTBODY RBRACKET action => _constraint | |
RHSconstraint ::= RHSWfcConstraint | |
| RHSVcConstraint | |
RHSconstraints ::= RHSconstraint* action => [values] | |
RHSWfcConstraint ::= RHSWFCSTART CONSTRAINTBODY RBRACKET action => _constraint | |
RHSVcConstraint ::= RHSVCSTART CONSTRAINTBODY RBRACKET action => _constraint | |
RULESEP ~ '::=' | |
PIPE ~ '|' | |
MINUS ~ '-' | |
QUANTIFIER ~ '*' | '+' | '?' | |
HEX ~ _HEX | |
CHAR ~ _CHAR | |
LBRACKET ~ '[' | |
RBRACKET ~ ']' | |
LPAREN ~ '(' | |
RPAREN ~ ')' | |
CARET ~ '^' | |
DQUOTE ~ '"' | |
SQUOTE ~ ['] | |
STRINGDQUOTE ~ _STRING_DQUOTE_UNIT* | |
STRINGSQUOTE ~ _STRING_SQUOTE_UNIT* | |
SYMBOL ~ _SYMBOL_START _SYMBOL_END | |
LHSWFCSTART ~ '[wfc:':i | |
LHSVCSTART ~ '[vc:':i | |
RHSWFCSTART ~ '[rhs_wfc:':i | |
RHSVCSTART ~ '[rhs_vc:':i | |
RULENUMBER ~ '[' _RULE_NUMBER_START _RULE_NUMBER_END ']' | |
_STRING_DQUOTE_UNIT ~ [^"] | '\"' | |
_STRING_SQUOTE_UNIT ~ [^'] | '\' ['] | |
_HEX ~ __HEX_START __HEX_END | |
_CHAR_RANGE ~ [^\r\n\t\v\f\]\[\-\^] | |
| '\[' | |
| '\]' | |
| '\-' | |
| '\^' | |
_CHAR ~ _HEX | _CHAR_RANGE | |
CONSTRAINTBODY ~ [^\]]* | |
_SYMBOL_START ~ [a-zA-Z] | |
_SYMBOL_END ~ [-_a-zA-Z]* | |
_RULE_NUMBER_START ~ [\d]+ | |
_RULE_NUMBER_END ~ [a-zA-Z]* | |
__HEX_START ~ '#x' | |
__HEX_END ~ [0-9A-Fa-f]+ | |
############################################################################ | |
# Discard of a C comment, c.f. https://gist.github.com/jeffreykegler/5015057 | |
############################################################################ | |
<C style comment> ~ '/*' <comment interior> '*/' | |
<comment interior> ~ | |
<optional non stars> | |
<optional star prefixed segments> | |
<optional pre final stars> | |
<optional non stars> ~ [^*]* | |
<optional star prefixed segments> ~ <star prefixed segment>* | |
<star prefixed segment> ~ <stars> [^/*] <optional star free text> | |
<stars> ~ [*]+ | |
<optional star free text> ~ [^*]* | |
<optional pre final stars> ~ [*]* | |
:discard ~ <C style comment> | |
################# | |
# Generic discard | |
################# | |
__SPACE_ANY ~ [\s]+ | |
:discard ~ __SPACE_ANY | |
__[ xml_1_0.bnf ]__ | |
[1] document ::= prolog element Misc* | |
[2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */ | |
[3] S ::= (#x20 | #x9 | #xD | #xA)+ | |
[4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] | |
[4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] | |
[5] Name ::= NameStartChar (NameChar)* | |
[6] Names ::= Name (#x20 Name)* | |
[7] Nmtoken ::= (NameChar)+ | |
[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)* | |
[9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"' | |
| "'" ([^%&'] | PEReference | Reference)* "'" | |
[10] AttValue ::= '"' ([^<&"] | Reference)* '"' | |
| "'" ([^<&'] | Reference)* "'" | |
[11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") | |
[12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" | |
[13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [\-'()+,./:=?;!*#@$_%] | |
[14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) | |
[15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' | |
[16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' | |
[17] PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) | |
[18] CDSect ::= CDStart CData CDEnd | |
[19] CDStart ::= '<![CDATA[' | |
[20] CData ::= (Char* - (Char* ']]>' Char*)) | |
[21] CDEnd ::= ']]>' | |
[22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? | |
[23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' | |
[24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"') | |
[25] Eq ::= S? '=' S? | |
[26] VersionNum ::= '1.' [0-9]+ | |
[27] Misc ::= Comment | PI | S | |
[28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>' [VC: Root Element Type] | |
[WFC: External Subset] | |
[28a] DeclSep ::= PEReference | S [WFC: PE Between Declarations] | |
[28b] intSubset ::= (markupdecl | DeclSep)* | |
[29] markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment [VC: Proper Declaration/PE Nesting] | |
[WFC: PEs in Internal Subset] | |
[30] extSubset ::= TextDecl? extSubsetDecl | |
[31] extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep)* | |
[32] SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"')) [VC: Standalone Document Declaration] | |
[39] element ::= EmptyElemTag | |
| STag content ETag [WFC: Element Type Match] | |
[VC: Element Valid] | |
[40] STag ::= '<' Name (S Attribute)* S? '>' [WFC: Unique Att Spec] | |
[41] Attribute ::= Name Eq AttValue [VC: Attribute Value Type] | |
[WFC: No External Entity References] | |
[WFC: No < in Attribute Values] | |
[42] ETag ::= '</' Name S? '>' | |
[43] content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)* | |
[44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' [WFC: Unique Att Spec] | |
[45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' [VC: Unique Element Type Declaration] | |
[46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | children | |
[47] children ::= (choice | seq) ('?' | '*' | '+')? | |
[48] cp ::= (Name | choice | seq) ('?' | '*' | '+')? | |
[49] choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')' [VC: Proper Group/PE Nesting] | |
[50] seq ::= '(' S? cp ( S? ',' S? cp )* S? ')' [VC: Proper Group/PE Nesting] | |
[51] Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' | |
| '(' S? '#PCDATA' S? ')' [VC: Proper Group/PE Nesting] | |
[VC: No Duplicate Types] | |
[52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>' | |
[53] AttDef ::= S Name S AttType S DefaultDecl | |
[54] AttType ::= StringType | TokenizedType | EnumeratedType | |
[55] StringType ::= 'CDATA' | |
[56] TokenizedType ::= 'ID' [RHS_VC: ID] | |
[RHS_VC: One ID per Element Type] | |
[RHS_VC: ID Attribute Default] | |
| 'IDREF' [RHS_VC: IDREF] | |
| 'IDREFS' [RHS_VC: IDREF] | |
| 'ENTITY' [RHS_VC: Entity Name] | |
| 'ENTITIES' [RHS_VC: Entity Name] | |
| 'NMTOKEN' [RHS_VC: Name Token] | |
| 'NMTOKENS' [RHS_VC: Name Token] | |
[57] EnumeratedType ::= NotationType | Enumeration | |
[58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' [VC: Notation Attributes] | |
[VC: One Notation Per Element Type] | |
[VC: No Notation on Empty Element] | |
[VC: No Duplicate Tokens] | |
[59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' [VC: Enumeration] | |
[VC: No Duplicate Tokens] | |
[60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | |
| (('#FIXED' S)? AttValue) [VC: Required Attribute] | |
[VC: Attribute Default Value Syntactically Correct] | |
[WFC: No < in Attribute Values] | |
[VC: Fixed Attribute Default] | |
[WFC: No External Entity References] | |
[61] conditionalSect ::= includeSect | ignoreSect | |
[62] includeSect ::= '<![' S? 'INCLUDE' S? '[' extSubsetDecl ']]>' [VC: Proper Conditional Section/PE Nesting] | |
[63] ignoreSect ::= '<![' S? 'IGNORE' S? '[' ignoreSectContents* ']]>' [VC: Proper Conditional Section/PE Nesting] | |
[64] ignoreSectContents ::= Ignore ('<![' ignoreSectContents ']]>' Ignore)* | |
[65] Ignore ::= Char* - (Char* ('<![' | ']]>') Char*) | |
[66] CharRef ::= '&#' [0-9]+ ';' | |
| '&#x' [0-9a-fA-F]+ ';' [WFC: Legal Character] | |
[67] Reference ::= EntityRef | CharRef | |
[68] EntityRef ::= '&' Name ';' [WFC: Entity Declared] | |
[VC: Entity Declared] | |
[WFC: Parsed Entity] | |
[WFC: No Recursion] | |
[69] PEReference ::= '%' Name ';' [VC: Entity Declared] | |
[WFC: No Recursion] | |
[WFC: In DTD] | |
[70] EntityDecl ::= GEDecl | PEDecl | |
[71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>' | |
[72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>' | |
[73] EntityDef ::= EntityValue | (ExternalID NDataDecl?) | |
[74] PEDef ::= EntityValue | ExternalID | |
[75] ExternalID ::= 'SYSTEM' S SystemLiteral | |
| 'PUBLIC' S PubidLiteral S SystemLiteral | |
[76] NDataDecl ::= S 'NDATA' S Name [VC: Notation Declared] | |
[77] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>' | |
[78] extParsedEnt ::= TextDecl? content | |
[80] EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" ) | |
[81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* /* Encoding name contains only Latin characters */ | |
[82] NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>' [VC: Unique Notation Name] | |
[83] PublicID ::= 'PUBLIC' S PubidLiteral |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment