Last active
December 27, 2015 08:38
-
-
Save hoehrmann/7297242 to your computer and use it in GitHub Desktop.
Turns ABNF grammar into a "simpler" ABNF grammar that is essentially in Greibach Normal Form plus epsilon rules so information on which non-terminals match the empty string is not lost; furthermore it only generates terminals for the form %x0000-10FFFF. Specifically, this means that the grammar contains no left recursion.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!perl -w | |
use Modern::Perl; | |
use Parse::ABNF; | |
use Data::Dumper; | |
use YAML::XS; | |
use Graph::Directed; | |
use List::OrderBy; | |
##################################################################### | |
# | |
##################################################################### | |
printf STDERR "Reading ABNF grammar from STDIN\n"; | |
my $text = join '', <>; | |
# remove some offending leading white space | |
$text =~ s/^\s+(?=[\w-]+\s*=)//mg; | |
my $rules = Parse::ABNF->new->parse($text, 0); | |
die unless $rules; | |
my $grammar = make('Grammar', $rules, start => undef); | |
my ($greibach) = grammar_to_greibach_normal_form_plus_epsilons($grammar); | |
dump_simple_grammar($greibach); | |
##################################################################### | |
# | |
##################################################################### | |
sub make { | |
my ($class, $value, %rest) = @_; | |
return { | |
class => $class, | |
value => $value, | |
%rest, | |
}; | |
} | |
sub BoundRepetition { | |
my ($p, $min, $max) = @_; | |
die unless $min; | |
die unless $max >= $min; | |
my @items; | |
for (1 .. $max) { | |
my $ref = make('Reference', undef, name => $_ - 1); | |
my $group = make('Group', [$p, $ref]); | |
push @items, make('Rule', $group, name => $_); | |
} | |
unshift @items, make('Rule', $p, name => 0); | |
my @choices = map make('Reference', undef, name => $_), $min .. $max; | |
push @items, make('Choice', \@choices); | |
return make('Grammar', \@items, start => $#items); | |
} | |
sub OneOrMore { | |
my ($p) = @_; | |
my $ref = make('Reference', undef, name => 0); | |
my $group = make('Group', [$p, $ref]); | |
my $choice = make('Choice', [$group, $p]); | |
my $r = make('Rule', $choice, name => 0); | |
return make('Grammar', [$r], start => 0); | |
} | |
sub ZeroOrMore { | |
my ($p) = @_; | |
my $empty = make('Empty', undef); | |
return make('Choice', [$empty, OneOrMore($p)]); | |
} | |
sub UnboundRepetition { | |
my ($p, $min) = @_; | |
return ZeroOrMore($p) if $min == 0; | |
return OneOrMore($p) if $min == 1; | |
my $repeat = BoundRepetition($p, $min, $min); | |
my $more = ZeroOrMore($p); | |
return make('Group', [$repeat, $more]); | |
} | |
sub Literal { | |
my ($p) = @_; | |
return make('Empty') unless length $p->{value}; | |
my @temp = map { | |
my $olc = ord lc; | |
my $ouc = ord uc; | |
my @ranges = map { | |
make('Range', undef, type => 'decimal', min => $_, max => $_); | |
} $olc, $ouc; | |
make('Choice', [@ranges]); | |
} split//, $p->{value}; | |
return make('Group', [@temp]); | |
} | |
sub String { | |
my ($p) = @_; | |
my @ranges = map { Range($_) } map { | |
make('Range', undef, type => $p->{type}, min => $_, max => $_); | |
} @{ $p->{value} }; | |
return make('Group', [@ranges]); | |
} | |
sub Range { | |
my ($p) = @_; | |
my $min = $p->{min}; | |
my $max = $p->{max}; | |
$min = hex $min if $p->{type} eq 'hex'; | |
$max = hex $max if $p->{type} eq 'hex'; | |
$min = ord pack 'b*', reverse $min if $p->{type} eq 'binary'; # LOL | |
$max = ord pack 'b*', reverse $max if $p->{type} eq 'binary'; | |
make('Range', undef, type => 'decimal', min => $min, max => $max); | |
} | |
##################################################################### | |
# | |
##################################################################### | |
sub simplify { | |
my ($g, $m, $e, $p) = @_; | |
state $counter = 1; | |
# Result contains only Rule, Group, Choice, Range, Reference, | |
# Empty, and all Group elements have no more than two children. | |
if ($p->{class} eq 'Reference') { | |
my $name = $m->{$p->{name}} // $p->{name}; | |
return make('Reference', undef, name => $name); | |
} | |
return $p if $p->{class} eq 'Empty'; | |
return Range($p) if $p->{class} eq 'Range'; | |
return String($p) if $p->{class} eq 'String'; | |
return Literal($p) if $p->{class} eq 'Literal'; | |
die "Giving up on prose value" if $p->{class} eq 'ProseValue'; | |
if ($p->{class} eq 'Repetition') { | |
return simplify($g, $m, $e, BoundRepetition($p->{value}, $p->{min}, $p->{max})) | |
if defined $p->{max}; | |
return simplify($g, $m, $e, UnboundRepetition($p->{value}, $p->{min})); | |
} | |
return make('Rule', simplify($g, $m, $e, $p->{value}), name => $m->{$p->{name}}) | |
if $p->{class} eq 'Rule'; | |
if ($p->{class} eq 'Choice') { | |
my @temp = map { simplify($g, $m, $e, $_) } @{ $p->{value} }; | |
return make('Choice', \@temp); | |
} | |
if ($p->{class} eq 'Group') { | |
my @temp = map { simplify($g, $m, $e, $_) } @{ $p->{value} }; | |
return (make('Group', \@temp)); | |
} | |
if ($p->{class} eq 'Grammar') { | |
my $m = {}; | |
# TODO: what if multiple Rule elements for a given name? | |
$m->{ $_->{name} } = "__" . $counter++ for @{ $p->{value} }; | |
my @rules = map { simplify($g, $m, $e, $_) } @{ $p->{value} }; | |
push @{ $e }, @rules; | |
return make('Reference', undef, name => $m->{$p->{start}}); | |
} | |
die $p->{class}; | |
} | |
sub simpler_grammar { | |
my ($grammar) = @_; | |
my $rules = $grammar->{value}; | |
my %name_map = map { $_->{name}, $_->{name} } @$rules; | |
my @extra_rules; | |
my @simple_rules = map { | |
simplify($grammar, \%name_map, \@extra_rules, $_) | |
} @$rules; | |
make('Grammar', [@simple_rules, @extra_rules], start => $grammar->{start}); | |
} | |
sub even_simpler_grammar { | |
my ($grammar) = simpler_grammar(shift()); | |
my $rules = $grammar->{value}; | |
my @simple_rules = map { | |
simplify2($_) | |
} @$rules; | |
simpler_grammar make('Grammar', [@simple_rules], start => $grammar->{start}); | |
} | |
sub simplify_group { | |
my ($p) = @_; | |
return $p unless $p->{class} eq 'Group'; | |
my @items = map { $_->{class} eq 'Group' ? @{ $_->{value} } : $_ } @{ $p->{value} }; | |
my @without_empty = grep { $_->{class} ne 'Empty' } @items; | |
return $items[0] unless @without_empty; | |
return make('Group', \@without_empty); | |
} | |
##################################################################### | |
# | |
##################################################################### | |
sub simplify2 { | |
my ($p) = @_; | |
for ($p->{class}) { | |
when("Rule") { | |
return make('Rule', simplify2($p->{value}), name => $p->{name}); | |
} | |
when("Choice") { | |
my @options = map { | |
make('Rule', simplify2($_), name => '__choice', combine => 'choice') | |
} @{ $p->{value} }; | |
return make('Grammar', \@options, start => '__choice'); | |
} | |
when("Group") { | |
my @items = map { simplify2($_) } @{ $p->{value} }; | |
return simplify_group make('Group', \@items); | |
} | |
when("Range") { return $p } | |
when("Reference") { return $p } | |
when("Empty") { return $p } | |
} | |
die; | |
} | |
sub grammar_to_list { | |
my ($simple) = @_; | |
my @result; | |
for my $rule (@{ $simple->{value} }) { | |
my @items = $rule->{value}->{class} eq 'Group' ? | |
@{ $rule->{value}->{value} } : $rule->{value}; | |
push @result, [ | |
$rule->{name}, \@items | |
] | |
} | |
@result; | |
} | |
##################################################################### | |
# | |
##################################################################### | |
sub greibach_helper { | |
my (@list) = @_; | |
my %numbering; | |
my %number_to_rhs; | |
my $add_rules = sub { | |
my @list = @_; | |
for my $rule (@list) { | |
my @values = @{ $rule->[1] }; | |
my @refs = grep { $_->{class} eq 'Reference' } @values; | |
my @names = ($rule->[0], map { $_->{name} } @refs); | |
for (@names) { | |
next if $numbering{$_}; | |
$numbering{$_} = 1 + keys %numbering; | |
} | |
my $number = $numbering{ $rule->[0] }; | |
push @{ $number_to_rhs{ $number } }, $rule->[1]; | |
} | |
}; | |
$add_rules->(@list); | |
# We should no longer use @list, the data is in $number_to_rhs now | |
for my $number (1 .. keys %numbering) { | |
my @rhs = @{ $number_to_rhs{ $number } }; | |
for (my $ix = 0; $ix < @rhs; ++$ix) { | |
my $rhs = $rhs[$ix]; | |
next if $rhs->[0]{class} ne 'Reference'; | |
my $number_rhs = $numbering{ $rhs->[0]{name} }; | |
next if $number_rhs >= $number; | |
my @subst = @{ $number_to_rhs{ $number_rhs } }; | |
splice @rhs, $ix--, 1, (); | |
my @rest = @$rhs; | |
my $removed = shift @rest; | |
for (@subst) { | |
push @rhs, [@$_, @rest] | |
} | |
} | |
$number_to_rhs{ $number } = \@rhs; | |
# left recursion | |
my @recursion_suffix; | |
my @recursion_other; | |
for (my $ix = 0; $ix < @rhs; ++$ix) { | |
my $rhs = $rhs[$ix]; | |
if ($rhs->[0]{class} eq 'Reference') { | |
my $number_rhs = $numbering{ $rhs->[0]{name} }; | |
if ($number_rhs == $number) { | |
my @copy = @$rhs; | |
shift @copy; | |
push @recursion_suffix, \@copy; | |
} else { | |
push @recursion_other, $rhs; | |
} | |
} else { | |
push @recursion_other, $rhs; | |
} | |
} | |
next unless @recursion_suffix; | |
my $name = sprintf "__greibach_%u", | |
1 + @new_right_recursions + keys %numbering; | |
my @subst = @recursion_other; | |
push @subst, [@$_, make('Reference', undef, name => $name)] | |
for @recursion_other; | |
$number_to_rhs{ $number } = \@subst; | |
push my @new, [@$_, make('Reference', undef, name => $name)] | |
for @recursion_suffix; | |
$add_rules->(map [ $name, $_ ], @recursion_suffix, @new); | |
} | |
# TODO: Possibly, this needs to be done in the loop above. | |
for my $number (reverse(1 .. keys %numbering)) { | |
my @rhs = @{ $number_to_rhs{ $number } }; | |
for (my $ix = 0; $ix < @rhs; ++$ix) { | |
my $rhs = $rhs[$ix]; | |
next if $rhs->[0]{class} ne 'Reference'; | |
my $number_rhs = $numbering{ $rhs->[0]{name} }; | |
my @subst = @{ $number_to_rhs{ $number_rhs } }; | |
splice @rhs, $ix--, 1, (); | |
my @rest = @$rhs; | |
my $removed = shift @rest; | |
for (@subst) { | |
push @rhs, [@$_, @rest] | |
} | |
} | |
$number_to_rhs{ $number } = \@rhs; | |
} | |
######## | |
my %numbering_to_name = map { $numbering{$_}, $_ } keys %numbering; | |
my @new_list; | |
for my $number (1 .. keys %numbering) { | |
push @new_list, [ $numbering_to_name{$number}, $_ ] | |
for @{ $number_to_rhs{$number} }; | |
} | |
return @new_list; | |
} | |
##################################################################### | |
# | |
##################################################################### | |
sub grammar_to_greibach_normal_form { | |
my ($grammar) = @_; | |
my ($chomsky, $is_nullable) = grammar_to_chomsky_normal_form($grammar); | |
my @list = grammar_to_list($chomsky); | |
my (@new_list) = greibach_helper(@list); | |
@list = @new_list; | |
my @new_rules = map { | |
make('Rule', scalar((Group(@{ $_->[1] }))), name => $_->[0]) | |
} grep { | |
1 | |
} @list; | |
my @epsilon_rules; | |
while (my ($k, $v) = each %{ $is_nullable }) { | |
next unless $v; | |
push @epsilon_rules, make('Rule', make('Empty'), name => $k); | |
} | |
my $new_grammar = make('Grammar', \@new_rules, start => $grammar->{start}); | |
return ($new_grammar, \@epsilon_rules); | |
} | |
sub grammar_to_greibach_normal_form_plus_epsilons { | |
my ($g, $nullable) = grammar_to_greibach_normal_form(@_); | |
return make('Grammar', [@{$g->{value}}, @$nullable], start => $g->{start}); | |
} | |
##################################################################### | |
# | |
##################################################################### | |
sub grammar_to_chomsky_normal_form { | |
my ($grammar) = @_; | |
my @list = grammar_to_list(even_simpler_grammar($grammar)); | |
my $counter = 1; | |
my @result; | |
my %terminal_map; | |
my @terminal_rules; | |
for my $rule (@list) { | |
my @values; | |
for my $item (@{ $rule->[1] }) { | |
if ($item->{class} ne 'Range') { | |
push @values, $item; | |
next; | |
} | |
my $key = sprintf "#%04x-%04x", $item->{min}, $item->{max}; | |
unless ($terminal_map{$key}) { | |
$terminal_map{$key} = make('Reference', undef, name => sprintf("__cnf_%u", $counter++)); | |
push @terminal_rules, [ | |
$terminal_map{$key}->{name}, | |
[ $item ] | |
]; | |
} | |
push @values, $terminal_map{$key}; | |
} | |
$rule->[1] = \@values; | |
} | |
push @list, @terminal_rules; | |
# Allow only two non-terminals on the right side | |
my @contractions; | |
for my $rule (@list) { | |
my @values = @{ $rule->[1] }; | |
next if @values <= 2; | |
while (@values >= 2) { | |
my $x = shift @values; | |
my $y = shift @values; | |
my $ref = make('Reference', undef, name => sprintf("__cnf_%u", $counter++)); | |
unshift @values, $ref; | |
push @contractions, [ $ref->{name}, [ $x, $y ] ]; | |
} | |
$rule->[1] = \@values; | |
} | |
push @list, @contractions; | |
my @nullables = grep { @{$_->[1]} == 1 and $_->[1][0]{class} eq 'Empty' } @list; | |
my %is_nullable = map { $_->[0], 1 } @nullables; | |
use List::MoreUtils qw/all/; | |
my $changed = 0; | |
while (1) { | |
$changed = 0; | |
for my $rule (@list) { | |
next unless $rule->[1][0]->{class} eq 'Reference'; | |
next unless all { $is_nullable{$_->{name}} } @{ $rule->[1] }; | |
next if $is_nullable{ $rule->[0] }; | |
$is_nullable{ $rule->[0] }++; | |
$changed++; | |
} | |
last unless $changed; | |
} | |
my @epsilon_replacements; | |
for my $rule (@list) { | |
my @values = @{ $rule->[1] }; | |
next unless @values == 2; | |
if ($is_nullable{ $values[0]->{name} }) { | |
push @epsilon_replacements, [ | |
$rule->[0], [$values[1]] | |
]; | |
} | |
if ($is_nullable{ $values[1]->{name} }) { | |
push @epsilon_replacements, [ | |
$rule->[0], [$values[0]] | |
]; | |
} | |
} | |
# Throughout the code we assume that Empty is alone | |
@list = grep { $_->[1][0]{class} ne 'Empty'; } @list; | |
push @list, @epsilon_replacements; | |
my $graph = Graph::Directed->new; | |
for my $rule (@list) { | |
my @values = @{ $rule->[1] }; | |
if (@values == 1 and $values[0]->{class} eq 'Reference') { | |
$graph->add_edge($rule->[0], $values[0]->{name}); | |
} | |
} | |
my %rename_map; | |
while (1) { | |
# TODO: use strongly_connected_components instead | |
my (@cycle) = $graph->find_a_cycle; | |
last unless @cycle; | |
my $name = shift @cycle; | |
for my $vertex (@cycle) { | |
$rename_map{$vertex} = $name; | |
$graph->add_edge($name, $_->[1]) for $graph->edges_from($_); | |
$graph->add_edge($_->[1], $name) for $graph->edges_to($_); | |
} | |
} | |
for my $rule (@list) { | |
my @values = @{ $rule->[1] }; | |
$rule->[0] = $rename_map{$rule->[0]} // $rule->[0]; | |
my @new_values; | |
for my $item (@values) { | |
if ($item->{class} eq 'Reference') { | |
if (defined $rename_map{ $item->{name} }) { | |
push @new_values, make('Reference', undef, name => | |
$rename_map{ $item->{name} }); | |
next; | |
} | |
} | |
push @new_values, $item; | |
} | |
$rule->[1] = \@new_values; | |
} | |
my %name_to_rhs; | |
for my $rule (@list) { | |
push @{ $name_to_rhs{$rule->[0]} }, $rule->[1]; | |
} | |
for my $v (reverse $graph->toposort) { | |
for my $e ($graph->edges_from($v)) { | |
next unless $name_to_rhs{$e->[1]}; | |
my @rhs = @{ $name_to_rhs{$e->[1]} }; | |
push @list, [ $v, [@$_] ] for @rhs; | |
push @{ $name_to_rhs{$v} }, $_ for @rhs; | |
} | |
} | |
my @new_rules = map { | |
make('Rule', Group(@{ $_->[1] }), name => $_->[0]) | |
} grep { | |
@{ $_->[1] } != 1 or $_->[1][0]{class} ne 'Reference'; | |
} @list; | |
my $new_grammar = make('Grammar', \@new_rules, start => $grammar->{start}); | |
return ($new_grammar, \%is_nullable); | |
} | |
sub Group { | |
return $_[0] if @_ == 1; | |
return simplify_group make('Group', [@_]); | |
} | |
sub dump_simple_grammar { | |
my ($simple) = @_; | |
my %seen; | |
for my $rule (order_by { $_->{name} =~ /^__/ } @{ $simple->{value} }) { | |
my @items = $rule->{value}->{class} eq 'Group' ? | |
@{ $rule->{value}->{value} } : $rule->{value}; | |
my $fix_name = sub { | |
my $name = shift; | |
$name =~ s/^__/auto--/; | |
return $name; | |
}; | |
my $item_serializer = sub { | |
my $item = shift; | |
for ($item->{class}) { | |
when("Range") { | |
return sprintf "%%x%04x-%04x", $item->{min}, $item->{max}; | |
} | |
when("Reference") { | |
return sprintf "%s", $fix_name->($item->{name}); | |
} | |
when("Empty") { | |
return sprintf '""'; | |
} | |
default { | |
warn Dumper { rule => $rule, item => $item }; | |
} | |
} | |
}; | |
my $name = $fix_name->($rule->{name}); | |
printf "%-30s =%1s %s\n", $name, ($seen{$name}++ ? '/' : ''), join(' ', | |
map { $item_serializer->($_) } @items); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment