Skip to content

Instantly share code, notes, and snippets.

@hoehrmann
Last active December 27, 2015 08:38
Show Gist options
  • Save hoehrmann/7297242 to your computer and use it in GitHub Desktop.
Save hoehrmann/7297242 to your computer and use it in GitHub Desktop.
Turns ABNF grammar into a "simpler" ABNF grammar that is essentially in Greibach Normal Form plus epsilon rules so information on which non-terminals match the empty string is not lost; furthermore it only generates terminals for the form %x0000-10FFFF. Specifically, this means that the grammar contains no left recursion.
#!perl -w
use Modern::Perl;
use Parse::ABNF;
use Data::Dumper;
use YAML::XS;
use Graph::Directed;
use List::OrderBy;
#####################################################################
#
#####################################################################
printf STDERR "Reading ABNF grammar from STDIN\n";
my $text = join '', <>;
# remove some offending leading white space
$text =~ s/^\s+(?=[\w-]+\s*=)//mg;
my $rules = Parse::ABNF->new->parse($text, 0);
die unless $rules;
my $grammar = make('Grammar', $rules, start => undef);
my ($greibach) = grammar_to_greibach_normal_form_plus_epsilons($grammar);
dump_simple_grammar($greibach);
#####################################################################
#
#####################################################################
sub make {
my ($class, $value, %rest) = @_;
return {
class => $class,
value => $value,
%rest,
};
}
sub BoundRepetition {
my ($p, $min, $max) = @_;
die unless $min;
die unless $max >= $min;
my @items;
for (1 .. $max) {
my $ref = make('Reference', undef, name => $_ - 1);
my $group = make('Group', [$p, $ref]);
push @items, make('Rule', $group, name => $_);
}
unshift @items, make('Rule', $p, name => 0);
my @choices = map make('Reference', undef, name => $_), $min .. $max;
push @items, make('Choice', \@choices);
return make('Grammar', \@items, start => $#items);
}
sub OneOrMore {
my ($p) = @_;
my $ref = make('Reference', undef, name => 0);
my $group = make('Group', [$p, $ref]);
my $choice = make('Choice', [$group, $p]);
my $r = make('Rule', $choice, name => 0);
return make('Grammar', [$r], start => 0);
}
sub ZeroOrMore {
my ($p) = @_;
my $empty = make('Empty', undef);
return make('Choice', [$empty, OneOrMore($p)]);
}
sub UnboundRepetition {
my ($p, $min) = @_;
return ZeroOrMore($p) if $min == 0;
return OneOrMore($p) if $min == 1;
my $repeat = BoundRepetition($p, $min, $min);
my $more = ZeroOrMore($p);
return make('Group', [$repeat, $more]);
}
sub Literal {
my ($p) = @_;
return make('Empty') unless length $p->{value};
my @temp = map {
my $olc = ord lc;
my $ouc = ord uc;
my @ranges = map {
make('Range', undef, type => 'decimal', min => $_, max => $_);
} $olc, $ouc;
make('Choice', [@ranges]);
} split//, $p->{value};
return make('Group', [@temp]);
}
sub String {
my ($p) = @_;
my @ranges = map { Range($_) } map {
make('Range', undef, type => $p->{type}, min => $_, max => $_);
} @{ $p->{value} };
return make('Group', [@ranges]);
}
sub Range {
my ($p) = @_;
my $min = $p->{min};
my $max = $p->{max};
$min = hex $min if $p->{type} eq 'hex';
$max = hex $max if $p->{type} eq 'hex';
$min = ord pack 'b*', reverse $min if $p->{type} eq 'binary'; # LOL
$max = ord pack 'b*', reverse $max if $p->{type} eq 'binary';
make('Range', undef, type => 'decimal', min => $min, max => $max);
}
#####################################################################
#
#####################################################################
sub simplify {
my ($g, $m, $e, $p) = @_;
state $counter = 1;
# Result contains only Rule, Group, Choice, Range, Reference,
# Empty, and all Group elements have no more than two children.
if ($p->{class} eq 'Reference') {
my $name = $m->{$p->{name}} // $p->{name};
return make('Reference', undef, name => $name);
}
return $p if $p->{class} eq 'Empty';
return Range($p) if $p->{class} eq 'Range';
return String($p) if $p->{class} eq 'String';
return Literal($p) if $p->{class} eq 'Literal';
die "Giving up on prose value" if $p->{class} eq 'ProseValue';
if ($p->{class} eq 'Repetition') {
return simplify($g, $m, $e, BoundRepetition($p->{value}, $p->{min}, $p->{max}))
if defined $p->{max};
return simplify($g, $m, $e, UnboundRepetition($p->{value}, $p->{min}));
}
return make('Rule', simplify($g, $m, $e, $p->{value}), name => $m->{$p->{name}})
if $p->{class} eq 'Rule';
if ($p->{class} eq 'Choice') {
my @temp = map { simplify($g, $m, $e, $_) } @{ $p->{value} };
return make('Choice', \@temp);
}
if ($p->{class} eq 'Group') {
my @temp = map { simplify($g, $m, $e, $_) } @{ $p->{value} };
return (make('Group', \@temp));
}
if ($p->{class} eq 'Grammar') {
my $m = {};
# TODO: what if multiple Rule elements for a given name?
$m->{ $_->{name} } = "__" . $counter++ for @{ $p->{value} };
my @rules = map { simplify($g, $m, $e, $_) } @{ $p->{value} };
push @{ $e }, @rules;
return make('Reference', undef, name => $m->{$p->{start}});
}
die $p->{class};
}
sub simpler_grammar {
my ($grammar) = @_;
my $rules = $grammar->{value};
my %name_map = map { $_->{name}, $_->{name} } @$rules;
my @extra_rules;
my @simple_rules = map {
simplify($grammar, \%name_map, \@extra_rules, $_)
} @$rules;
make('Grammar', [@simple_rules, @extra_rules], start => $grammar->{start});
}
sub even_simpler_grammar {
my ($grammar) = simpler_grammar(shift());
my $rules = $grammar->{value};
my @simple_rules = map {
simplify2($_)
} @$rules;
simpler_grammar make('Grammar', [@simple_rules], start => $grammar->{start});
}
sub simplify_group {
my ($p) = @_;
return $p unless $p->{class} eq 'Group';
my @items = map { $_->{class} eq 'Group' ? @{ $_->{value} } : $_ } @{ $p->{value} };
my @without_empty = grep { $_->{class} ne 'Empty' } @items;
return $items[0] unless @without_empty;
return make('Group', \@without_empty);
}
#####################################################################
#
#####################################################################
sub simplify2 {
my ($p) = @_;
for ($p->{class}) {
when("Rule") {
return make('Rule', simplify2($p->{value}), name => $p->{name});
}
when("Choice") {
my @options = map {
make('Rule', simplify2($_), name => '__choice', combine => 'choice')
} @{ $p->{value} };
return make('Grammar', \@options, start => '__choice');
}
when("Group") {
my @items = map { simplify2($_) } @{ $p->{value} };
return simplify_group make('Group', \@items);
}
when("Range") { return $p }
when("Reference") { return $p }
when("Empty") { return $p }
}
die;
}
sub grammar_to_list {
my ($simple) = @_;
my @result;
for my $rule (@{ $simple->{value} }) {
my @items = $rule->{value}->{class} eq 'Group' ?
@{ $rule->{value}->{value} } : $rule->{value};
push @result, [
$rule->{name}, \@items
]
}
@result;
}
#####################################################################
#
#####################################################################
sub greibach_helper {
my (@list) = @_;
my %numbering;
my %number_to_rhs;
my $add_rules = sub {
my @list = @_;
for my $rule (@list) {
my @values = @{ $rule->[1] };
my @refs = grep { $_->{class} eq 'Reference' } @values;
my @names = ($rule->[0], map { $_->{name} } @refs);
for (@names) {
next if $numbering{$_};
$numbering{$_} = 1 + keys %numbering;
}
my $number = $numbering{ $rule->[0] };
push @{ $number_to_rhs{ $number } }, $rule->[1];
}
};
$add_rules->(@list);
# We should no longer use @list, the data is in $number_to_rhs now
for my $number (1 .. keys %numbering) {
my @rhs = @{ $number_to_rhs{ $number } };
for (my $ix = 0; $ix < @rhs; ++$ix) {
my $rhs = $rhs[$ix];
next if $rhs->[0]{class} ne 'Reference';
my $number_rhs = $numbering{ $rhs->[0]{name} };
next if $number_rhs >= $number;
my @subst = @{ $number_to_rhs{ $number_rhs } };
splice @rhs, $ix--, 1, ();
my @rest = @$rhs;
my $removed = shift @rest;
for (@subst) {
push @rhs, [@$_, @rest]
}
}
$number_to_rhs{ $number } = \@rhs;
# left recursion
my @recursion_suffix;
my @recursion_other;
for (my $ix = 0; $ix < @rhs; ++$ix) {
my $rhs = $rhs[$ix];
if ($rhs->[0]{class} eq 'Reference') {
my $number_rhs = $numbering{ $rhs->[0]{name} };
if ($number_rhs == $number) {
my @copy = @$rhs;
shift @copy;
push @recursion_suffix, \@copy;
} else {
push @recursion_other, $rhs;
}
} else {
push @recursion_other, $rhs;
}
}
next unless @recursion_suffix;
my $name = sprintf "__greibach_%u",
1 + @new_right_recursions + keys %numbering;
my @subst = @recursion_other;
push @subst, [@$_, make('Reference', undef, name => $name)]
for @recursion_other;
$number_to_rhs{ $number } = \@subst;
push my @new, [@$_, make('Reference', undef, name => $name)]
for @recursion_suffix;
$add_rules->(map [ $name, $_ ], @recursion_suffix, @new);
}
# TODO: Possibly, this needs to be done in the loop above.
for my $number (reverse(1 .. keys %numbering)) {
my @rhs = @{ $number_to_rhs{ $number } };
for (my $ix = 0; $ix < @rhs; ++$ix) {
my $rhs = $rhs[$ix];
next if $rhs->[0]{class} ne 'Reference';
my $number_rhs = $numbering{ $rhs->[0]{name} };
my @subst = @{ $number_to_rhs{ $number_rhs } };
splice @rhs, $ix--, 1, ();
my @rest = @$rhs;
my $removed = shift @rest;
for (@subst) {
push @rhs, [@$_, @rest]
}
}
$number_to_rhs{ $number } = \@rhs;
}
########
my %numbering_to_name = map { $numbering{$_}, $_ } keys %numbering;
my @new_list;
for my $number (1 .. keys %numbering) {
push @new_list, [ $numbering_to_name{$number}, $_ ]
for @{ $number_to_rhs{$number} };
}
return @new_list;
}
#####################################################################
#
#####################################################################
sub grammar_to_greibach_normal_form {
my ($grammar) = @_;
my ($chomsky, $is_nullable) = grammar_to_chomsky_normal_form($grammar);
my @list = grammar_to_list($chomsky);
my (@new_list) = greibach_helper(@list);
@list = @new_list;
my @new_rules = map {
make('Rule', scalar((Group(@{ $_->[1] }))), name => $_->[0])
} grep {
1
} @list;
my @epsilon_rules;
while (my ($k, $v) = each %{ $is_nullable }) {
next unless $v;
push @epsilon_rules, make('Rule', make('Empty'), name => $k);
}
my $new_grammar = make('Grammar', \@new_rules, start => $grammar->{start});
return ($new_grammar, \@epsilon_rules);
}
sub grammar_to_greibach_normal_form_plus_epsilons {
my ($g, $nullable) = grammar_to_greibach_normal_form(@_);
return make('Grammar', [@{$g->{value}}, @$nullable], start => $g->{start});
}
#####################################################################
#
#####################################################################
sub grammar_to_chomsky_normal_form {
my ($grammar) = @_;
my @list = grammar_to_list(even_simpler_grammar($grammar));
my $counter = 1;
my @result;
my %terminal_map;
my @terminal_rules;
for my $rule (@list) {
my @values;
for my $item (@{ $rule->[1] }) {
if ($item->{class} ne 'Range') {
push @values, $item;
next;
}
my $key = sprintf "#%04x-%04x", $item->{min}, $item->{max};
unless ($terminal_map{$key}) {
$terminal_map{$key} = make('Reference', undef, name => sprintf("__cnf_%u", $counter++));
push @terminal_rules, [
$terminal_map{$key}->{name},
[ $item ]
];
}
push @values, $terminal_map{$key};
}
$rule->[1] = \@values;
}
push @list, @terminal_rules;
# Allow only two non-terminals on the right side
my @contractions;
for my $rule (@list) {
my @values = @{ $rule->[1] };
next if @values <= 2;
while (@values >= 2) {
my $x = shift @values;
my $y = shift @values;
my $ref = make('Reference', undef, name => sprintf("__cnf_%u", $counter++));
unshift @values, $ref;
push @contractions, [ $ref->{name}, [ $x, $y ] ];
}
$rule->[1] = \@values;
}
push @list, @contractions;
my @nullables = grep { @{$_->[1]} == 1 and $_->[1][0]{class} eq 'Empty' } @list;
my %is_nullable = map { $_->[0], 1 } @nullables;
use List::MoreUtils qw/all/;
my $changed = 0;
while (1) {
$changed = 0;
for my $rule (@list) {
next unless $rule->[1][0]->{class} eq 'Reference';
next unless all { $is_nullable{$_->{name}} } @{ $rule->[1] };
next if $is_nullable{ $rule->[0] };
$is_nullable{ $rule->[0] }++;
$changed++;
}
last unless $changed;
}
my @epsilon_replacements;
for my $rule (@list) {
my @values = @{ $rule->[1] };
next unless @values == 2;
if ($is_nullable{ $values[0]->{name} }) {
push @epsilon_replacements, [
$rule->[0], [$values[1]]
];
}
if ($is_nullable{ $values[1]->{name} }) {
push @epsilon_replacements, [
$rule->[0], [$values[0]]
];
}
}
# Throughout the code we assume that Empty is alone
@list = grep { $_->[1][0]{class} ne 'Empty'; } @list;
push @list, @epsilon_replacements;
my $graph = Graph::Directed->new;
for my $rule (@list) {
my @values = @{ $rule->[1] };
if (@values == 1 and $values[0]->{class} eq 'Reference') {
$graph->add_edge($rule->[0], $values[0]->{name});
}
}
my %rename_map;
while (1) {
# TODO: use strongly_connected_components instead
my (@cycle) = $graph->find_a_cycle;
last unless @cycle;
my $name = shift @cycle;
for my $vertex (@cycle) {
$rename_map{$vertex} = $name;
$graph->add_edge($name, $_->[1]) for $graph->edges_from($_);
$graph->add_edge($_->[1], $name) for $graph->edges_to($_);
}
}
for my $rule (@list) {
my @values = @{ $rule->[1] };
$rule->[0] = $rename_map{$rule->[0]} // $rule->[0];
my @new_values;
for my $item (@values) {
if ($item->{class} eq 'Reference') {
if (defined $rename_map{ $item->{name} }) {
push @new_values, make('Reference', undef, name =>
$rename_map{ $item->{name} });
next;
}
}
push @new_values, $item;
}
$rule->[1] = \@new_values;
}
my %name_to_rhs;
for my $rule (@list) {
push @{ $name_to_rhs{$rule->[0]} }, $rule->[1];
}
for my $v (reverse $graph->toposort) {
for my $e ($graph->edges_from($v)) {
next unless $name_to_rhs{$e->[1]};
my @rhs = @{ $name_to_rhs{$e->[1]} };
push @list, [ $v, [@$_] ] for @rhs;
push @{ $name_to_rhs{$v} }, $_ for @rhs;
}
}
my @new_rules = map {
make('Rule', Group(@{ $_->[1] }), name => $_->[0])
} grep {
@{ $_->[1] } != 1 or $_->[1][0]{class} ne 'Reference';
} @list;
my $new_grammar = make('Grammar', \@new_rules, start => $grammar->{start});
return ($new_grammar, \%is_nullable);
}
sub Group {
return $_[0] if @_ == 1;
return simplify_group make('Group', [@_]);
}
sub dump_simple_grammar {
my ($simple) = @_;
my %seen;
for my $rule (order_by { $_->{name} =~ /^__/ } @{ $simple->{value} }) {
my @items = $rule->{value}->{class} eq 'Group' ?
@{ $rule->{value}->{value} } : $rule->{value};
my $fix_name = sub {
my $name = shift;
$name =~ s/^__/auto--/;
return $name;
};
my $item_serializer = sub {
my $item = shift;
for ($item->{class}) {
when("Range") {
return sprintf "%%x%04x-%04x", $item->{min}, $item->{max};
}
when("Reference") {
return sprintf "%s", $fix_name->($item->{name});
}
when("Empty") {
return sprintf '""';
}
default {
warn Dumper { rule => $rule, item => $item };
}
}
};
my $name = $fix_name->($rule->{name});
printf "%-30s =%1s %s\n", $name, ($seen{$name}++ ? '/' : ''), join(' ',
map { $item_serializer->($_) } @items);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment