rns/sl_panda_ast_from_asf.t

## sl_panda_ast_from_asf.t
use 5.010;
use strict;
use warnings;

use Test::More tests => 3;
use Marpa::R2;

use Data::Dumper;
$Data::Dumper::Indent = 0;
$Data::Dumper::Terse = 1;
$Data::Dumper::Deepcopy = 1;

my $dsl = <<'END_OF_SOURCE';

:default ::= action => [ name, values ]
lexeme default = action => [ name, value ]

:discard ~ whitespace

S   ::= NP  VP  period

NP  ::= NN
    |   NNS
    |   DT  NN
    |   NN  NNS
    |   NNS CC NNS

VP  ::= VBZ NP
    |   VP VBZ NNS
    |   VP CC VP
    |   VP VP CC VP
    |   VBZ

CC          ~ 'and'
DT          ~ 'a' | 'an'
NN          ~ 'panda'
NNS         ~ 'shoots' | 'leaves'
VBZ         ~ 'eats' | 'shoots' | 'leaves'
period      ~ '.'
whitespace  ~ [\s]+

END_OF_SOURCE

my $sentence = 'a panda eats shoots and leaves.';

my $bracketed_expected = <<'END_OF_OUTPUT';
(S (NP (DT a) (NN panda))
  (VP
    (VP (VBZ eats) (NP (NNS shoots))) (CC and)
    (VP (VBZ leaves)))
  (. .))
(S (NP (DT a) (NN panda))
  (VP
    (VP (VBZ eats))
    (VP (VBZ shoots)) (CC and)
    (VP (VBZ leaves)))
  (. .))
(S (NP (DT a) (NN panda))
  (VP (VBZ eats) (NP (NNS shoots) (CC and) (NNS leaves)))
  (. .))
END_OF_OUTPUT

# parse $sentence
my $grammar = Marpa::R2::Scanless::G->new( { source => \$dsl, } );
my $recce = Marpa::R2::Scanless::R->new( { grammar => $grammar } );
$recce->read( \$sentence );

# get ASTs by enumerating values
my @expected_asts = ();
while ( defined( my $value_ref = $recce->value() ) ) {
    my $value = $value_ref ? ${$value_ref} : 'No parse';
    push @expected_asts, $value;
}

is  join("\n", sort map { bracket($_) } @expected_asts) . "\n",
    $bracketed_expected,
    "ASTs from value()";

# get ASTs by traversing the ASF
my $asts = [];
$recce->series_restart(); # recognizer needs to be reset before building the ASF
my $asf = Marpa::R2::ASF->new( { slr => $recce } );
$asts = $asf->traverse( {}, \&ast_builder );
@$asts = map { unpack_token_nodes($_) } @$asts;

# ASTs from the ASF have been built, but their order differs from that
# of ASTs built by enumerating the parse values. Hence, we expect that
# 6 tests must fail, and 3 trees must be the same of a total of 9
my $match = 0;
for my $i_got (0..@$asts-1){
    for my $i_expected (0..@expected_asts-1){
        $match++ if Dumper($asts->[$i_got]) eq Dumper($expected_asts[$i_expected]);
    }
}
is $match, 3, "Ambiguous English phrase: ASTs from ASF";

is join("\n", sort map { bracket($_) } @$asts) . "\n",
   $bracketed_expected,
   "Ambiguous English phrase: bracketed ASTs from ASF";

# traverser to build ASTs from the ASF
# todo: add callbacks for node construction,
# e.g. for pruning by setting node to undef
my ($token_head, $token_head_length, $token_template);
BEGIN {
    $token_head = "__TOKEN_HEAD";
    $token_head_length = length($token_head);
    $token_template = "C/A*C/A*"; # 2 length-prefixed strings: token’s name and value
}

sub ast_builder {

    # This routine converts the glade into a list of elements.  It is called recursively.
    my ($glade, $scratch) = @_;
    my $rule_id     = $glade->rule_id();
    my $symbol_id   = $glade->symbol_id();
    my $symbol_name = $grammar->symbol_name($symbol_id);

    # A token is a single choice, and we know enough to return the node
    if ( not defined $rule_id ) {
        my $literal = $glade->literal();
        # A token node need to be a single value to avoid confusing all_choices()
        # so we pack them to unpack later
        return [ $token_head . pack($token_template, $symbol_name, $literal) ];
    } ## end if ( not defined $rule_id )

    # Our result will be a list of choices
    my @return_value = ();

    CHOICE: while (1) {

        # The results at each position are a list of choices, so
        # to produce a new result list, we need to take a Cartesian
        # product of all the choices
        my @results = $glade->all_choices();

        # Special case for the start rule: return all ASTs as array refs
        if ( $symbol_name eq '[:start]' ) {
            my $asts = [ map { @{$_} } @results ];
#            say "# ASTs:\n", Dumper $asts;
            return $asts;
        }

        # Now we have a list of choices, as a list of lists.
        # Each sub list is a list of elements, which we need
        # to join into a single element.  The result will be
        # to collapse one level of lists, and leave us
        # with a list of elements
        push @return_value, map {
            # new AST node construction
            # to check for root $symbol_name eq start symbol of the grammar
            my $node = [ $symbol_name, @{$_} ];
#            say "# node:\n", Dumper $node;
#            $symbol_name eq 'NP' ? $node : undef;
        } @results;

        # Look at the next alternative in this glade, or end the
        # loop if there is none
        last CHOICE if not defined $glade->next();

    } ## end CHOICE: while (1)

    # Return the list of elements for this glade
    return \@return_value;
} ## end sub ast_builder

sub unpack_token_nodes{
    my $ast = shift;
    if ( ref $ast eq "ARRAY" ){
        for my $i (0..@$ast-1){
            my $child = $ast->[$i];
            # test if a child is a token node
            if (ref $child ne "ARRAY" and index ($child, $token_head) == 0 ){
                $child = substr $child, $token_head_length;
                $ast->[$i] = [ unpack($token_template, $child) ];
            }
        }
        map { unpack_token_nodes($_) } @$ast;
    }
    return $ast;
}

# stringify AST nodes as (tag text) per Penn Treebank

# structural tags -- need a newline
my %s_tags;
BEGIN { %s_tags = map { $_ => undef } qw{ VP period } }

sub bracket {
    my ($ast) = @_;
    my ($tag, @contents) = @{$ast};
    state $level++;
    my $bracketed = exists $s_tags{$tag} ? ("\n" . ("  " x ($level-1))) : '';
    $tag = '.' if $tag eq 'period';
    if (ref $contents[0] eq "ARRAY"){
        $bracketed .= "($tag " . join(' ', map { bracket($_) } @contents) . ")";
    }
    else {
        $bracketed .= "($tag $contents[0])";
    }
    $level--;
    $bracketed =~ s/ \n/\n/gs;
    return $bracketed;
}
	use 5.010;
	use strict;
	use warnings;

	use Test::More tests => 3;
	use Marpa::R2;

	use Data::Dumper;
	$Data::Dumper::Indent = 0;
	$Data::Dumper::Terse = 1;
	$Data::Dumper::Deepcopy = 1;

	my $dsl = <<'END_OF_SOURCE';

	:default ::= action => [ name, values ]
	lexeme default = action => [ name, value ]

	:discard ~ whitespace

	S ::= NP VP period

	NP ::= NN
	\| NNS
	\| DT NN
	\| NN NNS
	\| NNS CC NNS

	VP ::= VBZ NP
	\| VP VBZ NNS
	\| VP CC VP
	\| VP VP CC VP
	\| VBZ

	CC ~ 'and'
	DT ~ 'a' \| 'an'
	NN ~ 'panda'
	NNS ~ 'shoots' \| 'leaves'
	VBZ ~ 'eats' \| 'shoots' \| 'leaves'
	period ~ '.'
	whitespace ~ [\s]+

	END_OF_SOURCE

	my $sentence = 'a panda eats shoots and leaves.';

	my $bracketed_expected = <<'END_OF_OUTPUT';
	(S (NP (DT a) (NN panda))
	(VP
	(VP (VBZ eats) (NP (NNS shoots))) (CC and)
	(VP (VBZ leaves)))
	(. .))
	(S (NP (DT a) (NN panda))
	(VP
	(VP (VBZ eats))
	(VP (VBZ shoots)) (CC and)
	(VP (VBZ leaves)))
	(. .))
	(S (NP (DT a) (NN panda))
	(VP (VBZ eats) (NP (NNS shoots) (CC and) (NNS leaves)))
	(. .))
	END_OF_OUTPUT

	# parse $sentence
	my $grammar = Marpa::R2::Scanless::G->new( { source => \$dsl, } );
	my $recce = Marpa::R2::Scanless::R->new( { grammar => $grammar } );
	$recce->read( \$sentence );

	# get ASTs by enumerating values
	my @expected_asts = ();
	while ( defined( my $value_ref = $recce->value() ) ) {
	my $value = $value_ref ? ${$value_ref} : 'No parse';
	push @expected_asts, $value;
	}

	is join("\n", sort map { bracket($_) } @expected_asts) . "\n",
	$bracketed_expected,
	"ASTs from value()";

	# get ASTs by traversing the ASF
	my $asts = [];
	$recce->series_restart(); # recognizer needs to be reset before building the ASF
	my $asf = Marpa::R2::ASF->new( { slr => $recce } );
	$asts = $asf->traverse( {}, \&ast_builder );
	@$asts = map { unpack_token_nodes($_) } @$asts;

	# ASTs from the ASF have been built, but their order differs from that
	# of ASTs built by enumerating the parse values. Hence, we expect that
	# 6 tests must fail, and 3 trees must be the same of a total of 9
	my $match = 0;
	for my $i_got (0..@$asts-1){
	for my $i_expected (0..@expected_asts-1){
	$match++ if Dumper($asts->[$i_got]) eq Dumper($expected_asts[$i_expected]);
	}
	}
	is $match, 3, "Ambiguous English phrase: ASTs from ASF";

	is join("\n", sort map { bracket($_) } @$asts) . "\n",
	$bracketed_expected,
	"Ambiguous English phrase: bracketed ASTs from ASF";

	# traverser to build ASTs from the ASF
	# todo: add callbacks for node construction,
	# e.g. for pruning by setting node to undef
	my ($token_head, $token_head_length, $token_template);
	BEGIN {
	$token_head = "__TOKEN_HEAD";
	$token_head_length = length($token_head);
	$token_template = "C/AC/A"; # 2 length-prefixed strings: token’s name and value
	}

	sub ast_builder {

	# This routine converts the glade into a list of elements. It is called recursively.
	my ($glade, $scratch) = @_;
	my $rule_id = $glade->rule_id();
	my $symbol_id = $glade->symbol_id();
	my $symbol_name = $grammar->symbol_name($symbol_id);

	# A token is a single choice, and we know enough to return the node
	if ( not defined $rule_id ) {
	my $literal = $glade->literal();
	# A token node need to be a single value to avoid confusing all_choices()
	# so we pack them to unpack later
	return [ $token_head . pack($token_template, $symbol_name, $literal) ];
	} ## end if ( not defined $rule_id )

	# Our result will be a list of choices
	my @return_value = ();

	CHOICE: while (1) {

	# The results at each position are a list of choices, so
	# to produce a new result list, we need to take a Cartesian
	# product of all the choices
	my @results = $glade->all_choices();

	# Special case for the start rule: return all ASTs as array refs
	if ( $symbol_name eq '[:start]' ) {
	my $asts = [ map { @{$_} } @results ];
	# say "# ASTs:\n", Dumper $asts;
	return $asts;
	}

	# Now we have a list of choices, as a list of lists.
	# Each sub list is a list of elements, which we need
	# to join into a single element. The result will be
	# to collapse one level of lists, and leave us
	# with a list of elements
	push @return_value, map {
	# new AST node construction
	# to check for root $symbol_name eq start symbol of the grammar
	my $node = [ $symbol_name, @{$_} ];
	# say "# node:\n", Dumper $node;
	# $symbol_name eq 'NP' ? $node : undef;
	} @results;

	# Look at the next alternative in this glade, or end the
	# loop if there is none
	last CHOICE if not defined $glade->next();

	} ## end CHOICE: while (1)

	# Return the list of elements for this glade
	return \@return_value;
	} ## end sub ast_builder

	sub unpack_token_nodes{
	my $ast = shift;
	if ( ref $ast eq "ARRAY" ){
	for my $i (0..@$ast-1){
	my $child = $ast->[$i];
	# test if a child is a token node
	if (ref $child ne "ARRAY" and index ($child, $token_head) == 0 ){
	$child = substr $child, $token_head_length;
	$ast->[$i] = [ unpack($token_template, $child) ];
	}
	}
	map { unpack_token_nodes($_) } @$ast;
	}
	return $ast;
	}

	# stringify AST nodes as (tag text) per Penn Treebank

	# structural tags -- need a newline
	my %s_tags;
	BEGIN { %s_tags = map { $_ => undef } qw{ VP period } }

	sub bracket {
	my ($ast) = @_;
	my ($tag, @contents) = @{$ast};
	state $level++;
	my $bracketed = exists $s_tags{$tag} ? ("\n" . (" " x ($level-1))) : '';
	$tag = '.' if $tag eq 'period';
	if (ref $contents[0] eq "ARRAY"){
	$bracketed .= "($tag " . join(' ', map { bracket($_) } @contents) . ")";
	}
	else {
	$bracketed .= "($tag $contents[0])";
	}
	$level--;
	$bracketed =~ s/ \n/\n/gs;
	return $bracketed;
	}