rns/sl_timeflies_sort_parses.t

## sl_timeflies_sort_parses.t
#!/usr/bin/perl
# Copyright 2013 Jeffrey Kegler, part of Marpa::R2.  Marpa::R2 is free software: you can
# redistribute it and/or modify it under the terms of the GNU Lesser
# General Public License as published by the Free Software Foundation,
# either version 3 of the License, or (at your option) any later version.
#
# Marpa::R2 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser
# General Public License along with Marpa::R2.  If not, see
# http://www.gnu.org/licenses/.

# This example is from Ralf Muschall, who clearly knows English
# grammar better than most native speakers.  I've reworked the
# terminology to follow _A Comprehensive Grammar of the English
# Language_, by Quirk, Greenbaum, Leech and Svartvik.  My edition
# was the "Seventh (corrected) impression 1989".
#
# When it is not a verb, I treat "like"
# as a preposition in an adjunct of manner,
# as per 8.79, p. 557; 9.4, pp. 661; and 9.48, pp. 698-699.
#
# The saying "time flies like an arrow; fruit flies like a banana"
# is attributed to Groucho Marx, but there is no reason to believe
# he ever said it.  Apparently, the saying
# first appeared on the Usenet on net.jokes in 1982.
# I've documented this whole thing on Wikipedia:
# http://en.wikipedia.org/wiki/Time_flies_like_an_arrow
#
# The permalink is:
# http://en.wikipedia.org/w/index.php?title=Time_flies_like_an_arrow&oldid=311163283

use 5.010;
use strict;
use warnings;
use English qw( -no_match_vars );

use Test::More tests => 1;
use lib 'inc';
use Marpa::R2::Test;
use Marpa::R2;

## no critic (Subroutines::RequireArgUnpacking)

sub do_paragraph         { shift; \@_ }
sub do_sva_sentence      { return "sva($_[1];$_[2];$_[3])" }
sub do_svo_sentence      { return "svo($_[1];$_[2];$_[3])" }
sub do_nominal_adjunct   { return "adju($_[1];$_[2])" }
sub do_adverbial_adjunct { return "adju($_[1])" }
sub do_adjective         { return "adje($_[1])" }
sub do_qualified_subject { return "s($_[1];$_[2])" }
sub do_bare_subject      { return "s($_[1])" }
sub do_noun              { return "n($_[1])" }
sub do_verb              { return "v($_[1])" }
sub do_adverb            { return "adv($_[1])" }
sub do_object            { return "o($_[1];$_[2])" }
sub do_article           { return "art($_[1])" }
sub do_preposition       { return "pr($_[1])" }

## use critic

my $grammar = Marpa::R2::Scanless::G->new(
    {
    source          => \(<<'END_OF_SOURCE'),

paragraph   ::= sentence+                     action => do_paragraph
sentence    ::= subject verb adjunct period   action => do_sva_sentence
sentence    ::= subject verb object period    action => do_svo_sentence
adjunct     ::= preposition object            action => do_nominal_adjunct
adjunct     ::= adverb                        action => do_adverbial_adjunct
adjective   ::= adjective_noun_lex            action => do_adjective
subject     ::= adjective noun                action => do_qualified_subject
subject     ::= noun                          action => do_bare_subject
noun        ::= adjective_noun_lex            action => do_noun
adverb      ::= adverb_lex                    action => do_adverb
verb        ::= verb_lex                      action => do_verb
object      ::= article noun                  action => do_object
article     ::= article_lex                   action => do_article
preposition ::= preposition_lex               action => do_preposition

preposition_lex ~ unicorn
verb_lex ~ unicorn
adjective_noun_lex ~ unicorn
article_lex ~ unicorn
adverb_lex ~ unicorn
period ~ unicorn

unicorn ~ [^\s\S]

END_OF_SOURCE
    }
);

# this parse has the lowest number of unique constituents (subjects, verbs, objects)
my $expected = <<'EOS';
sva(s(n(Time));v(flies);adju(pr(like);o(art(an);n(arrow))))
sva(s(n(Time));v(flies);adju(adv(fast)))
svo(s(adje(Fruit);n(flies));v(like);o(art(a);n(banana)))
svo(s(adje(Fruit);n(flies));v(spoil);o(art(a);n(banana)))
EOS
my @actual = ();

# setup { word => lex class } mapping
my %lexical_class = (
    'preposition_lex'    => 'like',
    'verb_lex'           => 'like flies spoil',
    'adjective_noun_lex' => 'fruit banana time arrow flies',
    'article_lex'        => 'a an',
    'adverb_lex'         => 'fast',
    'period'             => '.'
);
my %vocabulary = ();
while ( my ( $lexical_class, $words ) = each %lexical_class ) {
    for my $word ( split q{ }, $words ) {
        push @{ $vocabulary{$word} }, $lexical_class;
    }
}

my $recce = Marpa::R2::Scanless::R->new( {
    grammar => $grammar ,
    semantics_package => 'main',
} );
die 'Failed to create recognizer' if not $recce;

my $data = <<END_OF_DATA;
Time flies like an arrow.
Time flies fast.
Fruit flies like a banana.
Fruit flies spoil a banana.
END_OF_DATA

# external parsing
my $start = 0;
$recce->read( \$data, 0, 0 );

# split to words preserving punctuation
my @words = grep { $_ } split /\s+|(\.)/m, $data;
for my $word ( @words ) {
    for my $type ( @{ $vocabulary{lc $word} } ) {
#        say "'$word':$type @ $start";
        eval { $recce->lexeme_alternative( $type, $word ) };
        die $@, $recce->show_progress(), ' at ', __LINE__ if $@;
    }
    eval { $recce->lexeme_complete( $start, length $word ) };
    die $@, $recce->show_progress(), ' at ', __LINE__ if $@;

    $start += length ($word);
} ## end for my $word ( split q{ }, $data )

# get evaluated parses
my @parses;
while ( defined( my $value_ref = $recce->value() ) ) {
    my $value = $value_ref ? ${$value_ref} : 'No parse';
    push @parses, $value;
}

# descending sorting of parses by the number of unique constituents (subjects, verbs, and objects)
@parses = sort {
    my @c;
    for my $parse ($a, $b){
        my $c = 0;
        my %uniq_constituents;
        for my $s (@$parse){
            if ( $s =~ /s\((.+?)\);v\((.+?)\);(o|adju)\((.+?)\)\)$/ ) {
                $uniq_constituents{$_}++ for ($1, $2, $4);
            }
            else{
                die "Can't get constituents from $s";
            }
        }
        $c = keys %uniq_constituents;
        push @c, $c;
    }
#    say join ' ', @c, "\n";
    $c[0] <=> $c[1];
} @parses;

#say join "\n", map { join ("\n", @$_) . "\n" } @parses;

Marpa::R2::Test::is(
    join ("\n", @{$parses[0]}) . "\n",
    $expected,
    'Parses of ambiguous English sentences sorted by constituent count'
);

1;    # In case used as "do" file
	#!/usr/bin/perl
	# Copyright 2013 Jeffrey Kegler, part of Marpa::R2. Marpa::R2 is free software: you can
	# redistribute it and/or modify it under the terms of the GNU Lesser
	# General Public License as published by the Free Software Foundation,
	# either version 3 of the License, or (at your option) any later version.
	#
	# Marpa::R2 is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	# Lesser General Public License for more details.
	#
	# You should have received a copy of the GNU Lesser
	# General Public License along with Marpa::R2. If not, see
	# http://www.gnu.org/licenses/.

	# This example is from Ralf Muschall, who clearly knows English
	# grammar better than most native speakers. I've reworked the
	# terminology to follow _A Comprehensive Grammar of the English
	# Language_, by Quirk, Greenbaum, Leech and Svartvik. My edition
	# was the "Seventh (corrected) impression 1989".
	#
	# When it is not a verb, I treat "like"
	# as a preposition in an adjunct of manner,
	# as per 8.79, p. 557; 9.4, pp. 661; and 9.48, pp. 698-699.
	#
	# The saying "time flies like an arrow; fruit flies like a banana"
	# is attributed to Groucho Marx, but there is no reason to believe
	# he ever said it. Apparently, the saying
	# first appeared on the Usenet on net.jokes in 1982.
	# I've documented this whole thing on Wikipedia:
	# http://en.wikipedia.org/wiki/Time_flies_like_an_arrow
	#
	# The permalink is:
	# http://en.wikipedia.org/w/index.php?title=Time_flies_like_an_arrow&oldid=311163283

	use 5.010;
	use strict;
	use warnings;
	use English qw( -no_match_vars );

	use Test::More tests => 1;
	use lib 'inc';
	use Marpa::R2::Test;
	use Marpa::R2;

	## no critic (Subroutines::RequireArgUnpacking)

	sub do_paragraph { shift; \@_ }
	sub do_sva_sentence { return "sva($_[1];$_[2];$_[3])" }
	sub do_svo_sentence { return "svo($_[1];$_[2];$_[3])" }
	sub do_nominal_adjunct { return "adju($_[1];$_[2])" }
	sub do_adverbial_adjunct { return "adju($_[1])" }
	sub do_adjective { return "adje($_[1])" }
	sub do_qualified_subject { return "s($_[1];$_[2])" }
	sub do_bare_subject { return "s($_[1])" }
	sub do_noun { return "n($_[1])" }
	sub do_verb { return "v($_[1])" }
	sub do_adverb { return "adv($_[1])" }
	sub do_object { return "o($_[1];$_[2])" }
	sub do_article { return "art($_[1])" }
	sub do_preposition { return "pr($_[1])" }

	## use critic

	my $grammar = Marpa::R2::Scanless::G->new(
	{
	source => \(<<'END_OF_SOURCE'),

	paragraph ::= sentence+ action => do_paragraph
	sentence ::= subject verb adjunct period action => do_sva_sentence
	sentence ::= subject verb object period action => do_svo_sentence
	adjunct ::= preposition object action => do_nominal_adjunct
	adjunct ::= adverb action => do_adverbial_adjunct
	adjective ::= adjective_noun_lex action => do_adjective
	subject ::= adjective noun action => do_qualified_subject
	subject ::= noun action => do_bare_subject
	noun ::= adjective_noun_lex action => do_noun
	adverb ::= adverb_lex action => do_adverb
	verb ::= verb_lex action => do_verb
	object ::= article noun action => do_object
	article ::= article_lex action => do_article
	preposition ::= preposition_lex action => do_preposition

	preposition_lex ~ unicorn
	verb_lex ~ unicorn
	adjective_noun_lex ~ unicorn
	article_lex ~ unicorn
	adverb_lex ~ unicorn
	period ~ unicorn

	unicorn ~ [^\s\S]

	END_OF_SOURCE
	}
	);

	# this parse has the lowest number of unique constituents (subjects, verbs, objects)
	my $expected = <<'EOS';
	sva(s(n(Time));v(flies);adju(pr(like);o(art(an);n(arrow))))
	sva(s(n(Time));v(flies);adju(adv(fast)))
	svo(s(adje(Fruit);n(flies));v(like);o(art(a);n(banana)))
	svo(s(adje(Fruit);n(flies));v(spoil);o(art(a);n(banana)))
	EOS
	my @actual = ();

	# setup { word => lex class } mapping
	my %lexical_class = (
	'preposition_lex' => 'like',
	'verb_lex' => 'like flies spoil',
	'adjective_noun_lex' => 'fruit banana time arrow flies',
	'article_lex' => 'a an',
	'adverb_lex' => 'fast',
	'period' => '.'
	);
	my %vocabulary = ();
	while ( my ( $lexical_class, $words ) = each %lexical_class ) {
	for my $word ( split q{ }, $words ) {
	push @{ $vocabulary{$word} }, $lexical_class;
	}
	}

	my $recce = Marpa::R2::Scanless::R->new( {
	grammar => $grammar ,
	semantics_package => 'main',
	} );
	die 'Failed to create recognizer' if not $recce;

	my $data = <<END_OF_DATA;
	Time flies like an arrow.
	Time flies fast.
	Fruit flies like a banana.
	Fruit flies spoil a banana.
	END_OF_DATA

	# external parsing
	my $start = 0;
	$recce->read( \$data, 0, 0 );

	# split to words preserving punctuation
	my @words = grep { $_ } split /\s+\|(\.)/m, $data;
	for my $word ( @words ) {
	for my $type ( @{ $vocabulary{lc $word} } ) {
	# say "'$word':$type @ $start";
	eval { $recce->lexeme_alternative( $type, $word ) };
	die $@, $recce->show_progress(), ' at ', __LINE__ if $@;
	}
	eval { $recce->lexeme_complete( $start, length $word ) };
	die $@, $recce->show_progress(), ' at ', __LINE__ if $@;

	$start += length ($word);
	} ## end for my $word ( split q{ }, $data )

	# get evaluated parses
	my @parses;
	while ( defined( my $value_ref = $recce->value() ) ) {
	my $value = $value_ref ? ${$value_ref} : 'No parse';
	push @parses, $value;
	}

	# descending sorting of parses by the number of unique constituents (subjects, verbs, and objects)
	@parses = sort {
	my @c;
	for my $parse ($a, $b){
	my $c = 0;
	my %uniq_constituents;
	for my $s (@$parse){
	if ( $s =~ /s\((.+?)\);v\((.+?)\);(o\|adju)\((.+?)\)\)$/ ) {
	$uniq_constituents{$_}++ for ($1, $2, $4);
	}
	else{
	die "Can't get constituents from $s";
	}
	}
	$c = keys %uniq_constituents;
	push @c, $c;
	}
	# say join ' ', @c, "\n";
	$c[0] <=> $c[1];
	} @parses;

	#say join "\n", map { join ("\n", @$_) . "\n" } @parses;

	Marpa::R2::Test::is(
	join ("\n", @{$parses[0]}) . "\n",
	$expected,
	'Parses of ambiguous English sentences sorted by constituent count'
	);

	1; # In case used as "do" file