Created
November 28, 2013 06:50
-
-
Save rns/7688187 to your computer and use it in GitHub Desktop.
cross-sentence disambiguation by naive parse sorting
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
# Copyright 2013 Jeffrey Kegler, part of Marpa::R2. Marpa::R2 is free software: you can | |
# redistribute it and/or modify it under the terms of the GNU Lesser | |
# General Public License as published by the Free Software Foundation, | |
# either version 3 of the License, or (at your option) any later version. | |
# | |
# Marpa::R2 is distributed in the hope that it will be useful, | |
# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
# Lesser General Public License for more details. | |
# | |
# You should have received a copy of the GNU Lesser | |
# General Public License along with Marpa::R2. If not, see | |
# http://www.gnu.org/licenses/. | |
# This example is from Ralf Muschall, who clearly knows English | |
# grammar better than most native speakers. I've reworked the | |
# terminology to follow _A Comprehensive Grammar of the English | |
# Language_, by Quirk, Greenbaum, Leech and Svartvik. My edition | |
# was the "Seventh (corrected) impression 1989". | |
# | |
# When it is not a verb, I treat "like" | |
# as a preposition in an adjunct of manner, | |
# as per 8.79, p. 557; 9.4, pp. 661; and 9.48, pp. 698-699. | |
# | |
# The saying "time flies like an arrow; fruit flies like a banana" | |
# is attributed to Groucho Marx, but there is no reason to believe | |
# he ever said it. Apparently, the saying | |
# first appeared on the Usenet on net.jokes in 1982. | |
# I've documented this whole thing on Wikipedia: | |
# http://en.wikipedia.org/wiki/Time_flies_like_an_arrow | |
# | |
# The permalink is: | |
# http://en.wikipedia.org/w/index.php?title=Time_flies_like_an_arrow&oldid=311163283 | |
use 5.010; | |
use strict; | |
use warnings; | |
use English qw( -no_match_vars ); | |
use Test::More tests => 1; | |
use lib 'inc'; | |
use Marpa::R2::Test; | |
use Marpa::R2; | |
## no critic (Subroutines::RequireArgUnpacking) | |
sub do_paragraph { shift; \@_ } | |
sub do_sva_sentence { return "sva($_[1];$_[2];$_[3])" } | |
sub do_svo_sentence { return "svo($_[1];$_[2];$_[3])" } | |
sub do_nominal_adjunct { return "adju($_[1];$_[2])" } | |
sub do_adverbial_adjunct { return "adju($_[1])" } | |
sub do_adjective { return "adje($_[1])" } | |
sub do_qualified_subject { return "s($_[1];$_[2])" } | |
sub do_bare_subject { return "s($_[1])" } | |
sub do_noun { return "n($_[1])" } | |
sub do_verb { return "v($_[1])" } | |
sub do_adverb { return "adv($_[1])" } | |
sub do_object { return "o($_[1];$_[2])" } | |
sub do_article { return "art($_[1])" } | |
sub do_preposition { return "pr($_[1])" } | |
## use critic | |
my $grammar = Marpa::R2::Scanless::G->new( | |
{ | |
source => \(<<'END_OF_SOURCE'), | |
paragraph ::= sentence+ action => do_paragraph | |
sentence ::= subject verb adjunct period action => do_sva_sentence | |
sentence ::= subject verb object period action => do_svo_sentence | |
adjunct ::= preposition object action => do_nominal_adjunct | |
adjunct ::= adverb action => do_adverbial_adjunct | |
adjective ::= adjective_noun_lex action => do_adjective | |
subject ::= adjective noun action => do_qualified_subject | |
subject ::= noun action => do_bare_subject | |
noun ::= adjective_noun_lex action => do_noun | |
adverb ::= adverb_lex action => do_adverb | |
verb ::= verb_lex action => do_verb | |
object ::= article noun action => do_object | |
article ::= article_lex action => do_article | |
preposition ::= preposition_lex action => do_preposition | |
preposition_lex ~ unicorn | |
verb_lex ~ unicorn | |
adjective_noun_lex ~ unicorn | |
article_lex ~ unicorn | |
adverb_lex ~ unicorn | |
period ~ unicorn | |
unicorn ~ [^\s\S] | |
END_OF_SOURCE | |
} | |
); | |
# this parse has the lowest number of unique constituents (subjects, verbs, objects) | |
my $expected = <<'EOS'; | |
sva(s(n(Time));v(flies);adju(pr(like);o(art(an);n(arrow)))) | |
sva(s(n(Time));v(flies);adju(adv(fast))) | |
svo(s(adje(Fruit);n(flies));v(like);o(art(a);n(banana))) | |
svo(s(adje(Fruit);n(flies));v(spoil);o(art(a);n(banana))) | |
EOS | |
my @actual = (); | |
# setup { word => lex class } mapping | |
my %lexical_class = ( | |
'preposition_lex' => 'like', | |
'verb_lex' => 'like flies spoil', | |
'adjective_noun_lex' => 'fruit banana time arrow flies', | |
'article_lex' => 'a an', | |
'adverb_lex' => 'fast', | |
'period' => '.' | |
); | |
my %vocabulary = (); | |
while ( my ( $lexical_class, $words ) = each %lexical_class ) { | |
for my $word ( split q{ }, $words ) { | |
push @{ $vocabulary{$word} }, $lexical_class; | |
} | |
} | |
my $recce = Marpa::R2::Scanless::R->new( { | |
grammar => $grammar , | |
semantics_package => 'main', | |
} ); | |
die 'Failed to create recognizer' if not $recce; | |
my $data = <<END_OF_DATA; | |
Time flies like an arrow. | |
Time flies fast. | |
Fruit flies like a banana. | |
Fruit flies spoil a banana. | |
END_OF_DATA | |
# external parsing | |
my $start = 0; | |
$recce->read( \$data, 0, 0 ); | |
# split to words preserving punctuation | |
my @words = grep { $_ } split /\s+|(\.)/m, $data; | |
for my $word ( @words ) { | |
for my $type ( @{ $vocabulary{lc $word} } ) { | |
# say "'$word':$type @ $start"; | |
eval { $recce->lexeme_alternative( $type, $word ) }; | |
die $@, $recce->show_progress(), ' at ', __LINE__ if $@; | |
} | |
eval { $recce->lexeme_complete( $start, length $word ) }; | |
die $@, $recce->show_progress(), ' at ', __LINE__ if $@; | |
$start += length ($word); | |
} ## end for my $word ( split q{ }, $data ) | |
# get evaluated parses | |
my @parses; | |
while ( defined( my $value_ref = $recce->value() ) ) { | |
my $value = $value_ref ? ${$value_ref} : 'No parse'; | |
push @parses, $value; | |
} | |
# descending sorting of parses by the number of unique constituents (subjects, verbs, and objects) | |
@parses = sort { | |
my @c; | |
for my $parse ($a, $b){ | |
my $c = 0; | |
my %uniq_constituents; | |
for my $s (@$parse){ | |
if ( $s =~ /s\((.+?)\);v\((.+?)\);(o|adju)\((.+?)\)\)$/ ) { | |
$uniq_constituents{$_}++ for ($1, $2, $4); | |
} | |
else{ | |
die "Can't get constituents from $s"; | |
} | |
} | |
$c = keys %uniq_constituents; | |
push @c, $c; | |
} | |
# say join ' ', @c, "\n"; | |
$c[0] <=> $c[1]; | |
} @parses; | |
#say join "\n", map { join ("\n", @$_) . "\n" } @parses; | |
Marpa::R2::Test::is( | |
join ("\n", @{$parses[0]}) . "\n", | |
$expected, | |
'Parses of ambiguous English sentences sorted by constituent count' | |
); | |
1; # In case used as "do" file | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment