Skip to content

Instantly share code, notes, and snippets.

@rns
Created November 28, 2013 06:50
Show Gist options
  • Save rns/7688187 to your computer and use it in GitHub Desktop.
Save rns/7688187 to your computer and use it in GitHub Desktop.
cross-sentence disambiguation by naive parse sorting
#!/usr/bin/perl
# Copyright 2013 Jeffrey Kegler, part of Marpa::R2. Marpa::R2 is free software: you can
# redistribute it and/or modify it under the terms of the GNU Lesser
# General Public License as published by the Free Software Foundation,
# either version 3 of the License, or (at your option) any later version.
#
# Marpa::R2 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser
# General Public License along with Marpa::R2. If not, see
# http://www.gnu.org/licenses/.
# This example is from Ralf Muschall, who clearly knows English
# grammar better than most native speakers. I've reworked the
# terminology to follow _A Comprehensive Grammar of the English
# Language_, by Quirk, Greenbaum, Leech and Svartvik. My edition
# was the "Seventh (corrected) impression 1989".
#
# When it is not a verb, I treat "like"
# as a preposition in an adjunct of manner,
# as per 8.79, p. 557; 9.4, pp. 661; and 9.48, pp. 698-699.
#
# The saying "time flies like an arrow; fruit flies like a banana"
# is attributed to Groucho Marx, but there is no reason to believe
# he ever said it. Apparently, the saying
# first appeared on the Usenet on net.jokes in 1982.
# I've documented this whole thing on Wikipedia:
# http://en.wikipedia.org/wiki/Time_flies_like_an_arrow
#
# The permalink is:
# http://en.wikipedia.org/w/index.php?title=Time_flies_like_an_arrow&oldid=311163283
use 5.010;
use strict;
use warnings;
use English qw( -no_match_vars );
use Test::More tests => 1;
use lib 'inc';
use Marpa::R2::Test;
use Marpa::R2;
## no critic (Subroutines::RequireArgUnpacking)
sub do_paragraph { shift; \@_ }
sub do_sva_sentence { return "sva($_[1];$_[2];$_[3])" }
sub do_svo_sentence { return "svo($_[1];$_[2];$_[3])" }
sub do_nominal_adjunct { return "adju($_[1];$_[2])" }
sub do_adverbial_adjunct { return "adju($_[1])" }
sub do_adjective { return "adje($_[1])" }
sub do_qualified_subject { return "s($_[1];$_[2])" }
sub do_bare_subject { return "s($_[1])" }
sub do_noun { return "n($_[1])" }
sub do_verb { return "v($_[1])" }
sub do_adverb { return "adv($_[1])" }
sub do_object { return "o($_[1];$_[2])" }
sub do_article { return "art($_[1])" }
sub do_preposition { return "pr($_[1])" }
## use critic
my $grammar = Marpa::R2::Scanless::G->new(
{
source => \(<<'END_OF_SOURCE'),
paragraph ::= sentence+ action => do_paragraph
sentence ::= subject verb adjunct period action => do_sva_sentence
sentence ::= subject verb object period action => do_svo_sentence
adjunct ::= preposition object action => do_nominal_adjunct
adjunct ::= adverb action => do_adverbial_adjunct
adjective ::= adjective_noun_lex action => do_adjective
subject ::= adjective noun action => do_qualified_subject
subject ::= noun action => do_bare_subject
noun ::= adjective_noun_lex action => do_noun
adverb ::= adverb_lex action => do_adverb
verb ::= verb_lex action => do_verb
object ::= article noun action => do_object
article ::= article_lex action => do_article
preposition ::= preposition_lex action => do_preposition
preposition_lex ~ unicorn
verb_lex ~ unicorn
adjective_noun_lex ~ unicorn
article_lex ~ unicorn
adverb_lex ~ unicorn
period ~ unicorn
unicorn ~ [^\s\S]
END_OF_SOURCE
}
);
# this parse has the lowest number of unique constituents (subjects, verbs, objects)
my $expected = <<'EOS';
sva(s(n(Time));v(flies);adju(pr(like);o(art(an);n(arrow))))
sva(s(n(Time));v(flies);adju(adv(fast)))
svo(s(adje(Fruit);n(flies));v(like);o(art(a);n(banana)))
svo(s(adje(Fruit);n(flies));v(spoil);o(art(a);n(banana)))
EOS
my @actual = ();
# setup { word => lex class } mapping
my %lexical_class = (
'preposition_lex' => 'like',
'verb_lex' => 'like flies spoil',
'adjective_noun_lex' => 'fruit banana time arrow flies',
'article_lex' => 'a an',
'adverb_lex' => 'fast',
'period' => '.'
);
my %vocabulary = ();
while ( my ( $lexical_class, $words ) = each %lexical_class ) {
for my $word ( split q{ }, $words ) {
push @{ $vocabulary{$word} }, $lexical_class;
}
}
my $recce = Marpa::R2::Scanless::R->new( {
grammar => $grammar ,
semantics_package => 'main',
} );
die 'Failed to create recognizer' if not $recce;
my $data = <<END_OF_DATA;
Time flies like an arrow.
Time flies fast.
Fruit flies like a banana.
Fruit flies spoil a banana.
END_OF_DATA
# external parsing
my $start = 0;
$recce->read( \$data, 0, 0 );
# split to words preserving punctuation
my @words = grep { $_ } split /\s+|(\.)/m, $data;
for my $word ( @words ) {
for my $type ( @{ $vocabulary{lc $word} } ) {
# say "'$word':$type @ $start";
eval { $recce->lexeme_alternative( $type, $word ) };
die $@, $recce->show_progress(), ' at ', __LINE__ if $@;
}
eval { $recce->lexeme_complete( $start, length $word ) };
die $@, $recce->show_progress(), ' at ', __LINE__ if $@;
$start += length ($word);
} ## end for my $word ( split q{ }, $data )
# get evaluated parses
my @parses;
while ( defined( my $value_ref = $recce->value() ) ) {
my $value = $value_ref ? ${$value_ref} : 'No parse';
push @parses, $value;
}
# descending sorting of parses by the number of unique constituents (subjects, verbs, and objects)
@parses = sort {
my @c;
for my $parse ($a, $b){
my $c = 0;
my %uniq_constituents;
for my $s (@$parse){
if ( $s =~ /s\((.+?)\);v\((.+?)\);(o|adju)\((.+?)\)\)$/ ) {
$uniq_constituents{$_}++ for ($1, $2, $4);
}
else{
die "Can't get constituents from $s";
}
}
$c = keys %uniq_constituents;
push @c, $c;
}
# say join ' ', @c, "\n";
$c[0] <=> $c[1];
} @parses;
#say join "\n", map { join ("\n", @$_) . "\n" } @parses;
Marpa::R2::Test::is(
join ("\n", @{$parses[0]}) . "\n",
$expected,
'Parses of ambiguous English sentences sorted by constituent count'
);
1; # In case used as "do" file
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment