Skip to content

Instantly share code, notes, and snippets.

@jeffreykegler
Created January 4, 2012 04:10
Show Gist options
  • Save jeffreykegler/1558440 to your computer and use it in GitHub Desktop.
Save jeffreykegler/1558440 to your computer and use it in GitHub Desktop.
Example: Subset of SQL-2003 SQL INSERT statement
use strict;
use warnings;
use 5.010;
use Marpa::XS;
use Data::Dumper;
# An SQL INSERT statement for testing
my $text = q{
INSERT INTO `tehtable` ( `field00`, `field01` ) VALUES ( 123, `abcd\'e
+fgh` );
};
# PART 1: LEXICAL ANALYSIS
my %symbol_for_char = (
'(' => 'OPEN_PAREN',
')' => 'CLOSE_PAREN',
';' => 'SEMICOLON',
',' => 'COMMA',
);
sub read_terminals {
my ( $recce, $text ) = @_;
DO_TOKEN: while (1) {
my $token_type = undef;
my $token_value = undef;
FIND_TYPE_AND_VALUE: {
next DO_TOKEN if $text =~ m{\G \s+ }gcxms;
if ($text =~ m{\G \s* ([`'"]) ((?:[^\\\1] | \\.)*?) \1 \s*}gcxms )
{
$token_type = 'STRING';
$token_value = $2;
last FIND_TYPE_AND_VALUE;
} ## end if ( $text =~ ...)
if ( $text =~ m{\G \s* INSERT \s*}gcxmsi ) {
$token_type = 'INSERT';
last FIND_TYPE_AND_VALUE;
}
if ( $text =~ m{\G \s* INTO \s*}gcxmsi ) {
$token_type = 'INTO';
last FIND_TYPE_AND_VALUE;
}
if ( $text =~ m{\G \s* VALUES \s*}gcxmsi ) {
$token_type = 'VALUES';
last FIND_TYPE_AND_VALUE;
}
if ( $text =~ m{\G \s* (\.\d+ | \d+ (?:\.\d*)?) \s*}gcxms ) {
$token_type = 'NUMBER';
$token_value = $1;
last FIND_TYPE_AND_VALUE;
}
if ( $text =~ m{\G (.) }gcxms ) {
my $char = $1;
if ( exists( $symbol_for_char{$char} ) ) {
$token_type = $symbol_for_char{$char};
$token_value = $1;
}
else {
$token_type = 'CHAR';
}
} ## end if ( $text =~ m{\G (.) }gcxms )
} ## end FIND_TYPE_AND_VALUE:
return if not defined $token_type;
if ( defined $recce->read( $token_type, $token_value ) ) {
say "Reading Token: $token_type";
}
else {
die "Error reading Token: $token_type";
}
} ## end while (1)
} ## end sub read_terminals
# Part 2: THE GRAMMAR
my $grammar = Marpa::XS::Grammar->new(
{ start => 'sql',
actions => 'My_Action',
default_action => 'do_what_I_mean',
inaccessible_ok => [qw(SPACE)],
terminals => [
qw(
OPEN_PAREN CLOSE_PAREN
SPACE COMMA
INSERT INTO VALUES
STRING NUMBER
SEMICOLON
)
],
rules => [
{ lhs => 'sql',
rhs => [qw(insert_statement SEMICOLON)],
action => 'child1'
},
{ lhs => 'insert_statement',
rhs => [
qw(INSERT INTO insertion_target insert_columns_and_source)
],
action => 'last_arg'
},
{ lhs => 'insertion_target',
rhs => [qw(table_name)],
},
{ lhs => 'table_name',
rhs => [qw(STRING)],
},
{ lhs => 'insert_columns_and_source',
rhs => [qw(from_constructor)],
},
{ lhs => 'from_constructor',
rhs => [
qw(from_constructor_column_list contextually_typed_value_constructor)
]
},
{ lhs => 'from_constructor_column_list' },
{ lhs => 'from_constructor_column_list',
rhs => [qw(OPEN_PAREN insert_column_list CLOSE_PAREN)],
action => 'child2'
},
{ lhs => 'insert_column_list',
rhs => ['column_name_list'],
},
{ lhs => 'column_name_list',
rhs => ['column_name'],
separator => 'COMMA',
min => 1
},
{ lhs => 'column_name', rhs => ['STRING'] },
{ lhs => 'contextually_typed_value_constructor',
rhs =>
[qw(VALUES contextually_typed_row_value_expression_list)]
},
{ lhs => 'contextually_typed_row_value_expression_list',
rhs => ['contextually_typed_row_value_expression'],
separator => 'COMMA',
min => 1
},
{ lhs => 'contextually_typed_row_value_expression',
rhs => ['contextually_typed_row_value_constructor']
},
{ lhs => 'contextually_typed_row_value_constructor',
rhs => ['common_value_expression']
},
{ lhs => 'contextually_typed_row_value_constructor',
rhs => [
qw(OPEN_PAREN
contextually_typed_row_value_constructor_expression_list CLOSE_PAREN)
],
action => 'child2'
},
{ lhs =>
'contextually_typed_row_value_constructor_expression_list',
rhs => ['contextually_typed_row_value_constructor_element'],
separator => 'COMMA',
min => 1
},
{ lhs => 'contextually_typed_row_value_constructor_element',
rhs => ['value_expression']
},
{ lhs => 'value_expression',
rhs => ['common_value_expression']
},
{ lhs => 'common_value_expression',
rhs => ['STRING']
},
{ lhs => 'common_value_expression',
rhs => ['NUMBER']
}
],
}
);
# Part 3: THE SEMANTICS
# Do What I Mean: That is, return a value
# that is what I want, most of the time.
#
# Specificially:
# Always throw away the per-parse variable,
# which is ignored in this application.
# Throw away any undefined child values as
# well. If only one child value is left, it is
# returned. If muliple child values are left
# a reference to an array of them is returned.
#
sub My_Action::do_what_I_mean {
my @args = grep {defined} @_[ 1 .. $#_ ];
return undef if scalar @args <= 0;
return $args[0] if scalar @args == 1;
return \@args;
} ## end sub My_Action::do_what_I_mean
# Value is the value of the first child
sub My_Action::child1 {
return $_[1];
}
# Value is the value of the second child
sub My_Action::child2 {
return $_[2];
}
# Value is the value of the last child
sub My_Action::last_arg {
return $_[-1];
}
# Part 4: DO IT
$grammar->precompute;
my $rec = Marpa::XS::Recognizer->new( { grammar => $grammar } );
read_terminals( $rec, $text );
my $value_ref = $rec->value();
die "No parse" if not defined $value_ref;
say Data::Dumper->Dump( [ ${$value_ref} ], ['value'] );
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment