Created
July 3, 2015 21:25
-
-
Save skids/d1552470ba7c4ca6d318 to your computer and use it in GitHub Desktop.
On the state of Cursor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# At http://irclog.perlgeek.de/perl6/2015-06-27#i_10814220 I mentioned
# the concept of a rule that matches strings contained in a baggy thing
# only as many times as they appear in the bag, as a litmus test for
# the current limitations of using Cursor and/or dynamic state inside
# grammars.
# On closer examination, the limitations are not as bad as I initially thought,
# but they are still prohibitive to making something like that work.
# First, here's why I thought Cursor was a bit broken:
# Demo 1: from a closure at the end of TOP, after both <f> subrules have
# matched, print what self.pos reports.
grammar bar {
    # The closure runs only once both <f> calls have succeeded.
    regex TOP { <f> <f> { self.pos.say } }
    # Matches any single character.
    regex f { . }
}
bar.parse("aa");
# Recorded output:
#-> 0
# S02 says (modulo the $¢ identifier itself being NYI):
# "Within a closure, the instantaneous position within the search is denoted
# by the $¢.pos method"
# ...which is not what we are getting here.
#
# This is actually not too bad, though. What Cursor.pos holds is the start
# of the current rule, and you can still get your current position
# through $/.to:
# Demo 2: inside a subrule <g> that starts one character into the input,
# print self.pos next to the $/.from .. $/.to range of the current match.
grammar bar2 {
    # One character is consumed by <f> before <g> is entered.
    regex TOP { <f> <g> }
    # The closure runs after both inner <f> calls have matched.
    regex g { <f> <f> { self.pos.say; say $/.from ~ ".." ~ $/.to } }
    # Matches any single character.
    regex f { . }
}
bar2.parse("aaa");
# Recorded output (first line is self.pos, second is from..to):
#-> 1
#-> 1..3
# Per spec, $/ holds the hypothetical captures from your innermost rule only:
# Demo 3: count the <f> captures visible through $/ inside <g>. TOP has
# already matched one <f> of its own before <g> is entered.
grammar bar3 {
    regex TOP { <f> <g> }
    # The closure prints how many <f> captures $/ holds at this point.
    regex g { <f> <f> { $/<f>.elems.say } }
    # Matches any single character.
    regex f { . }
}
bar3.parse("aaa");
# Recorded output:
#-> 2
# But you may want them from rules that called you, which is what
# Cursor is supposed to be for. There seems to be self.CAPHASH
# for that, though it does not appear to be quite ready for user-facing
# use.
# Demo 4: inside <g>, look up the <f> entry of self.CAPHASH (rather than
# of $/) and print it as a string.
# NOTE(review): CAPHASH looks like an implementation-level method, not a
# documented API -- confirm it exists on the Rakudo version in use.
grammar bar4 {
    regex TOP { <f> <g> }
    # The closure runs after both inner <f> calls have matched.
    regex g { <f> <f> { self.CAPHASH.hash<f>.Str.say; } }
    # Matches any single character.
    regex f { . }
}
bar4.parse("aaa");
# Recorded output:
#-> a
# So... self.CAPHASH does not have anything that happened in the
# current rule, JUST the stuff from previous rules. Well, it is
# LTA (less than awesome) that you have to glue $/ and self.CAPHASH
# together, but on the other hand you may want exactly what CAPHASH has.
# (Though gluing versus using CALLERS are different strata of convenience IMO.)
# Also there is this:
# Demo 5: print the keys of self.CAPHASH every time <foo> matches an "a",
# both when <foo> is called directly from TOP and when it is called via <bar>.
# NOTE(review): CAPHASH looks like an implementation-level method, not a
# documented API -- confirm it exists on the Rakudo version in use.
grammar foo {
    # Try "<foo> b" first; fall back to "<bar> b" only if that fails.
    regex TOP { [ <foo> b || <bar> b ]+ };
    regex bar { <foo> <foo> <ft> };
    # The closure runs after the literal "a" has matched.
    regex foo { a { self.CAPHASH.hash.keys.say; } };
    regex ft { 42 }
};
foo.parse('aa42bab');
# Recorded output, one line per <foo> match:
#-> bar foo
#-> foo
#-> foo
#-> bar foo
# That actually still mystifies me a bit. Sometimes keys show up,
# with no contents, where you would not expect them.
# So if we do glue $/ and self.CAPHASH together, does that give us
# enough information to implement the <inbag> rule without carrying
# around our own dynamic state, just by brute-force deduction? Not
# quite:
# Attempt at an <inbag> rule: match strings taken from the bag %b, working
# out what has already been consumed from $/ and self.CAPHASH instead of
# keeping separate dynamic state.
# NOTE(review): CAPHASH looks like an implementation-level method, not a
# documented API -- confirm it exists on the Rakudo version in use.
grammar bagonce {
    # (We'll forget about parameterizing the rule and about positional
    # captures to keep things simple.)

    # The bag of strings <inbag> may match, with their multiplicities.
    my %b := BagHash.new(<a b b c c c 1 2 2 3 3 3>);

    # Returns %a plus the stringified values stored under $m's "inbag"
    # capture, then recurses into every other named capture of $m whose
    # value is a Match and accumulates those too.
    my multi already-in-bag (%a, Match $m) {
        my %q := %a (+) BagHash.new($m.hash<inbag>.values.map: *.Str);
        for $m.hash.pairs.grep: {
            $_.value ~~ Match and not $_.key eq "inbag"
        } {
            %q := already-in-bag(%q, .value);
        }
        return %q;
    }

    # Same as the two-argument candidate, but additionally adds the "inbag"
    # entries of $caphash and recurses into its other Match-valued entries.
    my multi already-in-bag (%a, Match $m, $caphash) {
        my %q := already-in-bag(%a, $m);
        %q := %q (+) BagHash.new($caphash<inbag>.values.map: *.Str);
        for $caphash.hash.pairs.grep: {
            $_.value ~~ Match and not $_.key eq "inbag"
        } {
            %q := already-in-bag(%q, .value);
        }
        return %q;
    }

    # Subtracts what already-in-bag reports from %b, stores the remaining
    # keys in @okhere, then matches any one of those strings.
    regex inbag {
        :my @okhere;
        { my %c;
          %c := %b (-) already-in-bag(BagHash.new(), $/, self.CAPHASH );
          @okhere = %c.keys;
        }
        @okhere
    }

    # Entry point in which every <inbag> call is a capturing call.
    regex unhiddentop {
        <inbag>+
    }

    # Entry point whose first call, <.inbag>, is non-capturing.
    regex TOP {
        <.inbag> <inbag>+
    }
}
bagonce.parse("abbccc122333").say; # Want this to succeed, does
bagonce.parse("abbccc122333a").say; # Want this to fail, succeeds
bagonce.parse("abbccc122333a", :rule<unhiddentop>).say; # Want to fail, does
# ... not when you have non-capturing rules. And this is why I have
# started to develop a major distaste for <.rule>. On the surface
# it seems like a convenient way to pre-prune parse trees rather than
# build a fully connected .made. But it does not just hide things from
# the user of your grammar API, it hides them internally, and if you
# are subclassing someone else's grammar, such calls may be littered around
# in inconvenient places in the middle of long rules, so you have to
# mirror their code:
# Illustrative upstream grammar; the "# ..." lines stand in for rules that
# are not shown. Its foo rule calls itself non-capturingly via <.foo>.
grammar someone-elses-grammar {
    # ...
    regex foo { <bar> <.foo> || [ <compicated> <grr> ]+ <.foo> }
    # ...
}
# Subclass that repeats the parent's whole foo rule with the non-capturing
# <.foo> calls replaced by capturing <foo> calls.
grammar i-just-want-to-see-foo is someone-elses-grammar {
    regex foo { <bar> <foo> || [ <compicated> <grr> ]+ <foo> }
}
#... and then when the upstream author changes that rule to fix a bug...
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment