Partial Go grammar with correct semicolon insertion
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
grammar Go {
    # Partial lexer-level grammar for Go that simulates the Go rule of
    # automatic semicolon insertion.
    #
    # This is a funny grammar because we are trying to simulate a separate
    # lexer, so in the main "grammar" part don't parse characters yourself;
    # use only the lexeme tokens (l_id, l_kw, l_op, l_semi, l_int_lit, ...).
    #
    # Normally the 'real' cursor always points after whitespace.  If the
    # 'logical' cursor is to point to an inserted semicolon, the 'real' cursor
    # will be placed ONE CHARACTER BEFORE the next token start (and the
    # location will be marked in @*ADDSEMI).
    #
    # @*ADDSEMI is a dynamic variable that this grammar reads and writes but
    # never declares; the caller must declare it before parsing.
    #
    # NOTE(review): self.cursor(...), $ws.to and $¢.pos are used as a
    # cursor-level API; they look implementation-specific -- confirm against
    # the Perl 6 implementation this grammar is meant to run on.
    # NOTE(review): Go also inserts a semicolon at end of input after a
    # qualifying token; nothing here handles that case.

    # No automatic whitespace processing: always succeeds, consumes nothing.
    # (<?> is the explicit always-true assertion; an empty regex body is
    # rejected as a null regex by current Raku.)
    token ws() { <?> }

    # Raw whitespace: blanks, tabs, CR, LF, '//' line comments and
    # '/* ... */' general comments, repeated zero or more times.
    token l_space_raw() {
        [ <[\x20\x09\x0D\x0A]>
        | '//' \N*              # line comment runs up to (not including) the newline
        | '/*' .*? '*/'
        ]*
    }

    token l_letter        { <:Letter> | _ }
    token l_decimal_digit { <[ 0 .. 9 ]> }
    token l_octal_digit   { <[ 0 .. 7 ]> }
    token l_hex_digit     { <[ 0..9 A..F a..f ]> }

    # Because we can't actually modify the incoming character stream, we
    # have to simulate semicolon insertion; in particular, non-semicolon
    # lexemes cannot match at an inserted semicolon point.

    # Skip whitespace/comments after a lexeme.
    #   $semi -- true when the lexeme just matched is one after which a
    #            newline must be turned into a semicolon.
    # When $semi is true and the skipped text contains a CR or LF, the
    # position one character before the end of the whitespace is recorded in
    # @*ADDSEMI and a cursor at that position is returned; otherwise the
    # cursor just past the whitespace is returned.
    method l_space($semi) {
        # Take the first (only) result of the raw whitespace matcher.
        my ($ws) = self.l_space_raw;
        if $semi && $ws ~~ /<[\x0D\x0A]>/ {
            @*ADDSEMI[$ws.to-1] := True;
            return self.cursor($ws.to-1);
        } else {
            return $ws;
        }
    }

    # Zero-width guard: succeeds only when NO inserted semicolon is pending
    # at the current position.  Ordinary lexemes call this positively.
    token l_notbeforesemi() { <?{ !@*ADDSEMI[$¢.pos] }> }

    # Reserved words.
    my %kw = <break case chan const continue default defer else fallthrough
        for func go goto if import interface map package range return select
        struct switch type var> X=> True;

    # Keywords and operators after which a newline becomes a semicolon.
    # (Non-keyword identifiers and literals are handled by their own tokens.)
    my %space = <break continue fallthrough return ++ -- ) ] }> X=> True;

    # Every multi-character operator, used by l_op to refuse matching a
    # proper prefix of a longer operator.  '..' is not an operator itself; it
    # is listed so that '.' is not matched as the first character of '...'.
    my %opextend = « << >> &^ += -= *= /= %= &= |= ^= <<= >>= &^= && || <-
        ++ -- == != <= >= := ... .. » X=> True;

    # An identifier or keyword, followed by whitespace.  A newline after it
    # inserts a semicolon unless it is a keyword that is not listed in %space.
    token l_idorkw() {
        <.l_notbeforesemi>
        $<chars> = [ <.l_letter> \w* ]     # Go identifiers may start with '_'
        <.l_space(!%kw{$<chars>} || %space{$<chars>})>
    }

    # A non-keyword identifier.
    token l_id() { <l_idorkw> <?{ !%kw{~$<l_idorkw><chars>} }> }

    # The specific word $str (used for keywords).
    token l_kw($str) { <l_idorkw> <?{ $<l_idorkw><chars> eq $str }> }

    # A semicolon, either inserted or literal.  Use l_semi for ';', never
    # l_op(';').
    token l_semi {
        # An inserted semicolon was recorded here: consume it by stepping over
        # the one character l_space left before the next token.
        { return self.cursor(self.pos+1) if @*ADDSEMI[self.pos] }
        ';'
        <.l_space(False)>
    }

    # The operator or punctuation $str, provided it is not merely the start
    # of a longer operator ($str plus the next character found in %opextend).
    token l_op($str) {
        <.l_notbeforesemi>
        $str
        <!before $<next>=[.] <?{ %opextend{$str ~ $<next>} }> >
        <.l_space(%space{$str})>
    }

    # Integer literal: decimal, octal or hexadecimal.  The lookahead rejects
    # text that is really the integer part of a float or imaginary literal.
    token l_int_lit {
        <.l_notbeforesemi>
        [ <[1..9]> <[0..9]>*
        | 0 <[0..7]>*
        | 0 <[xX]> <l_hex_digit>+
        ]
        <![ e E . i ]>
        <.l_space(True)>
    }

    token l_exp { <[eE]> <[+-]>? <[0..9]>+ }

    # The body of a floating-point literal, shared by float and imaginary.
    token l_float_guts {
        [ <[0..9]>+ '.' <[0..9]>* <l_exp>?
        | <[0..9]>+ <l_exp>
        | '.' <[0..9]>+ <l_exp>?
        ]
    }

    # Floating-point literal.  Like l_int_lit, refuse to match the prefix of
    # an imaginary literal such as 1.5i.
    token l_float_lit {
        <.l_notbeforesemi>
        <l_float_guts>
        <![ i ]>
        <.l_space(True)>
    }

    # Imaginary literal: decimal digits or float body followed by 'i'.
    token l_imag_lit {
        <.l_notbeforesemi>
        [ <[0..9]>+ | <l_float_guts> ]
        i
        <.l_space(True)>
    }

    # ... you get the idea.
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment