Created May 10, 2018 04:36
Lite Lexical Rules and Syntax
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Copyright (C) 1998-2015 Gerwin Klein <> *
* All rights reserved. *
* *
* License: BSD *
* *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/* Java 1.2 language lexer specification */
/* Modified by duangsuse to compat Lite lexical rules */
/* Use together with unicode.flex for Unicode preprocessing */
/* and java12.cup for a Java 1.2 parser */
/* Note that this lexer specification is not tuned for speed.
It is in fact quite slow on integer and floating point literals,
because the input is read twice and the methods used to parse
the numbers are not very fast.
For a production quality application (e.g. a Java compiler)
this could be optimized */
import beaver.Symbol;
import beaver.Scanner;
%%

%class Lexer
%extends Scanner
%function nextToken
%type Symbol
%yylexthrow Scanner.Exception

/* Unicode input is required for [:jletter:] / [:jletterdigit:]; line and
   column counting is required because symbol() reads yyline / yycolumn. */
%unicode
%line
%column

/* Lexical states entered by the opening quote of "..." and '...' literals. */
%state STRING, STRING_SINGLE

%eofval{
  return new Symbol(Terminals.EOF, "end of file");
%eofval}

%{
  /** Accumulates the decoded contents of the string literal being scanned. */
  StringBuilder string = new StringBuilder();

  /**
   * Creates a value-less token located at the current match.
   *
   * Beaver's Symbol takes a short id; its line/column constructors also need
   * the token length, otherwise the two ints are read as packed start/end
   * positions.
   *
   * @param type terminal id of the token
   * @return the new token, with 1-based line and column
   */
  private Symbol symbol(int type) {
    return new Symbol((short) type, yyline + 1, yycolumn + 1, yylength());
  }

  /**
   * Creates a token carrying a semantic value, located at the current match.
   *
   * @param type  terminal id of the token
   * @param value semantic value attached to the token
   * @return the new token, with 1-based line and column
   */
  private Symbol symbol(int type, Object value) {
    return new Symbol((short) type, yyline + 1, yycolumn + 1, yylength(), value);
  }

  /**
   * Parses part of the current match as an unsigned number.
   *
   * Assumes a correct representation of a long value for the specified radix
   * in the scanner buffer from <code>start</code> to <code>end</code>; the
   * digits are not validated and overflow is not detected.
   *
   * @param start index of the first digit, relative to the start of the match
   * @param end   index one past the last digit
   * @param radix numeric base of the digits
   * @return the accumulated value
   */
  private long parseLong(int start, int end, int radix) {
    long result = 0;
    for (int i = start; i < end; i++) {
      long digit = Character.digit(yycharat(i), radix);
      result = result * radix + digit;
    }
    return result;
  }
%}
/* main character classes */
/* One line break: a bare CR, a bare LF, or a CR LF pair. */
LineTerminator = \r|\n|\r\n
/* Any character that is not CR or LF. */
InputCharacter = [^\r\n]
/* Space, tab and form feed only; CR and LF are not part of this class. */
WhiteSpace = [ \t\f]
/* comments */
/* Block comments are delimited by ># and #<, line comments start with #.
   Comment lists all three comment forms; the definition previously ended in
   a dangling "|" and left DocumentationComment unused. */
Comment = {TraditionalComment} | {EndOfLineComment} | {DocumentationComment}
/* Either ># followed by a non-star and everything up to the first #<,
   or ># followed only by stars and the closing #<. */
TraditionalComment = ">#" [^*] ~"#<" | ">#" "*"+ "#<"
/* From # to the end of the line; the line break, if any, is consumed too. */
EndOfLineComment = "#" {InputCharacter}* {LineTerminator}?
/* ># plus one or more stars, then text up to the first #<. The character
   after the stars must not be "#" (that case is the all-stars form of
   TraditionalComment) or another star. */
DocumentationComment = ">#" "*"+ [^#*] ~"#<"
/* identifiers */
/* One [:jletter:] followed by any number of [:jletterdigit:] characters. */
Identifier = [:jletter:][:jletterdigit:]*
/* integer literals */
/* Decimal: a lone 0, or a non-zero digit followed by further digits. */
DecIntegerLiteral = 0 | [1-9][0-9]*
/* A decimal literal with an l/L suffix. */
DecLongLiteral = {DecIntegerLiteral} [lL]
/* Hexadecimal: 0x or 0X, optional leading zeros, then 1 to 8 hex digits. */
HexIntegerLiteral = 0 [xX] 0* {HexDigit} {1,8}
/* As above with 1 to 16 hex digits and an l/L suffix. */
HexLongLiteral = 0 [xX] 0* {HexDigit} {1,16} [lL]
HexDigit = [0-9a-fA-F]
/* Octal: one or more leading zeros, an optional digit 1-3, then 1 to 15 octal digits. */
OctIntegerLiteral = 0+ [1-3]? {OctDigit} {1,15}
/* Octal long: leading zeros, an optional 1, then 1 to 21 octal digits and an l/L suffix. */
OctLongLiteral = 0+ 1? {OctDigit} {1,21} [lL]
OctDigit = [0-7]
/* floating point literals */
/* A mantissa in any of the three FLit forms, an optional exponent, and a mandatory f/F suffix. */
FloatLiteral = ({FLit1}|{FLit2}|{FLit3}) {Exponent}? [fF]
/* The same without a suffix. Because FLit3 is digits only, this macro also matches plain integers. */
DoubleLiteral = ({FLit1}|{FLit2}|{FLit3}) {Exponent}?
/* Digits, a dot, optional digits (e.g. "1." or "1.5"). */
FLit1 = [0-9]+ \. [0-9]*
/* A dot followed by digits (e.g. ".5"). */
FLit2 = \. [0-9]+
/* Digits only. */
FLit3 = [0-9]+
/* e or E, an optional sign, and at least one digit. */
Exponent = [eE] [+-]? [0-9]+
/* string and character literals */
/* Any character allowed unescaped inside "...": not CR, LF, double quote or backslash. */
StringCharacter = [^\r\n\"\\]
/* Any character allowed unescaped inside '...': not CR, LF, single quote or backslash. */
SingleCharacter = [^\r\n\'\\]
/* keywords */
%%

/* Reserved words. They are scoped to the default state so that they are not
   recognised while a string literal is being scanned, and the terminal ids
   are qualified with the Terminals class, as the end-of-file token is. */
<YYINITIAL> {
  "def"     { return symbol(Terminals.DEFINE); }
  "do"      { return symbol(Terminals.DO); }
  "break"   { return symbol(Terminals.BREAK); }
  "next"    { return symbol(Terminals.NEXT); }
  "return"  { return symbol(Terminals.RETURN); }
  "scope"   { return symbol(Terminals.SCOPE); }
  "while"   { return symbol(Terminals.WHILE); }
  "for"     { return symbol(Terminals.FOR); }
  "in"      { return symbol(Terminals.IN); }
  "as"      { return symbol(Terminals.AS); }
  "to"      { return symbol(Terminals.TO); }
  "if"      { return symbol(Terminals.IF); }
  "elif"    { return symbol(Terminals.ELIF); }
  "else"    { return symbol(Terminals.ELSE); }
  "import"  { return symbol(Terminals.IMPORT); }
  "require" { return symbol(Terminals.REQUIRE); }
  "end"     { return symbol(Terminals.END); }
  "and"     { return symbol(Terminals.ANDK); }
  "or"      { return symbol(Terminals.ORK); }

  /* boolean literals */
  "true"    { return symbol(Terminals.BOOLEAN_LITERAL, true); }
  "false"   { return symbol(Terminals.BOOLEAN_LITERAL, false); }

  /* null literal */
  "nil"     { return symbol(Terminals.NULL_LITERAL); }
}
/* separators */
/* Punctuation. Scoped to the default state so that it is not recognised
   inside string literals; JFlex's longest-match rule picks "===" over "=="
   over "=" regardless of the order below. */
<YYINITIAL> {
  "("   { return symbol(Terminals.LPAREN); }
  ")"   { return symbol(Terminals.RPAREN); }
  "{"   { return symbol(Terminals.LBRACE); }
  "}"   { return symbol(Terminals.RBRACE); }
  "["   { return symbol(Terminals.LBRACK); }
  "]"   { return symbol(Terminals.RBRACK); }
  ";"   { return symbol(Terminals.SEMICOLON); }
  ","   { return symbol(Terminals.COMMA); }
  "."   { return symbol(Terminals.DOT); }
  "@"   { return symbol(Terminals.AT); }

  /* operators */
  "="   { return symbol(Terminals.EQ); }
  ">"   { return symbol(Terminals.GT); }
  "<"   { return symbol(Terminals.LT); }
  "!"   { return symbol(Terminals.NOT); }
  ":"   { return symbol(Terminals.COLON); }
  "=="  { return symbol(Terminals.EQUAL); }
  ">="  { return symbol(Terminals.GE); }
  "<="  { return symbol(Terminals.LE); }
  "!="  { return symbol(Terminals.NOTEQ); }
  "===" { return symbol(Terminals.FULLEQ); }
  "!==" { return symbol(Terminals.NOTFULLEQ); }
  "++"  { return symbol(Terminals.INC); }
  "--"  { return symbol(Terminals.DEC); }
  "+"   { return symbol(Terminals.PLUS); }
  "-"   { return symbol(Terminals.SUB); }
  "*"   { return symbol(Terminals.MULT); }
  "/"   { return symbol(Terminals.DIV); }
  "&"   { return symbol(Terminals.AND); }
  "|"   { return symbol(Terminals.OR); }
  "^"   { return symbol(Terminals.XOR); }
  "%"   { return symbol(Terminals.MOD); }
  "**"  { return symbol(Terminals.PWR); }
  "<<"  { return symbol(Terminals.LSHIFT); }
  ">>"  { return symbol(Terminals.RSHIFT); }
  "+="  { return symbol(Terminals.PLUSEQ); }
  "-="  { return symbol(Terminals.SUBEQ); }
  "*="  { return symbol(Terminals.MULTEQ); }
  "::"  { return symbol(Terminals.SQUARE); }
}
/* string literal */
/* Literals, comments, white space and identifiers of the default state.
   Boxed values are created with valueOf() instead of the deprecated
   wrapper constructors. */
<YYINITIAL> {
  /* An opening quote switches to the matching string state and clears the buffer. */
  \"  { yybegin(STRING); string.setLength(0); }
  \'  { yybegin(STRING_SINGLE); string.setLength(0); }

  /* numeric literals */

  /* This is matched together with the minus, because the number is too big to
     be represented by a positive integer. */
  "-2147483648"       { return symbol(Terminals.INTEGER_LITERAL, Integer.valueOf(Integer.MIN_VALUE)); }

  /* Listed before {DoubleLiteral}, which also matches plain digit runs, so
     that integers win the tie. Integer.valueOf throws NumberFormatException
     for decimal literals that do not fit in an int. */
  {DecIntegerLiteral} { return symbol(Terminals.INTEGER_LITERAL, Integer.valueOf(yytext())); }
  /* The substring calls below drop the one-character type suffix. */
  {DecLongLiteral}    { return symbol(Terminals.INTEGER_LITERAL, Long.valueOf(yytext().substring(0, yylength() - 1))); }

  /* Hex digits start after the two-character 0x prefix. */
  {HexIntegerLiteral} { return symbol(Terminals.INTEGER_LITERAL, Integer.valueOf((int) parseLong(2, yylength(), 16))); }
  {HexLongLiteral}    { return symbol(Terminals.INTEGER_LITERAL, Long.valueOf(parseLong(2, yylength() - 1, 16))); }

  {OctIntegerLiteral} { return symbol(Terminals.INTEGER_LITERAL, Integer.valueOf((int) parseLong(0, yylength(), 8))); }
  {OctLongLiteral}    { return symbol(Terminals.INTEGER_LITERAL, Long.valueOf(parseLong(0, yylength() - 1, 8))); }

  {FloatLiteral}      { return symbol(Terminals.FLOATING_POINT_LITERAL, Float.valueOf(yytext().substring(0, yylength() - 1))); }
  {DoubleLiteral}     { return symbol(Terminals.FLOATING_POINT_LITERAL, Double.valueOf(yytext())); }
  {DoubleLiteral}[dD] { return symbol(Terminals.FLOATING_POINT_LITERAL, Double.valueOf(yytext().substring(0, yylength() - 1))); }

  /* comments */
  {Comment}           { /* ignore */ }

  /* whitespace */
  {WhiteSpace}        { /* ignore */ }

  /* newline: line breaks are significant and reach the parser as tokens */
  {LineTerminator}    { return symbol(Terminals.NEWLINE); }

  /* identifiers: listed after the keyword rules, so keywords win ties */
  {Identifier}        { return symbol(Terminals.IDENTIFIER, yytext()); }
}
/* Body of a double-quoted string. These rules are scoped to the STRING
   state; unscoped they would collide with the default-state rules for the
   quote and for line terminators. */
<STRING> {
  /* The closing quote ends the literal and emits the accumulated text. */
  \"                  { yybegin(YYINITIAL); return symbol(Terminals.STRING_LITERAL, string.toString()); }

  {StringCharacter}+  { string.append(yytext()); }

  /* escape sequences */
  "\\b"               { string.append('\b'); }
  "\\t"               { string.append('\t'); }
  "\\n"               { string.append('\n'); }
  "\\f"               { string.append('\f'); }
  "\\r"               { string.append('\r'); }
  "\\\""              { string.append('\"'); }
  "\\'"               { string.append('\''); }
  "\\\\"              { string.append('\\'); }
  /* Octal escape of one to three digits; the leading backslash is dropped before parsing. */
  \\[0-3]?{OctDigit}?{OctDigit} { char val = (char) Integer.parseInt(yytext().substring(1), 8);
                                  string.append(val); }

  /* error cases */
  \\.                 { throw new RuntimeException("Illegal escape sequence \"" + yytext() + "\""); }
  {LineTerminator}    { throw new RuntimeException("Unterminated string at end of line"); }
}
/* Body of a single-quoted string, scoped to the STRING_SINGLE state.
   The escapes \\, \", \r and \f are accepted here as well: without a rule
   for \\ a backslash could not be written in a single-quoted string at all,
   and the double-quoted state already accepts the same simple escapes. */
<STRING_SINGLE> {
  /* The closing quote ends the literal and emits the accumulated text. */
  \'                  { yybegin(YYINITIAL); return symbol(Terminals.STRING_LITERAL_SINGLE, string.toString()); }

  {SingleCharacter}+  { string.append(yytext()); }

  /* escape sequences */
  "\\b"               { string.append('\b'); }
  "\\t"               { string.append('\t'); }
  "\\n"               { string.append('\n'); }
  "\\f"               { string.append('\f'); }
  "\\r"               { string.append('\r'); }
  "\\\""              { string.append('\"'); }
  "\\'"               { string.append('\''); }
  "\\\\"              { string.append('\\'); }

  /* error cases */
  \\.                 { throw new RuntimeException("Illegal escape sequence \'" + yytext() + "\'"); }
  {LineTerminator}    { throw new RuntimeException("Unterminated single-quoted string at end of line"); }
}
/* error fallback */
/* Matches any single character no other rule accepted. The position is
   reported 1-based, the same convention symbol() uses for token positions. */
[^] { throw new RuntimeException("Illegal character \"" + yytext() + "\" at line " + (yyline + 1) + ", column " + (yycolumn + 1)); }
// Complete Lite Desugared Syntax (Ohm PEG)
// Lite parser by duangsuse, no rights reserved. The lexical rules below are adapted from the JavaScript (ES5) lexical grammar.
Lite {
// The JavaScript lexical rules
// §A.1 Lexical Grammar --
// First rule of the grammar: a program is a single compound statement.
Program = CompStmt
sourceCharacter = any
// Override Ohm's built-in definition of space.
// Comments count as space, so syntactic rules skip them implicitly.
space := whitespace | comment
// Horizontal white space only; line terminators are a separate rule.
whitespace = "\t"
| "\x0B" -- verticalTab
| "\x0C" -- formFeed
| " "
| "\u00A0" -- noBreakSpace
| "\uFEFF" -- byteOrderMark
| unicodeSpaceSeparator
lineTerminator = "\n" | "\r" | "\u2028" | "\u2029"
// Like lineTerminator, but CR LF is taken as one unit: a bare "\r" only
// matches when no "\n" follows.
lineTerminatorSequence = "\n" | "\r" ~"\n" | "\u2028" | "\u2029" | "\r\n"
comment = multiLineComment | singleLineComment
// NOTE(review): the JFlex lexer delimits block comments with ># and #<,
// not >####< and <####> -- confirm which form is intended.
multiLineComment = ">####<" (~"<####>" sourceCharacter)* "<####>"
// Runs from "#" up to, but not including, the line terminator.
singleLineComment = "#" (~lineTerminator sourceCharacter)*
// An optional "@" prefix, then a name that is not a reserved word.
identifier (an identifier) = "@"? ~reservedWord identifierName
identifierName = identifierStart identifierPart*
identifierStart = letter | "$" | "_"
| "\\" unicodeEscapeSequence -- escaped
// Everything allowed after the first character of a name.
identifierPart = identifierStart | unicodeCombiningMark
| unicodeDigit | unicodeConnectorPunctuation
| "\u200C" | "\u200D"
// Extend Ohm's built-in letter rule with the Unicode "Nl" (letter number)
// ranges below. The rule name was missing in front of the range list, which
// left a bare "=" body and an undefined unicodeCategoryNl.
letter += unicodeCategoryNl
unicodeCategoryNl
  = "\u2160".."\u2182" | "\u3007" | "\u3021".."\u3029"
unicodeDigit (a digit)
= "\u0030".."\u0039" | "\u0660".."\u0669" | "\u06F0".."\u06F9" | "\u0966".."\u096F" | "\u09E6".."\u09EF" | "\u0A66".."\u0A6F" | "\u0AE6".."\u0AEF" | "\u0B66".."\u0B6F" | "\u0BE7".."\u0BEF" | "\u0C66".."\u0C6F" | "\u0CE6".."\u0CEF" | "\u0D66".."\u0D6F" | "\u0E50".."\u0E59" | "\u0ED0".."\u0ED9" | "\u0F20".."\u0F29" | "\uFF10".."\uFF19"
unicodeCombiningMark (a Unicode combining mark)
= "\u0300".."\u0345" | "\u0360".."\u0361" | "\u0483".."\u0486" | "\u0591".."\u05A1" | "\u05A3".."\u05B9" | "\u05BB".."\u05BD" | "\u05BF".."\u05BF" | "\u05C1".."\u05C2" | "\u05C4".."\u05C4" | "\u064B".."\u0652" | "\u0670".."\u0670" | "\u06D6".."\u06DC" | "\u06DF".."\u06E4" | "\u06E7".."\u06E8" | "\u06EA".."\u06ED" | "\u0901".."\u0902" | "\u093C".."\u093C" | "\u0941".."\u0948" | "\u094D".."\u094D" | "\u0951".."\u0954" | "\u0962".."\u0963" | "\u0981".."\u0981" | "\u09BC".."\u09BC" | "\u09C1".."\u09C4" | "\u09CD".."\u09CD" | "\u09E2".."\u09E3" | "\u0A02".."\u0A02" | "\u0A3C".."\u0A3C" | "\u0A41".."\u0A42" | "\u0A47".."\u0A48" | "\u0A4B".."\u0A4D" | "\u0A70".."\u0A71" | "\u0A81".."\u0A82" | "\u0ABC".."\u0ABC" | "\u0AC1".."\u0AC5" | "\u0AC7".."\u0AC8" | "\u0ACD".."\u0ACD" | "\u0B01".."\u0B01" | "\u0B3C".."\u0B3C" | "\u0B3F".."\u0B3F" | "\u0B41".."\u0B43" | "\u0B4D".."\u0B4D" | "\u0B56".."\u0B56" | "\u0B82".."\u0B82" | "\u0BC0".."\u0BC0" | "\u0BCD".."\u0BCD" | "\u0C3E".."\u0C40" | "\u0C46".."\u0C48" | "\u0C4A".."\u0C4D" | "\u0C55".."\u0C56" | "\u0CBF".."\u0CBF" | "\u0CC6".."\u0CC6" | "\u0CCC".."\u0CCD" | "\u0D41".."\u0D43" | "\u0D4D".."\u0D4D" | "\u0E31".."\u0E31" | "\u0E34".."\u0E3A" | "\u0E47".."\u0E4E" | "\u0EB1".."\u0EB1" | "\u0EB4".."\u0EB9" | "\u0EBB".."\u0EBC" | "\u0EC8".."\u0ECD" | "\u0F18".."\u0F19" | "\u0F35".."\u0F35" | "\u0F37".."\u0F37" | "\u0F39".."\u0F39" | "\u0F71".."\u0F7E" | "\u0F80".."\u0F84" | "\u0F86".."\u0F87" | "\u0F90".."\u0F95" | "\u0F97".."\u0F97" | "\u0F99".."\u0FAD" | "\u0FB1".."\u0FB7" | "\u0FB9".."\u0FB9" | "\u20D0".."\u20DC" | "\u20E1".."\u20E1" | "\u302A".."\u302F" | "\u3099".."\u309A" | "\uFB1E".."\uFB1E" | "\uFE20".."\uFE23"
unicodeConnectorPunctuation = "\u005F" | "\u203F".."\u2040" | "\u30FB" | "\uFE33".."\uFE34" | "\uFE4D".."\uFE4F" | "\uFF3F" | "\uFF65"
unicodeSpaceSeparator = "\u2000".."\u200B" | "\u3000"
// Words that may not be used as identifiers.
reservedWord = keyword | nullLiteral | booleanLiteral
// Note: keywords that are the complete prefix of another keyword should
// be prioritized (e.g. 'in' should come before 'instanceof')
// Each alternative is a token rule that rejects a following identifierPart,
// so a keyword never matches the start of a longer name.
keyword = break | do | scope | in
| to | else | elif | if
| as | next | return | endKeyword
| or | for | and | while
| require | def | import
// Note: Punctuator and DivPunctuator (see ECMAScript 5.1, section 7.7) are
// not currently used by this grammar.
literal = nullLiteral | booleanLiteral | numericLiteral | stringLiteral
// ~identifierPart keeps "nil", "true" and "false" from matching the start of a longer name.
nullLiteral = "nil" ~identifierPart
booleanLiteral = ("true" | "false") ~identifierPart
// For semantics on how decimal literals are constructed, see section 7.8.3
// Note that the ordering of hexIntegerLiteral and decimalLiteral is reversed w.r.t. the spec
// This is intentional: the order decimalLiteral | hexIntegerLiteral will parse
// "0x..." as a decimal literal "0" followed by "x..."
// octalIntegerLiteral comes first as well, before decimalLiteral can take the leading "0".
numericLiteral = octalIntegerLiteral | hexIntegerLiteral | decimalLiteral
decimalLiteral = decimalIntegerLiteral "." decimalDigit* exponentPart -- bothParts
| "." decimalDigit+ exponentPart -- decimalsOnly
| decimalIntegerLiteral exponentPart -- integerOnly
decimalIntegerLiteral = nonZeroDigit decimalDigit* -- nonZero
| "0" -- zero
decimalDigit = "0".."9"
nonZeroDigit = "1".."9"
// The "absent" case matches the empty string, which makes the exponent optional.
exponentPart = exponentIndicator signedInteger -- present
| -- absent
exponentIndicator = "e" | "E"
// NOTE(review): the signed forms use decimalDigit*, so a sign with no digits is accepted.
signedInteger = "+" decimalDigit* -- positive
| "-" decimalDigit* -- negative
| decimalDigit+ -- noSign
hexIntegerLiteral = "0x" hexDigit+
| "0X" hexDigit+
// hexDigit defined in Ohm's built-in rules (otherwise: hexDigit = "0".."9" | "a".."f" | "A".."F")
octalIntegerLiteral = "0" octalDigit+
octalDigit = "0".."7"
// For semantics on how string literals are constructed, see section 7.8.4
// Double- and single-quoted strings accept the same escapes; only the
// quote character that must be escaped differs.
stringLiteral = "\"" doubleStringCharacter* "\""
| "'" singleStringCharacter* "'"
doubleStringCharacter = ~("\"" | "\\" | lineTerminator) sourceCharacter -- nonEscaped
| "\\" escapeSequence -- escaped
| lineContinuation -- lineContinuation
singleStringCharacter = ~("'" | "\\" | lineTerminator) sourceCharacter -- nonEscaped
| "\\" escapeSequence -- escaped
| lineContinuation -- lineContinuation
// A backslash directly before a line break.
lineContinuation = "\\" lineTerminatorSequence
escapeSequence = unicodeEscapeSequence
| hexEscapeSequence
| octalEscapeSequence
| characterEscapeSequence // Must come last.
characterEscapeSequence = singleEscapeCharacter
| nonEscapeCharacter
singleEscapeCharacter = "'" | "\"" | "\\" | "b" | "f" | "n" | "r" | "t" | "v"
// Any other character except a line terminator or one that starts another escape form.
nonEscapeCharacter = ~(escapeCharacter | lineTerminator) sourceCharacter
escapeCharacter = singleEscapeCharacter | decimalDigit | "x" | "u"
// One to three octal digits; the ~decimalDigit lookaheads stop the shorter
// forms from matching when another digit follows.
octalEscapeSequence = zeroToThree octalDigit octalDigit -- whole
| fourToSeven octalDigit -- eightTimesfourToSeven
| zeroToThree octalDigit ~decimalDigit -- eightTimesZeroToThree
| octalDigit ~decimalDigit -- octal
hexEscapeSequence = "x" hexDigit hexDigit
unicodeEscapeSequence = "u" hexDigit hexDigit hexDigit hexDigit
zeroToThree = "0".."3"
fourToSeven = "4".."7"
// === Implementation-level rules (not part of the spec) ===
// A semicolon is "automatically inserted" if a newline or the end of the input stream is
// reached, or the offending token is "}".
// See ECMAScript 5.1, section 7.9 (Automatic Semicolon Insertion), for more information.
// NOTE: Applications of this rule *must* appear in a lexical context -- either in the body of a
// lexical rule, or inside `#()`.
// Statement terminator: an explicit ";", the end of the input (Ohm's
// built-in end rule), a line terminator, or a comment.
sc = ";" | end | lineTerminator | comment
// Convenience rules for parsing keyword tokens.
// Each one rejects a following identifierPart, so that e.g. "do" does not
// match the first two characters of a longer name.
break = "break" ~identifierPart
do = "do" ~identifierPart
scope = "scope" ~identifierPart
in = "in" ~identifierPart
else = "else" ~identifierPart
elif = "elif" ~identifierPart
if = "if" ~identifierPart
as = "as" ~identifierPart
next = "next" ~identifierPart
return = "return" ~identifierPart
// Named endKeyword rather than end; sc above already uses end for end of input.
endKeyword = "end" ~identifierPart
or = "or" ~identifierPart
for = "for" ~identifierPart
and = "and" ~identifierPart
while = "while" ~identifierPart
require = "require" ~identifierPart
def = "def" ~identifierPart
import = "import" ~identifierPart
to = "to" ~identifierPart
// end of modified javascript lexical rules
// start of expressions
// lite operator precedence
// | or in
// & and
// < > <= >= != == !== ===
// <<
// to
// + -
// * / %
// ** :: as
// Unary- ! ++ -- .
// left recursion
Exp
  = OrExp

// Each level below is left-recursive on itself and falls through to the
// next, tighter-binding level in its last alternative.
OrExp
  = OrExp "|" AndExp -- or
  | OrExp or AndExp -- orKeyword
  | OrExp in AndExp -- in
  | AndExp

AndExp
  = AndExp "&" RelationExp -- and
  | AndExp and RelationExp -- andKeyword
  | RelationExp

RelationExp
  = RelationExp "<" ShiftExp -- lessThan
  | RelationExp ">" ShiftExp -- greaterThan
  | RelationExp "<=" ShiftExp -- lessEqual
  | RelationExp ">=" ShiftExp -- greaterEqual
  | RelationExp "!=" ShiftExp -- notEqual
  | RelationExp "==" ShiftExp -- equal
  | RelationExp "!==" ShiftExp -- notFullEqual
  | RelationExp "===" ShiftExp -- fullEqual
  | ShiftExp

ShiftExp
  = ShiftExp "<<" RangeExp -- shift
  | RangeExp

RangeExp
  = RangeExp to AddExp -- range
  | AddExp

AddExp
  = AddExp "+" MulExp -- plus
  | AddExp "-" MulExp -- minus
  | MulExp

MulExp
  = MulExp "*" ExpExp -- times
  | MulExp "/" ExpExp -- divide
  | MulExp "%" ExpExp -- remainder
  | ExpExp

ExpExp
  = ExpExp "**" ExpExp -- power
  | ExpExp "::" identifier -- square
  | ExpExp as identifier -- as
  | PriExp

PriExp
  = "(" Exp ")" -- paren
  | "-" PriExp -- neg
  | "!" PriExp -- not
  | identifier "++" -- inc
  | identifier "--" -- dec
  | literal -- literal
  | Call -- callExp
  | LiteExpr -- liteExp

// Lite-specific primary expressions: lists, tables and blocks.
LiteExpr
  = List | Table | BraceBlock | DoBlock

// Separator between list items and parameter names.
Divider
  = (", " | " " | ",")

List
  = "[" ExpList "]" -- simpleList
  | ":[" (~"]" sourceCharacter)* "]" -- wordList

ExpList
  = (Exp Divider?)*

Table
  = "{" KvList "}"

KvList
  = (identifier ":" Exp ("," | "\n")?)*

// callEasy has a known bug; help wanted.
Call
  = Call "(" ExpList ")" -- call
  | Call "." identifier -- callIndex
  | Call "[" Exp "]" -- justIndex
  | Call ExpList -- callEasy
  | identifier ~"=" -- justIdentifier

BraceBlock
  = "{" NameListB? (SimpleStatement ":"?)* "}"

// Parameter names of a def, with optional surrounding parentheses.
NameList
  = "("? (identifier Divider?)* ")"?

// Parameter names of a block, written between vertical bars.
NameListB
  = "|" (identifier Divider?)* "|"

DoBlock
  = do NameListB? Block

// end Exp part

SimpleStatement
  = Exp -- expressionStatement
  | Break -- break
  | Next -- continue
  | Import -- import
  | Require -- require
  | Return -- return
  | Assign -- assignment
  | IndexEq -- indexLet
  | Arrow -- arrowLet

Break
  = break

Next
  = next

// The rest of the line after the keyword is taken verbatim.
Import
  = import (~lineTerminator sourceCharacter)*

Require
  = require (~lineTerminator sourceCharacter)*

Return
  = return Exp?

Assign
  = identifier "=" Exp

IndexEq
  = Exp "[" Exp "]" "=" Exp

Arrow
  = Exp "->" identifier Exp

Statement
  = SimpleStatement -- simpleStatement
  | Def -- defineMethod
  | For -- forLoop
  | While -- whileLoop
  | Scope -- scope
  | If -- controlFlow
  | "\n" -- nop

Def
  = def identifier sc Block -- defEasy
  | def identifier sc Exp sc -- defExpr
  | def identifier NameList sc Block -- def

For
  = for identifier in Exp sc Block

While
  = while Exp sc Block

Scope
  = scope identifier? sc Block

If
  = if Exp sc Block -- simpleEnd
  | if Exp sc CompStmt else Block -- ifElse
  | if Exp sc CompStmt (elif Exp sc CompStmt)* (else CompStmt)? endKeyword -- ifElif

// A compound statement closed by the "end" keyword.
Block
  = CompStmt endKeyword

// Zero or more statements, each optionally followed by a terminator.
CompStmt
  = (Statement sc?)*
}
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Copyright (C) 1998-2015 Gerwin Klein <> *
* All rights reserved. *
* *
* License: BSD *
* *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
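/* Section 3.3 of the Java Language Specification :

   UnicodeInputCharacter:
           UnicodeEscape
           RawInputCharacter

   UnicodeEscape:
           \ UnicodeMarker HexDigit HexDigit HexDigit HexDigit

   UnicodeMarker:
           u
           UnicodeMarker u

   RawInputCharacter:
           any Unicode character

   HexDigit: one of
           0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F

   A '\' is eligible to start a Unicode escape sequence only when it is
   preceded by an even number of other '\' characters.
*/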
import java.io.FilterReader;
import java.io.IOException;

%%

%class UnicodeEscapes
%extends FilterReader
/* read() must return int; without %int the scanning method would be
   declared to return Yytoken. */
%int
%function read
%unicode

/* FilterReader has no no-argument constructor, so the generated constructor
   has to pass its reader on to the superclass. */
%init{
  super(in);
%init}

/* One or more "u" characters followed by exactly four hex digits. */
UnicodeEscape = {UnicodeMarker} {HexDigit} {4}
UnicodeMarker = "u"+
HexDigit = [0-9a-fA-F]

/* Entered after a backslash that starts a Unicode escape. */
%state DIGITS
%{
  /**
   * Backslash parity, maintained by the scanner rules: it is toggled for a
   * backslash followed by another backslash, and while it is set a
   * backslash followed by "u" is passed through literally instead of
   * starting an escape.
   */
  private boolean even;

  /**
   * Decodes the four hex digits that end the current match.
   *
   * @return the value of the last four matched characters, read as a
   *         hexadecimal number
   */
  private int value() {
    int r = 0;
    for (int k = zzMarkedPos - 4; k < zzMarkedPos; k++) {
      int c = zzBuffer[k];
      // Exactly one branch applies per digit. Falling through to the '0'
      // subtraction after a letter branch would corrupt a-f and A-F.
      if (c >= 'a') {
        c -= 'a' - 10;
      } else if (c >= 'A') {
        c -= 'A' - 10;
      } else {
        c -= '0';
      }
      r <<= 4;
      r += c;
    }
    return r;
  }

  /**
   * Fills part of a buffer with decoded characters, one read() call each.
   *
   * @param cbuf destination buffer
   * @param off  index of the first element to fill
   * @param len  maximum number of characters to read
   * @return the number of characters stored, or -1 if ready() reports that
   *         nothing can be read
   * @throws IOException if the underlying read fails
   */
  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    if (!ready()) {
      return -1;
    }
    // From here on len is the exclusive end index, not a count.
    len += off;
    for (int i = off; i < len; i++) {
      int c = read();
      if (c < 0) {
        return i - off;
      }
      cbuf[i] = (char) c;
    }
    return len - off;
  }

  /** Reports that mark/reset is not supported. */
  @Override
  public boolean markSupported() {
    return false;
  }

  /**
   * Reports whether input is available: the scanner is not at end of file
   * and either has buffered characters left or the underlying reader is
   * ready.
   */
  @Override
  public boolean ready() throws IOException {
    return !zzAtEOF && (zzCurrentPos < zzEndRead || zzReader.ready());
  }
%}
%%

<YYINITIAL> {
  /* A backslash not followed by another backslash or by "u". */
  \\        { even = false; return '\\'; }
  /* A backslash followed by a backslash: pass it through and flip the parity. */
  \\ / \\   { even = !even; return '\\'; }
  /* A backslash followed by "u" starts an escape unless the parity flag is
     set. When it does, nothing is returned here: scanning continues in
     DIGITS, which returns the decoded character. */
  \\ / "u"  {
              if (even) {
                even = false;
                return '\\';
              } else {
                yybegin(DIGITS);
              }
            }
  /* Every other character is passed through unchanged. */
  [^]       { return zzBuffer[zzStartRead]; }
  <<EOF>>   { return -1; }
}

<DIGITS> {
  {UnicodeEscape} { yybegin(YYINITIAL); return value(); }
  [^]             { throw new Error("Incorrect Unicode escape"); }
  <<EOF>>         { throw new Error("EOF in Unicode escape"); }
}
