Snip & snap extracts from our major JISON grammar file, showcasing 'code sections' a la BISON plus a few other bits & tricks. Note the `%{ ... %}` sections, which are JISON's 'code sections'. Also note the code following the last `%%` marker: that is another 'code section', and the most important one.
%options ranges
%options backtrack_lexer
/*
 * lexical grammar
 * ===============
 *
 * This section defines the lexer rules for our formula parser. The rules are checked from top to bottom, so their order is important
 * here!
 *
 * [...]
 */
%lex
/*
 * Remember that in `jison`, when `lexer.options.flex` has not been set (i.e. we get the default behaviour),
 * we get a hit on the first matching regex, so the order of the tokenization regexes below is
 * very important.
 *
 * `options.flex` would perform an exhaustive scan of all regexes, thus trying to find the
 * longest match every time. We do not want that in our lexical scanner!
 */
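/*
 * A quick illustration of the difference (the rules below are hypothetical,
 * for demonstration only): given first-match behaviour and the rule order
 *
 *     "="     return 'ASSIGN';
 *     "=="    return 'EQUALS';
 *
 * the input `==` lexes as two ASSIGN tokens, since the `"="` rule wins on the
 * first pass. With `%options flex` (longest match wins), the same input would
 * lex as a single EQUALS token regardless of rule order.
 */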
%{
/*
 * This chunk is included in the lexer action code at the very start of that method.
 *
 * `YY_START` is defined here; `YYSTATE` is not! `yy` and `yy_` are also available here.
 */
var s, s2, s3;
var rv, rv2, e_offset, col, row, len, value;
var match, match2;
console.log("lexer action: ", yy, yy_, this, yytext, YY_START, $avoiding_name_collisions);
var parser = yy.parser;
%}
/*
 * WARNING
 * -------
 *
 * When you use these regex 'macros' below, be aware that JISON wraps them in (...) parentheses
 * to ensure they always act as a single element.
 *
 * Hence, for example, JISON transforms the lexer regex
 *
 *     ({ID}(\.{ID})*)(\s*\()
 *
 * into this JS regex
 *
 *     /^(?:(([a-zA-Z_][a-zA-Z0-9_]*)(\.([a-zA-Z_][a-zA-Z0-9_]*))*)(\s*\())/
 *
 * which will return more `matches[]` elements than you would expect from the lexer regex itself,
 * as the regex element
 *
 *     {ID}
 *
 * itself expands to a (...)-surrounded regex element
 *
 *     ([a-zA-Z_][a-zA-Z0-9_]*)
 *
 * therefore placing the part matching
 *
 *     (\s*\()
 *
 * at `matches[]` index `[5]` rather than the originally expected `[3]`, so that input
 *
 *     MIN(x, y)
 *
 * will have the example regex match the part
 *
 *     MIN(
 *
 * as intended, while producing a `this.matches[]` array with the following content:
 *
 *     this.matches = [
 *         "MIN(",
 *         "MIN",
 *         "MIN",
 *         undefined,
 *         undefined,
 *         "("
 *     ]
 *
 * (note the `undefined` entries at `[3]` and `[4]` in there!), while input
 *
 *     A.B.C(x)
 *
 * will have the example regex match the part
 *
 *     A.B.C(
 *
 * as intended, while producing a `this.matches[]` array with the following content:
 *
 *     this.matches = ["A.B.C(", "A.B.C", "A", ".C", "C", "("]
 */
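/*
 * You can verify the above in a Node REPL (a quick sanity check, not part of
 * the grammar itself):
 *
 *     var re = /^(?:(([a-zA-Z_][a-zA-Z0-9_]*)(\.([a-zA-Z_][a-zA-Z0-9_]*))*)(\s*\())/;
 *     re.exec("MIN(x, y)");   // -> [ "MIN(", "MIN", "MIN", undefined, undefined, "(" ]
 *     re.exec("A.B.C(x)");    // -> [ "A.B.C(", "A.B.C", "A", ".C", "C", "(" ]
 */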
ID [a-zA-Z_][a-zA-Z0-9_]*
DOTTED_ID [a-zA-Z_]([a-zA-Z0-9_.]*[a-zA-Z0-9_])?
%%
// Recognize any function ID, with optional dotted sections, as a string followed by a `(` open parenthesis, e.g. `A.DIST(`
{DOTTED_ID}(\s*\()
%{
/*
 * lookup this blurb: it MUST be a (possibly namespaced) function identifier
 * (e.g. `SUM`, `namespace.user_defined_function42`).
 *
 * Note that this is really another kind of lexical hack, just not the well-known
 * `yacc` / `lex` 'feedback' one, as here we include a part of the GRAMMAR KNOWLEDGE
 * in the lexer itself:
 *
 * since we 'know' now that the blurb `\1` is followed by an open parenthesis `(`, we
 * can be certain that this is a function identifier and nothing else
 * that may have the same 'name', e.g. constant `E` or `PI`.
 *
 * > ### Note
 * >
 * > the parentheses in the regex are there so we can easily grab that bit,
 * > and in particular that very last bit: it will ALWAYS be pushed back
 * > into the lexer queue as that bit is our 'additional look-ahead' at
 * > work!
 */
console.log("looking up function identifier token (+ look-ahead) in symbol table: ", yytext, this, this.matches);
/*
 * **WARNING**: take heed of the comment further above regarding the `ID` etc.
 * lexer regex 'macros' and JISON's behaviour regarding those!
 *
 * Hence we should be able to pick up the `(` at the end at `this.matches[3]`!
 */
this.unput(this.matches[3]);
s = this.matches[1];
rv = parser.getSymbol4Function(s);
if (rv) {
    yytext = (new ASTopcode(rv))
        .setLocationInfo(yylloc)
        .setCommentsIndex(parser.getNextCommentIndex())
        .setLexedText(s);
    // ASTopcode(FKW_FUNCTION | ...)
    return 'FUNCTION';
}
// when we get here, the blurb didn't match anything sensible...
yytext = (new ASTerror(FERR_EXPECTED_FUNCTION_NAME, "Expected a (possibly namespaced) function name."))
    .setLocationInfo(yylloc)
    .setCommentsIndex(parser.getNextCommentIndex())
    .setLexedText(s);
return 'error';
%}
[...]
"||"
%{
yytext = (new ASTopcode(FKW_BOOLEAN_OR_OPERATOR | FT_BOOLEAN | FU_DERIVED))
    .setLocationInfo(yylloc)
    .setCommentsIndex(parser.getNextCommentIndex())
    .setLexedText(yytext);
return 'BOOLEAN_OR_OPERATOR';
%}
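/*
 * Aside: the FKW_* / FT_* / FU_* constants are OR-ed together throughout this
 * file, which assumes they occupy disjoint bit ranges so that one opcode word
 * can carry keyword, type and unit information at once. The values below are
 * hypothetical, purely to illustrate the masking scheme:
 *
 *     var FKW_BOOLEAN_OR_OPERATOR = 0x0001;
 *     var FT_BOOLEAN              = 0x0100;
 *     var FU_DERIVED              = 0x1000;
 *     var opcode = FKW_BOOLEAN_OR_OPERATOR | FT_BOOLEAN | FU_DERIVED;  // 0x1101
 *     (opcode & FT_BOOLEAN) !== 0;    // -> true: test the type via bit masking
 */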
[...]
"\u201c"([^\u201d]*)"\u201d"
%{ /* “string” */
s = this.matches[1];
yytext = (new ASTvalue(s, FKW_VALUE | FT_STRING | FU_STRING))
    .setNotationAttributes(FKA_DELIMITERS_201C)
    .setLocationInfo(yylloc)
    .setCommentsIndex(parser.getNextCommentIndex());
return 'STRING';
%}
[...]
/*
 * Any input which starts with a string marker is assumed to be a string entirely.
 * Hence these two full-line regexes must come before the 'detect a string anywhere
 * in the input' regexes which follow them. Those latter regexes help us parse
 * statements like
 *
 *     'CONCAT("THE YEAR", " ", "2013")'
 *
 *
 * Regex notes
 * -----------
 *
 * The expression
 *
 *     (.*?)
 *
 * is written like that, i.e. as a NON-greedy regex atom, to ensure that the
 * optional `'?` / `"?` following it is actually filled when the string terminates
 * with such a quote. Had the `.*` expression been greedy, the regex engine
 * would legally ignore the following `'?` / `"?` completely, as those quotes
 * would already have matched the preceding `.*`, while still producing a legal
 * match for the quoted string: e.g. `'hello world'` would then produce
 *
 *     \1 == "hello world'"     (note the trailing quote)
 *
 * while we want the regex to 'strip' the outer quotes, if there are any.
 */
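/*
 * A quick Node check of the non-greedy behaviour described above:
 *
 *     /^'(.*)'?$/.exec("'hello world'")[1];    // -> "hello world'"  (greedy: trailing quote swallowed)
 *     /^'(.*?)'?$/.exec("'hello world'")[1];   // -> "hello world"   (lazy: trailing quote stripped)
 */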
"'"(.*?)"'"?$ | |
%{ | |
s = this.matches[1]; | |
s2 = parser.dedupQuotedString(s, "'"); | |
yytext = (new ASTvalue(s2, FKW_VALUE | FT_STRING | FU_STRING)) | |
.setNotationAttributes(FKA_DELIMITERS_SINGLEQUOTE) | |
.setLocationInfo(yylloc) | |
.setCommentsIndex(parser.getNextCommentIndex()); | |
return 'STRING'; | |
%} | |
'"'(.*?)'"'?$ | |
%{ | |
s = this.matches[1]; | |
s2 = parser.dedupQuotedString(s, '"'); | |
yytext = (new ASTvalue(s2, FKW_VALUE | FT_STRING | FU_STRING)) | |
.setNotationAttributes(FKA_DELIMITERS_DOUBLEQUOTE) | |
.setLocationInfo(yylloc) | |
.setCommentsIndex(parser.getNextCommentIndex()); | |
return 'STRING'; | |
%} | |
"'"([^']*("''"[^']*)*)"'" | |
%{ | |
s = this.matches[1]; | |
s2 = parser.dedupQuotedString(s, "'"); | |
yytext = (new ASTvalue(s2, FKW_VALUE | FT_STRING | FU_STRING)) | |
.setNotationAttributes(FKA_DELIMITERS_SINGLEQUOTE) | |
.setLocationInfo(yylloc) | |
.setCommentsIndex(parser.getNextCommentIndex()); | |
return 'STRING'; | |
%} | |
'"'([^"]*('""'[^"]*)*)'"' | |
%{ | |
s = this.matches[1]; | |
s2 = parser.dedupQuotedString(s, '"'); | |
yytext = (new ASTvalue(s2, FKW_VALUE | FT_STRING | FU_STRING)) | |
.setNotationAttributes(FKA_DELIMITERS_DOUBLEQUOTE) | |
.setLocationInfo(yylloc) | |
.setCommentsIndex(parser.getNextCommentIndex()); | |
return 'STRING'; | |
%} | |
\s+
/* skip whitespace */
<<EOF>>
%{
yytext = (new lexerToken(FKA_EOF))
    .setLocationInfo(yylloc)
    .setCommentsIndex(parser.getNextCommentIndex())
    .setLexedText(yytext);
return 'EOF';
%}
.
%{
yytext = (new ASTerror(FERR_UNSUPPORTED_INPUT, "Don't know what to do with this: it's unsupported input."))
    .setLocationInfo(yylloc)
    .setCommentsIndex(parser.getNextCommentIndex())
    .setLexedText(yytext);
return 'error';
%}
/lex
%token NUMBER INTEGER_NUMBER
%token STRING
%token TRUE FALSE
%token CONSTANT
/*
 * Functions all produce the `FUNCTION` lexer token:
 * parameter list validation is performed in the static analysis phase during parsing.
 */
%token FUNCTION
[...]
%start start_parsing
%{
/*
 * This chunk is included in the parser code, before the lexer definition section and after the parser has been defined.
 *
 * WARNING:
 *
 * Keep in mind that all the parser actions, which will execute inside the `parser.performAction()` function,
 * have a `this` pointing to `$$`.
 *
 * If you want to access the lexer and/or parser, these are accessible inside the parser rule action code via
 * the `yy.lexer` and `yy.parser` dereferences respectively.
 */
console.log("parser object definition: ", this);
%}
%% /* language grammar */
start_parsing
    : init_phase do_the_work EOF
        {
            if (typeof console !== 'undefined') {
                console.log($2);
            }
            return $2;
        }
    ;
init_phase
    : /* epsilon */
        {
            /*
             * The 'init phase' is always reduced for every parse invocation.
             *
             * At this point in time, nothing has happened yet: no token has
             * been lexed, no real statement has been parsed yet.
             *
             * The grammar has been constructed such that this rule can be
             * resolved without any look-ahead, thanks to a 'default action'.
             */
            //yy.lexer.options.backtrack_lexer = true;
            //yy.lexer.options.ranges = true; // required for the inline comments to work, as the start location is tracked via `yylloc.range[]`
            yy.lexer.options.inline_comment_mode = yy.inline_comment_mode || 0;
            // and make sure the comments store is prepped:
            yy.parser.clearComments();
            // and init the symbol tables if the caller didn't do so already (we do this to prevent avoidable crashes)
            if (typeof yy.parser._symbol2token_lookup_table === "undefined") {
                yy.parser.initSymbolTable([]);
            }
            [...]
        }
    ;
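/*
 * For reference, a minimal driver sketch (the module name and input are
 * illustrative only): the generated parser exposes `parser.parse()` plus the
 * shared `yy` object which the init_phase action above reads from:
 *
 *     var parser = require("./formula_parser").parser;
 *     parser.yy.inline_comment_mode = 1;    // picked up in init_phase above
 *     parser.initSymbolTable([]);           // optional: init_phase does this lazily anyway
 *     var ast = parser.parse('CONCAT("THE YEAR", " ", "2013")');
 */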
[...]
/*
 * And here endeth the parser proper
 * ---------------------------------
 *
 * This concludes the grammar rule definitions themselves.
 * What follows is a chunk of support code that JISON will include in the generated parser.
 */
%%
/*
 * This chunk is included in the parser object code,
 * following the 'init' code block that may be set in `%{ ... %}` at the top of this
 * grammar definition file.
 */
[...]
/*
 * Remove duplicated quotes (of the form `quote_str + quote_str`, e.g. `''` or `""`) from the input string.
 */
parser.dedupQuotedString = function(str, quote_str) {
    // NOTE: `String.replace()` with a string pattern only replaces the FIRST
    // occurrence, so we split/join instead to collapse EVERY doubled quote:
    return str.split(quote_str + quote_str).join(quote_str);
};
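// Example: dedupQuotedString("it''s a test", "'") -> "it's a test", matching
// the doubled-quote escape convention used by the string rules in the lexer
// section above.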
/*
 * Return the definition structure for the given symbol name, or FALSE when the symbol is unknown.
 *
 * The definition structure contains these elements:
 *
 * - token
 * - defined_value
 */
parser.getSymbol4Function = function(name) {
    name = name.toUpperCase();
    var rv = this._symbol2token_lookup_table.functions[name];
    return rv || false;
};
parser.getSymbol4DefinedConstant = function(name) {
    name = name.toUpperCase();
    var rv = this._symbol2token_lookup_table.constants[name];
    return rv || false;
};
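/*
 * Example lookups (the FKW_SUM value shown is hypothetical, for illustration
 * only; the real `defined_value` depends on the registered symbol table):
 *
 *     parser.getSymbol4Function("sum");         // -> { token: 'FUNCTION', defined_value: FKW_SUM }
 *     parser.getSymbol4Function("no_such_fn");  // -> false
 */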
[...]
/*
 * Initialize the symbol lookup tables.
 */
parser.initSymbolTable = function(custom_symbols) {
    this._symbol2token_lookup_table = {
        functions: {},      // hash table ~ dictionary
        constants: {},      // hash table ~ dictionary
        [...]
    };
    // first set up the default symbols: constants, etc.; once that is done, register the custom symbols:
    this.addSymbols(predefined_formula_constants, FSC_PREDEFINED_CONSTANT);
    this.addSymbols(predefined_formula_functions, FSC_FUNCTION);
    return this.addSymbols(custom_symbols);
};
/*
 * Register one or more symbols in the symbol lookup tables.
 */
parser.addSymbols = function(symbols, default_category) {
    [...]
    return this;
};
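/*
 * Example registration of a custom symbol (the object shape shown is an
 * assumption, extrapolated from the definition structure documented at
 * `getSymbol4Function()` above):
 *
 *     parser.addSymbols([
 *         { name: "TAU", token: 'CONSTANT', defined_value: 2 * Math.PI }
 *     ], FSC_PREDEFINED_CONSTANT);
 */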
[...]
/*
 * Clear / (re-)initialize the comments store.
 */
parser.clearComments = function() {
    this.comments = [];
    return this;
};
/*
 * Return the index of the next available slot in the comment store.
 *
 * Hence this returns 0 when the comment store is empty.
 */
parser.getNextCommentIndex = function() {
    var rv = this.comments;
    if (typeof rv !== "undefined" && rv.length > 0) {
        return rv.length;
    } else {
        return 0;
    }
};
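/*
 * Example (assuming `someCommentNode` is whatever node type the comment
 * collector stores):
 *
 *     parser.clearComments();
 *     parser.getNextCommentIndex();        // -> 0
 *     parser.comments.push(someCommentNode);
 *     parser.getNextCommentIndex();        // -> 1
 */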