Last active
July 1, 2021 15:33
-
-
Save erezsh/f0745a48f8a7f8a6b6375e798ec7dbfa to your computer and use it in GitHub Desktop.
WIP of Lark js (far from working yet)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Python-style range: range(end) or range(start, end).
 * Returns an array of consecutive integers [start, end).
 */
function range(start, end) {
  if (end === undefined) {
    // Single-argument form: range(n) === range(0, n)
    end = start;
    start = 0;
  }
  // `res` was an implicit global (ReferenceError in strict mode).
  const res = [];
  for (let i = start; i < end; i++) res.push(i);
  return res;
}
// Python-style %-formatting shim: each "%s" or "%r" is replaced, in order,
// by the next argument. Missing arguments become "undefined".
String.prototype.format = function () {
  const args = arguments;
  let next = 0;
  return this.replace(/%[sr]/g, () => args[next++]);
};
String.prototype.join = function(a) { return a.join(this) } | |
// Returns a new Set containing every element of `this` and of `setB`.
Set.prototype.union = function (setB) {
  const result = new Set(this);
  setB.forEach((item) => result.add(item));
  return result;
};
// Returns a new Set with the elements present in both `this` and `setB`.
Set.prototype.intersection = function (setB) {
  const common = new Set();
  setB.forEach((item) => {
    if (this.has(item)) {
      common.add(item);
    }
  });
  return common;
};
// Python-style obj.items() -> [[key, value], ...].
// Defined non-enumerably: a plain assignment to Object.prototype creates an
// enumerable property, which leaks "items" into every for...in loop in the
// entire program.
Object.defineProperty(Object.prototype, "items", {
  value: function () {
    return Object.entries(this);
  },
  writable: true,
  configurable: true,
  enumerable: false,
});
/**
 * Python-style dict(): builds a Map from either an existing Map (copied)
 * or a plain object's own enumerable entries.
 */
function dict(d) {
  return d instanceof Map ? new Map(d) : new Map(Object.entries(d));
}
// Python-style map.pop(key): removes the entry and returns its value
// (undefined when the key is absent).
Map.prototype.pop = function (key) {
  if (this.has(key)) {
    const value = this.get(key);
    this.delete(key);
    return value;
  }
  return undefined;
};
// Empty marker classes mirroring bases from the Python implementation.
class _Decoratable {}
class Serialize {}
class Str {}
class ABC {}
// Sentinel mirroring Python's NotImplemented. Was an implicit global
// (`NotImplemented = {}`), which throws in strict mode.
const NotImplemented = {};
// Minimal stand-in for Python's `re` module.
const re = {
  // Escape every regex metacharacter so the string matches literally.
  // See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#escaping
  escapeRegExp(text) {
    // "$&" inserts the whole matched character.
    return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  },
  // Compile a pattern; may throw re.error (SyntaxError) on a bad pattern.
  compile(pattern, flags) {
    return new RegExp(pattern, flags);
  },
  error: SyntaxError,
};
/**
 * Compile `regexp` with the given re-module shim and return the first match
 * in `s`, or undefined when there is no match.
 * Fixes: `m` was an implicit global, and RegExp objects have no `.match`
 * method (that threw a TypeError) — `.exec` is the JS equivalent.
 * NOTE(review): Python's re.match anchors at the start of the string; exec
 * searches anywhere — confirm which semantics callers expect.
 */
function _get_match(re_, regexp, s, flags) {
  const m = re_.compile(regexp, flags).exec(s);
  if (m != null) return m[0];
}
/**
 * Compiles a list of terminal definitions into a single sticky RegExp with
 * one named group per terminal, and matches it at exact input positions.
 * Fixes: removed the unused `mres` local and a leftover debug console.log,
 * and replaced the String.prototype.join shim with native Array#join.
 */
class Scanner {
  constructor({
    terminals,
    g_regex_flags,
    re_,
    use_bytes,
    match_whole = false,
  }) {
    this.terminals = terminals;
    this.g_regex_flags = g_regex_flags;
    this.re_ = re_;
    this.use_bytes = use_bytes;
    this.match_whole = match_whole;
    // Names of all terminals this scanner can produce.
    this.allowed_types = new Set(this.terminals.map((t) => t.name));
    this._regexp = this._build_mres(terminals);
  }
  _build_mres(terminals) {
    // One alternative per terminal; the named group tells us which one matched.
    let postfix = this.match_whole ? "$" : "";
    let pattern = terminals
      .map((t) => `(?<${t.name}>${t.pattern.to_regexp() + postfix})`)
      .join("|");
    // "y" (sticky): exec() matches exactly at lastIndex, like Python's
    // re.match at a position.
    return new RegExp(pattern, this.g_regex_flags + "y");
  }
  /**
   * Match at position `pos` in `text`.
   * Returns [matched_text, terminal_name], or undefined on no match.
   */
  match(text, pos) {
    this._regexp.lastIndex = pos;
    let m = this._regexp.exec(text);
    if (m) {
      // Find group. Ugly hack, but javascript is forcing my hand.
      let group = null;
      for (let [k, v] of Object.entries(m.groups)) {
        if (v) {
          group = k;
          break;
        }
      }
      return [m[0], group];
    }
  }
}
/** Base class for every error raised by Lark. */
class LarkError extends Error {}
/** Raised when Lark is configured with invalid options. */
class ConfigurationError extends LarkError {}
/**
 * Throws ConfigurationError unless `value` is one of `options`.
 * Fixes: `value in options` on an array checks indices ("0", "1", ...), not
 * membership — use includes() for arrays, `in` only for plain objects.
 */
function assert_config({
  value,
  options,
  msg = "Got %r, expected one of %s",
} = {}) {
  const ok = Array.isArray(options) ? options.includes(value) : value in options;
  if (!ok) {
    throw new ConfigurationError(msg.format(value, options));
  }
}
/** Raised when the grammar definition itself is invalid. */
class GrammarError extends LarkError {}
/** Raised by the parser when the input cannot be parsed. */
class ParseError extends LarkError {}
/** Raised by the lexer when the input cannot be tokenized. */
class LexError extends LarkError {}
class UnexpectedInput extends LarkError {
  /*
  UnexpectedInput Error.
  Used as a base class for the following exceptions:
  - ``UnexpectedToken``: The parser received an unexpected token
  - ``UnexpectedCharacters``: The lexer encountered an unexpected string
  After catching one of these exceptions, you may call the following helper methods to create a nicer error message.
  */
  // Overridden by subclasses that know the error's position in the input.
  get pos_in_stream() {
    return null;
  }
  // Overridden by subclasses that carry a {name: TerminalDef} lookup table.
  get _terminals_by_name() {
    return null;
  }
  get_context({ text, span = 40 } = {}) {
    let after,
      before; /*
    Returns a pretty string pinpointing the error in the text,
    with span amount of context characters around it.
    Note:
    The parser doesn't hold a copy of the text it has to parse,
    so you have to provide it again
    */
    console.assert(this.pos_in_stream != null, this);
    let pos = this.pos_in_stream;
    // NOTE(review): `max` is untranslated Python (should be Math.max) — throws.
    let start = max(pos - span, 0);
    let end = pos + span;
    // NOTE(review): `bytes`, `rsplit`, `expandtabs`, `" " * n`, `[-1]` indexing
    // and `.decode` below are all untranslated Python — this method cannot run
    // as written; verify against the finished port.
    if (!(text instanceof bytes)) {
      before = text.slice(start, pos).rsplit("\n", 1)[-1];
      after = text.slice(pos, end).split("\n", 1)[0];
      return before + after + "\n" + " " * before.expandtabs().length + "^\n";
    } else {
      before = text.slice(start, pos).rsplit("\n", 1)[-1];
      after = text.slice(pos, end).split("\n", 1)[0];
      return (
        before +
        after +
        "\n" +
        " " * before.expandtabs().length +
        "^\n"
      ).decode("ascii", "backslashreplace");
    }
  }
  match_examples({
    parse_fn,
    examples,
    token_type_match_fallback = false,
    use_accepts = false,
  } = {}) {
    /*
    Allows you to detect what's wrong in the input text by matching
    against example errors.
    Given a parser instance and a dictionary mapping some label with
    some malformed syntax examples, it'll return the label for the
    example that bests matches the current error. The function will
    iterate the dictionary until it finds a matching error, and
    return the corresponding value.
    For an example usage, see `examples/error_reporting_lalr.py`
    Parameters:
    parse_fn: parse function (usually ``lark_instance.parse``)
    examples: dictionary of ``{'example_string': value}``.
    use_accepts: Recommended to call this with ``use_accepts=True``.
    The default is ``False`` for backwards compatibility.
    */
    console.assert(this.state != null, "Not supported for this exception");
    // NOTE(review): `examples instanceof dict` is always false — `dict` is a
    // factory function here, and it returns a Map, not a `dict` instance.
    if (examples instanceof dict) {
      examples = examples.items();
    }
    // candidate = [label, is_token_type_fallback]
    let candidate = [null, false];
    // NOTE(review): `enumerate`, `hasattr` and `logger` are not defined in this
    // chunk — presumably supplied elsewhere in the file; verify before use.
    for (let [i, [label, example]] of enumerate(examples)) {
      console.assert(!(typeof example === "string"), "Expecting a list");
      for (let [j, malformed] of enumerate(example)) {
        try {
          parse_fn(malformed);
        } catch (e) {
          if (e instanceof UnexpectedInput) {
            // NOTE(review): `ut` is an undeclared (implicit-global) assignment,
            // which throws in strict mode.
            ut = e;
            if (ut.state === this.state) {
              if (
                use_accepts &&
                hasattr(this, "accepts") &&
                ut.accepts != this.accepts
              ) {
                logger.debug(
                  "Different accepts with same state[%d]: %s != %s at example [%s][%s]".format(
                    this.state,
                    this.accepts,
                    ut.accepts,
                    i,
                    j
                  )
                );
                continue;
              }
              // Guard chain approximating the Python try/AttributeError around
              // token access.
              if (
                this.token &&
                logger &&
                this &&
                ut &&
                ut.token &&
                this.token.type &&
                logger.debug &&
                this.token &&
                ut.token &&
                ut.token.type
              ) {
                if (ut.token === this.token) {
                  // Try exact match first
                  logger.debug("Exact Match at example [%s][%s]".format(i, j));
                  return label;
                }
                if (token_type_match_fallback) {
                  // Fallback to token types match
                  // NOTE(review): `candidate[-1]` is Python negative indexing —
                  // always undefined in JS (likely meant candidate[1]).
                  if (ut.token.type === this.token.type && !candidate[-1]) {
                    logger.debug(
                      "Token Type Fallback at example [%s][%s]".format(i, j)
                    );
                    candidate = [label, true];
                  }
                }
              } else {
                // pass
              }
              if (candidate[0] === null) {
                logger.debug(
                  "Same State match at example [%s][%s]".format(i, j)
                );
                candidate = [label, false];
              }
            }
          } else {
            throw e;
          }
        }
      }
    }
    return candidate[0];
  }
  // Renders the expected-terminals list, using user-friendly names when a
  // terminal table is available. Relies on the format/join prototype shims.
  _format_expected(expected) {
    let d;
    if (this._terminals_by_name) {
      d = this._terminals_by_name;
      expected = expected.map((t_name) =>
        t_name in d ? d[t_name].user_repr() : t_name
      );
    }
    return "Expected one of: \n\t* %s\n".format("\n\t* ".join(expected));
  }
}
class UnexpectedEOF extends ParseError {
  /*
  Raised when the parser runs out of input while still expecting tokens.
  NOTE(review): in the Python original this class also inherits from
  UnexpectedInput, which supplies `_format_expected`; that mixin is lost in
  this translation, so `__str__` would fail — verify against finished port.
  */
  constructor({ expected, state = null, terminals_by_name = null } = {}) {
    super();
    this.expected = expected;
    this.state = state;
    // NOTE(review): Token's constructor destructures an options object, so
    // these positional arguments are ignored — verify.
    this.token = new Token("<EOF>", ""); // , line=-1, column=-1, pos_in_stream=-1)
    this.pos_in_stream = -1;
    this.line = -1;
    this.column = -1;
    this._terminals_by_name = terminals_by_name;
  }
  // Python __str__ protocol residue; JS never calls this implicitly.
  __str__() {
    let message = "Unexpected end-of-input. ";
    message += this._format_expected(this.expected);
    return message;
  }
}
class UnexpectedCharacters extends LexError {
  /*
  Raised by the lexer when no terminal matches at the current position.
  NOTE(review): the Python original also inherits UnexpectedInput (which
  provides get_context / _format_expected); that mixin is lost here — verify.
  */
  constructor({
    seq,
    lex_pos,
    line,
    column,
    allowed = null,
    considered_tokens = null,
    state = null,
    token_history = null,
    terminals_by_name = null,
    considered_rules = null,
  } = {}) {
    super(); // TODO considered_tokens and allowed can be figured out using state
    this.line = line;
    this.column = column;
    this.pos_in_stream = lex_pos;
    this.state = state;
    this._terminals_by_name = terminals_by_name;
    this.allowed = allowed;
    this.considered_tokens = considered_tokens;
    this.considered_rules = considered_rules;
    this.token_history = token_history;
    // NOTE(review): `bytes` is not defined in JS — this check throws, and the
    // `.decode` call below is untranslated Python.
    if (seq instanceof bytes) {
      this.char = seq
        .slice(lex_pos, lex_pos + 1)
        .decode("ascii", "backslashreplace");
    } else {
      this.char = seq[lex_pos];
    }
    // NOTE(review): get_context destructures { text, span }; passing { seq }
    // leaves `text` undefined — likely meant { text: seq }. Verify.
    this._context = this.get_context({ seq });
  }
  // Python __str__ protocol residue; JS never calls this implicitly.
  __str__() {
    let message = "No terminal matches '%s' in the current parser context, at line %d col %d".format(
      this.char,
      this.line,
      this.column
    );
    message += "\n\n" + this._context;
    if (this.allowed) {
      message += this._format_expected(this.allowed);
    }
    if (this.token_history) {
      // NOTE(review): `repr` is not defined in JS.
      message += "\nPrevious tokens: %s\n".format(
        ", ".join(this.token_history.map((t) => repr(t)))
      );
    }
    return message;
  }
}
class UnexpectedToken extends ParseError {
  /*
  An exception that is raised by the parser, when the token it received
  doesn't match any valid step forward.
  The parser provides an interactive instance through `interactive_parser`,
  which is initialized to the point of failure, and can be used for debugging and error handling.
  see: ``InteractiveParser``.
  */
  constructor({
    token,
    expected,
    considered_rules = null,
    state = null,
    interactive_parser = null,
    terminals_by_name = null,
    token_history = null,
  } = {}) {
    super(); // TODO considered_rules and expected can be figured out using state
    // Explicit null checks instead of `||`: `||` clobbered legitimate falsy
    // values (a start_pos of 0 is a valid position, but `0 || null` is null).
    this.line = token["line"] != null ? token["line"] : "?";
    this.column = token["column"] != null ? token["column"] : "?";
    this.pos_in_stream = token["start_pos"] != null ? token["start_pos"] : null;
    this.state = state;
    this.token = token;
    this.expected = expected; // XXX deprecate? `accepts` is better
    // NOTE(review): the NO_VALUE sentinel is expected to be defined elsewhere
    // in the file — verify.
    this._accepts = NO_VALUE;
    this.considered_rules = considered_rules;
    this.interactive_parser = interactive_parser;
    this._terminals_by_name = terminals_by_name;
    this.token_history = token_history;
  }
  // Lazily ask the interactive parser which terminals would be accepted here.
  get accepts() {
    if (this._accepts === NO_VALUE) {
      this._accepts =
        this.interactive_parser && this.interactive_parser.accepts();
    }
    return this._accepts;
  }
  // Python __str__ protocol residue; JS never calls this implicitly.
  // NOTE(review): `_format_expected` lives on UnexpectedInput in Python; the
  // multiple-inheritance mixin is lost in this translation — verify.
  __str__() {
    let message = "Unexpected token %r at line %s, column %s.\n%s".format(
      this.token,
      this.line,
      this.column,
      this._format_expected(this.accepts || this.expected)
    );
    if (this.token_history) {
      message += "Previous tokens: %r\n".format(this.token_history);
    }
    return message;
  }
}
class VisitError extends LarkError {
  /*
  VisitError is raised when visitors are interrupted by an exception
  It provides the following attributes for inspection:
  - obj: the tree node or token it was processing when the exception was raised
  - orig_exc: the exception that cause it to fail
  */
  constructor(rule, obj, orig_exc) {
    // Build the message inline; relies on the String.prototype.format shim.
    super('Error trying to process rule "%s":\n\n%s'.format(rule, orig_exc));
    this.obj = obj;
    this.orig_exc = orig_exc;
  }
}
/** Positional metadata holder for a tree node; starts out empty. */
class Meta {
  constructor() {
    this.empty = true;
  }
}
/*
When raising the Discard exception in a transformer callback,
that node is discarded and won't appear in the parent.
*/
class Discard extends Error {}
class Transformer extends _Decoratable {
  /*
  Transformers visit each node of the tree, and run the appropriate method on it according to the node's data.
  Methods are provided by the user via inheritance, and called according to ``tree.data``.
  The returned value from each method replaces the node in the tree structure.
  Transformers work bottom-up (or depth-first), starting with the leaves and ending at the root of the tree.
  Transformers can be used to implement map & reduce patterns. Because nodes are reduced from leaf to root,
  at any point the callbacks may assume the children have already been transformed (if applicable).
  ``Transformer`` can do anything ``Visitor`` can do, but because it reconstructs the tree,
  it is slightly less efficient.
  All these classes implement the transformer interface:
  - ``Transformer`` - Recursively transforms the tree. This is the one you probably want.
  - ``Transformer_InPlace`` - Non-recursive. Changes the tree in-place instead of returning new instances
  - ``Transformer_InPlaceRecursive`` - Recursive. Changes the tree in-place instead of returning new instances
  Parameters:
  visit_tokens (bool, optional): Should the transformer visit tokens in addition to rules.
  Setting this to ``False`` is slightly faster. Defaults to ``True``.
  (For processing ignored tokens, use the ``lexer_callbacks`` options)
  NOTE: A transformer without methods essentially performs a non-memoized partial deepcopy.
  */
  constructor({ visit_tokens = true } = {}) {
    super();
    // The Python class attribute `__visit_tokens__ = True` had been translated
    // into a getter-only prototype accessor, which made this assignment throw
    // in strict mode; it is now a plain instance property (default true).
    this.__visit_tokens__ = visit_tokens;
  }
  // Dispatch to the user callback named after tree.data, or __default__.
  _call_userfunc({ tree, new_children = null } = {}) {
    let f, wrapper;
    // Assumes the children are already transformed.
    let children = new_children != null ? new_children : tree.children;
    if (tree && tree.data && this && this[tree.data]) {
      f = this[tree.data];
      try {
        wrapper = f["visit_wrapper"] || null;
        if (wrapper != null) {
          return f.visit_wrapper(f, tree.data, children, tree.meta);
        } else {
          // Bind `this` so user-defined callbacks can use the transformer.
          return f.call(this, children);
        }
      } catch (e) {
        // `e instanceof [GrammarError, Discard]` threw a TypeError; the Python
        // original is `except (GrammarError, Discard): raise` — re-raise them.
        if (e instanceof GrammarError || e instanceof Discard) {
          throw e;
        } else if (e instanceof Error) {
          throw new VisitError(tree.data, tree, e);
        } else {
          throw e;
        }
      }
    } else {
      return this.__default__(tree.data, children, tree.meta);
    }
  }
  // Dispatch to the user callback named after token.type, or __default_token__.
  _call_userfunc_token(token) {
    let f;
    if (token && token.type && this && this[token.type]) {
      f = this[token.type];
      try {
        return f.call(this, token);
      } catch (e) {
        if (e instanceof GrammarError || e instanceof Discard) {
          throw e;
        } else if (e instanceof Error) {
          throw new VisitError(token.type, token, e);
        } else {
          throw e;
        }
      }
    } else {
      return this.__default_token__(token);
    }
  }
  // Yields each child transformed; children whose callback raises Discard
  // are dropped.
  *_transform_children(children) {
    for (let c of children) {
      try {
        if (c instanceof Tree) {
          yield this._transform_tree(c);
        } else if (this.__visit_tokens__ && c instanceof Token) {
          yield this._call_userfunc_token(c);
        } else {
          yield c;
        }
      } catch (e) {
        if (e instanceof Discard) {
          // pass
        } else {
          throw e;
        }
      }
    }
  }
  _transform_tree(tree) {
    // `new Array.from(...)` was a TypeError (Array.from is not a constructor),
    // and the transformed children must be passed under the `new_children` key.
    let children = Array.from(this._transform_children(tree.children));
    return this._call_userfunc({ tree, new_children: children });
  }
  transform(tree) {
    /*
    Transform the given tree, and return the final result
    */
    return this._transform_tree(tree);
  }
  __mul__(other) {
    /*
    Chain two transformers together, returning a new transformer.
    */
    return new TransformerChain(this, other);
  }
  __default__(data, children, meta) {
    /*
    Default function that is called if there is no attribute matching ``data``
    Can be overridden. Defaults to creating a new copy of the tree node (i.e. ``return Tree(data, children, meta)``)
    */
    return new Tree(data, children, meta);
  }
  __default_token__(token) {
    /*
    Default function that is called if there is no attribute matching ``token.type``
    Can be overridden. Defaults to returning the token as-is.
    */
    return token;
  }
}
class Transformer_InPlace extends Transformer {
  /*
  Same as Transformer, but non-recursive, and changes the tree in-place instead of returning new instances
  Useful for huge trees. Conservative in memory.
  */
  _transform_tree(tree) {
    // Cancel recursion: children are rewritten by transform() below.
    return this._call_userfunc({ tree });
  }
  transform(tree) {
    for (let subtree of tree.iter_subtrees()) {
      // `new Array.from(...)` was a TypeError — Array.from is not a constructor.
      subtree.children = Array.from(
        this._transform_children(subtree.children)
      );
    }
    return this._transform_tree(tree);
  }
}
class Transformer_NonRecursive extends Transformer {
  /*
  Same as Transformer but non-recursive.
  Like Transformer, it doesn't change the original tree.
  Useful for huge trees.
  */
  transform(tree) {
    let size, args, t;
    // Flatten the tree into reverse post-order (parents before their children).
    let rev_postfix = [];
    let q = [tree];
    // `while (q)` looped forever — a non-empty OR empty array is always truthy.
    while (q.length) {
      t = q.pop();
      rev_postfix.push(t);
      if (t instanceof Tree) {
        q.push(...t.children);
      }
    }
    // Rebuild bottom-up: walk the post-order, folding children off the stack.
    let stack = [];
    for (let x of rev_postfix.slice().reverse()) {
      if (x instanceof Tree) {
        size = x.children.length;
        if (size) {
          // splice (not slice) so the consumed children are removed from the stack.
          args = stack.splice(-size);
        } else {
          args = [];
        }
        // _call_userfunc takes { tree, new_children }; `{ x, args }` silently
        // passed the wrong keys.
        stack.push(this._call_userfunc({ tree: x, new_children: args }));
      } else if (this.__visit_tokens__ && x instanceof Token) {
        stack.push(this._call_userfunc_token(x));
      } else {
        stack.push(x);
      }
    }
    [t] = stack; // We should have only one tree remaining
    return t;
  }
}
class Transformer_InPlaceRecursive extends Transformer {
  /*
  Same as Transformer, recursive, but changes the tree in-place instead of returning new instances
  */
  _transform_tree(tree) {
    // `new Array.from(...)` was a TypeError — Array.from is not a constructor.
    tree.children = Array.from(this._transform_children(tree.children));
    // The base-class _call_userfunc takes an options object, not a bare tree.
    return this._call_userfunc({ tree });
  }
} // Visitors
/** Shared dispatch logic for the tree visitors below. */
class VisitorBase {
  _call_userfunc(tree) {
    // Use .call(this, ...) — the original unbound call lost `this`, so user
    // overrides could not access their own visitor instance.
    return (this[tree.data] || this.__default__).call(this, tree);
  }
  __default__(tree) {
    /*
    Default function that is called if there is no attribute matching ``tree.data``
    Can be overridden. Defaults to doing nothing.
    */
    return tree;
  }
  __class_getitem__(_) {
    // Python's class-subscription hook; meaningless in JS. Returned the
    // undefined name `cls` before — return the constructor instead.
    return this.constructor;
  }
}
class Visitor extends VisitorBase {
  /*
  Tree visitor, non-recursive (can handle huge trees).
  Visiting a node calls its methods (provided by the user via inheritance) according to ``tree.data``
  */
  visit(tree) {
    /*
    Visits the tree, starting with the leaves and finally the root (bottom-up)
    */
    for (const subtree of tree.iter_subtrees()) {
      this._call_userfunc(subtree);
    }
    return tree;
  }
  visit_topdown(tree) {
    /*
    Visit the tree, starting at the root, and ending at the leaves (top-down)
    */
    for (const subtree of tree.iter_subtrees_topdown()) {
      this._call_userfunc(subtree);
    }
    return tree;
  }
}
class Visitor_Recursive extends VisitorBase {
  /*
  Bottom-up visitor, recursive.
  Visiting a node calls its methods (provided by the user via inheritance) according to ``tree.data``
  Slightly faster than the non-recursive version.
  */
  visit(tree) {
    // Recurse into subtrees first, then handle this node (bottom-up).
    for (const child of tree.children) {
      if (child instanceof Tree) {
        this.visit(child);
      }
    }
    this._call_userfunc(tree);
    return tree;
  }
  visit_topdown(tree) {
    // Handle this node first, then recurse into subtrees (top-down).
    this._call_userfunc(tree);
    for (const child of tree.children) {
      if (child instanceof Tree) {
        this.visit_topdown(child);
      }
    }
    return tree;
  }
}
class Interpreter extends _Decoratable {
  /*
  Interpreter walks the tree starting at the root.
  Visits the tree, starting with the root and finally the leaves (top-down)
  For each tree node, it calls its methods (provided by user via inheritance) according to ``tree.data``.
  Unlike ``Transformer`` and ``Visitor``, the Interpreter doesn't automatically visit its sub-branches.
  The user has to explicitly call ``visit``, ``visit_children``, or use the ``@visit_children_decor``.
  This allows the user to implement branching and loops.
  */
  visit(tree) {
    // Python resolved missing methods via __getattr__ -> __default__; JS has
    // no such hook, so fall back explicitly (the unguarded lookup crashed on
    // unhandled node types).
    let f = this[tree.data] || this.__default__;
    let wrapper = f["visit_wrapper"] || null;
    if (wrapper != null) {
      return f.visit_wrapper(f, tree.data, tree.children, tree.meta);
    } else {
      // Bind `this` so user-defined visit methods can use the interpreter.
      return f.call(this, tree);
    }
  }
  visit_children(tree) {
    return tree.children.map((child) =>
      child instanceof Tree ? this.visit(child) : child
    );
  }
  // Python __getattr__ residue — JS never calls this implicitly; the fallback
  // above replicates its intent.
  __getattr__(name) {
    return this.__default__;
  }
  __default__(tree) {
    return this.visit_children(tree);
  }
}
// NOTE(review): this class shadows the global JS `Symbol`.
class Symbol extends Serialize {
  // Abstract: subclasses answer whether they are a terminal.
  get is_term() {
    return NotImplemented;
  }
  constructor(name) {
    super();
    this.name = name;
  }
  // __eq__/__ne__/__hash__/__repr__ below are Python protocol residue — JS
  // never invokes them implicitly, and `hash`, `type` and `property` are not
  // defined in this file.
  __eq__(other) {
    console.assert(other instanceof Symbol, other);
    return this.is_term === other.is_term && this.name === other.name;
  }
  __ne__(other) {
    return !(this === other);
  }
  __hash__() {
    return hash(this.name);
  }
  __repr__() {
    return "%s(%r)".format(type(this).__name__, this.name);
  }
  get fullrepr() {
    // NOTE(review): `property(__repr__)` is untranslated Python — would throw.
    return property(__repr__);
  }
}
class Terminal extends Symbol {
  get is_term() {
    return true;
  }
  constructor({ name, filter_out = false } = {}) {
    // NOTE(review): super() is called without arguments, so Symbol first sets
    // this.name = undefined; it is then overwritten below.
    super();
    this.name = name;
    this.filter_out = filter_out; // when true, the token is dropped from the tree
  }
  get fullrepr() {
    // Python repr residue — `type` is not defined in JS; would throw if called.
    return "%s(%r, %r)".format(type(this).__name__, this.name, this.filter_out);
  }
}
/** A grammar-rule reference (as opposed to a Terminal). */
class NonTerminal extends Symbol {
  get is_term() {
    return false;
  }
}
/** Per-rule flags collected from the grammar (tree-shaping options etc.). */
class RuleOptions extends Serialize {
  constructor({
    keep_all_tokens = false,
    expand1 = false,
    priority = null,
    template_source = null,
    empty_indices = [],
  } = {}) {
    super();
    Object.assign(this, {
      keep_all_tokens,
      expand1,
      priority,
      template_source,
      empty_indices,
    });
  }
  // Python repr residue; relies on the String.prototype.format shim.
  __repr__() {
    return "RuleOptions(%r, %r, %r, %r)".format(
      this.keep_all_tokens,
      this.expand1,
      this.priority,
      this.template_source
    );
  }
}
class Rule extends Serialize {
  /*
  origin : a symbol
  expansion : a list of symbols
  order : index of this expansion amongst all rules of the same name
  */
  constructor({
    origin,
    expansion,
    order = 0,
    alias = null,
    options = null,
  } = {}) {
    super();
    this.origin = origin;
    this.expansion = expansion;
    this.alias = alias;
    this.order = order;
    this.options = options || new RuleOptions();
    // NOTE(review): `hash` and `tuple` are untranslated Python — undefined in
    // JS, so constructing a Rule currently throws; verify against the
    // finished port.
    this._hash = hash([this.origin, tuple(this.expansion)]);
  }
  _deserialize() {
    // Recompute the cached hash after deserialization.
    this._hash = hash([this.origin, tuple(this.expansion)]);
  }
  // __str__/__repr__/__hash__/__eq__ are Python protocol residue; JS never
  // calls them implicitly, and they rely on the format/join prototype shims.
  __str__() {
    return "<%s : %s>".format(
      this.origin.name,
      " ".join(this.expansion.map((x) => x.name))
    );
  }
  __repr__() {
    return "Rule(%r, %r, %r, %r)".format(
      this.origin,
      this.expansion,
      this.alias,
      this.options
    );
  }
  __hash__() {
    return this._hash;
  }
  __eq__(other) {
    if (!(other instanceof Rule)) {
      return false;
    }
    // NOTE(review): `===` on expansion compares array identity, unlike
    // Python's element-wise list equality.
    return this.origin === other.origin && this.expansion === other.expansion;
  }
}
/**
 * Abstract pattern (literal string or regexp) attached to a terminal.
 * Fix: the former getter-only `get raw()` accessor made the constructor's
 * `this.raw = raw` assignment throw in strict mode (class bodies are strict),
 * so Pattern could never be constructed; `raw` is now a plain property.
 */
class Pattern extends Serialize {
  // Overridden by subclasses ("str" / "re").
  get type() {
    return null;
  }
  constructor({ value, flags = [], raw = null } = {}) {
    super();
    this.value = value;
    // NOTE(review): `frozenset` is not defined in this chunk — presumably a
    // helper elsewhere in the file; verify.
    this.flags = frozenset(flags);
    this.raw = raw;
  }
  // __repr__/__hash__/__eq__ are Python protocol residue; `repr`, `hash` and
  // `type` are not JS builtins — these would throw if called.
  __repr__() {
    return repr(this.to_regexp());
  } // Pattern Hashing assumes all subclasses have a different priority!
  __hash__() {
    return hash([type(this), this.value, this.flags]);
  }
  __eq__(other) {
    return (
      type(this) === type(other) &&
      this.value === other.value &&
      this.flags === other.flags
    );
  }
  // Abstract methods — subclasses must implement.
  to_regexp() {
    throw new NotImplementedError();
  }
  min_width() {
    throw new NotImplementedError();
  }
  max_width() {
    throw new NotImplementedError();
  }
  _get_flags(value) {
    // Wrap the pattern in inline-flag groups, e.g. (?i:...).
    for (let f of this.flags) {
      value = "(?%s:%s)".format(f, value);
    }
    return value;
  }
}
/** A literal-string pattern. */
class PatternStr extends Pattern {
  get type() {
    return "str";
  }
  to_regexp() {
    // `re.escape` does not exist on the local `re` shim — its escaping helper
    // is named `escapeRegExp`.
    return this._get_flags(re.escapeRegExp(this.value));
  }
  get min_width() {
    return this.value.length;
  }
  get max_width() {
    // Was `return min_width` — a ReferenceError; a literal's width is fixed.
    return this.min_width;
  }
}
/** A regexp pattern. */
class PatternRE extends Pattern {
  get type() {
    return "re";
  }
  to_regexp() {
    return this._get_flags(this.value);
  }
  // Lazily computed [min, max] width of the regexp. The former getter-only
  // `get _width()` accessor made the cache assignment below throw in strict
  // mode; the cache is now a plain (initially undefined) instance property.
  _get_width() {
    if (this._width == null) {
      // NOTE(review): get_regexp_width is expected to be defined elsewhere in
      // the file — verify.
      this._width = get_regexp_width(this.to_regexp());
    }
    return this._width;
  }
  get min_width() {
    return this._get_width()[0];
  }
  get max_width() {
    return this._get_width()[1];
  }
}
// NOTE(review): `str` (lowercase) is not defined in JS — the Python original
// subclasses the built-in str, and this declaration throws a ReferenceError.
// The class is largely untranslated Python residue; verify against the
// finished lark.js port before relying on it.
class Token extends str {
  /*
  A string with meta-information, that is produced by the lexer.
  When parsing text, the resulting chunks of the input that haven't been discarded,
  will end up in the tree as Token instances. The Token class inherits from Python's ``str``,
  so normal string comparisons and operations will work as expected.
  Attributes:
  type: Name of the token (as specified in grammar)
  value: Value of the token (redundant, as ``token.value == token`` will always be true)
  start_pos: The index of the token in the text
  line: The line of the token in the text (starting with 1)
  column: The column of the token in the text (starting with 1)
  end_line: The line where the token ends
  end_column: The next column after the end of the token. For example,
  if the token is a single character with a column value of 4,
  end_column will be 5.
  end_pos: the index where the token ends (basically ``start_pos + len(token)``)
  */
  constructor({
    type_,
    value,
    start_pos = null,
    line = null,
    column = null,
    end_line = null,
    end_column = null,
    end_pos = null,
  } = {}) {
    super();
    // NOTE(review): Python __new__ residue — `cls` is undefined here and JS
    // classes have no super.__new__.
    let inst = super.__new__(cls, value);
    inst.type = type_;
    inst.start_pos = start_pos;
    inst.value = value;
    inst.line = line;
    inst.column = column;
    inst.end_line = end_line;
    inst.end_column = end_column;
    inst.end_pos = end_pos;
    // Returning an object from a constructor replaces `this` in JS.
    return inst;
  }
  update({ type_ = null, value = null } = {}) {
    // NOTE(review): `new` on a static method call is invalid — likely meant a
    // plain call to Token.new_borrow_pos.
    return new Token.new_borrow_pos(
      type_ != null ? type_ : this.type,
      value != null ? value : this.value,
      this
    );
  }
  // Build a new token of the given type/value, borrowing position info from
  // an existing token.
  static new_borrow_pos(type_, value, borrow_t) {
    // NOTE(review): `cls` is an undeclared (implicit-global) assignment, and
    // the constructor above destructures an options object — these positional
    // arguments would be ignored.
    cls = Token;
    return new cls(
      type_,
      value,
      borrow_t.start_pos,
      borrow_t.line,
      borrow_t.column,
      borrow_t.end_line,
      borrow_t.end_column,
      borrow_t.end_pos
    );
  }
  // __reduce__/__repr__/__deepcopy__/__eq__/__hash__ are Python protocol
  // residue; JS never calls them implicitly.
  __reduce__() {
    return [
      this.__class__,
      [this.type, this.value, this.start_pos, this.line, this.column],
    ];
  }
  __repr__() {
    return "Token(%r, %r)".format(this.type, this.value);
  }
  __deepcopy__(memo) {
    return new Token(
      this.type,
      this.value,
      this.start_pos,
      this.line,
      this.column
    );
  }
  __eq__(other) {
    if (other instanceof Token && this.type != other.type) {
      return false;
    }
    return str.__eq__(this, other);
  }
  get __hash__() {
    return str.__hash__;
  }
}
/**
 * Tracks line/column position (1-based) while the lexer consumes text.
 */
class LineCounter {
  constructor(newline_char) {
    this.newline_char = newline_char;
    this.char_pos = 0;
    this.line = 1;
    this.column = 1;
    this.line_start_pos = 0;
  }
  // Python equality-protocol residue; JS never calls this implicitly.
  __eq__(other) {
    if (!(other instanceof LineCounter)) {
      return NotImplemented;
    }
    return (
      this.char_pos === other.char_pos &&
      this.newline_char === other.newline_char
    );
  }
  feed(token, test_newline = true) {
    /*
    Consume a token and calculate the new line & column.
    As an optional optimization, set test_newline=False if token doesn't contain a newline.
    */
    // Positional parameters to match the call sites (`feed(value, flag)`);
    // the old `{ token, test_newline }` destructuring never received its
    // arguments. Python's str.count/str.rindex are replaced with split
    // counting and lastIndexOf.
    if (test_newline) {
      const newlines = token.split(this.newline_char).length - 1;
      if (newlines) {
        this.line += newlines;
        this.line_start_pos =
          this.char_pos + token.lastIndexOf(this.newline_char) + 1;
      }
    }
    this.char_pos += token.length;
    this.column = this.char_pos - this.line_start_pos + 1;
  }
}
/**
 * Token callback that re-scans a token's value and, when a more specific
 * terminal matches (e.g. a keyword inside an identifier), retypes the token.
 * Fix: `[_value, t.type] = res` assigned to the undeclared `_value`, which
 * throws a ReferenceError in strict mode (class bodies are strict).
 */
class UnlessCallback {
  constructor(scanner) {
    this.scanner = scanner;
  }
  __call__(t) {
    let res = this.scanner.match(t.value, 0);
    if (res) {
      // res is [matched_value, terminal_name]; only the type is used.
      t.type = res[1];
    }
    return t;
  }
}
/**
 * Chains two token callbacks: runs callback1, and if `cond` approves its
 * result, re-runs on the original token with callback2 instead.
 */
class CallChain {
  constructor(callback1, callback2, cond) {
    this.callback1 = callback1;
    this.callback2 = callback2;
    this.cond = cond;
  }
  __call__(t) {
    const first = this.callback1(t);
    if (this.cond(first)) {
      return this.callback2(t);
    }
    return first;
  }
}
class Lexer {
  /*
  Lexer interface
  Method Signatures:
  lex(self, text) -> Iterator[Token]
  */
  // Abstract marker: concrete lexers override lex().
  get lex() {
    return NotImplemented;
  }
  make_lexer_state(text) {
    // The old `text instanceof bytes ? "\n" : "\n"` threw (`bytes` is not
    // defined in JS) and both branches produced "\n" anyway — use "\n".
    let line_ctr = new LineCounter("\n");
    return new LexerState({ text, line_ctr });
  }
}
// Lexer that matches the entire terminal set in one pass, without
// parser-state context. Machine-ported from lark's Python lexer;
// Python idioms that do not work in JS are flagged with NOTE(review).
class TraditionalLexer extends Lexer {
  // conf: LexerConf-like object — terminals, re_module, g_regex_flags,
  // callbacks, ignore, skip_validation, use_bytes, terminals_by_name.
  constructor(conf) {
    super();
    // NOTE(review): `new Array.from(...)` is a TypeError — Array.from
    // is not a constructor; presumably meant `Array.from(conf.terminals)`.
    let terminals = new Array.from(conf.terminals);
    console.assert(
      all(terminals.map((t) => t instanceof TerminalDef)),
      terminals
    );
    this.re = conf.re_module;
    if (!conf.skip_validation) {
      // Sanitization
      // Pre-compile every terminal's pattern so bad regexps fail now
      // with LexError rather than at scan time.
      for (let t of terminals) {
        try {
          this.re.compile(t.pattern.to_regexp(), conf.g_regex_flags);
        } catch (e) {
          if (e instanceof this.re.error) {
            throw new LexError(
              "Cannot compile token %s: %s".format(t.name, t.pattern)
            );
          } else {
            throw e;
          }
        }
        // Zero-width terminals would make the scan loop never advance.
        if (t.pattern.min_width === 0) {
          throw new LexError(
            "Lexer does not allow zero-width terminals. (%s: %s)".format(
              t.name,
              t.pattern
            )
          );
        }
      }
      // NOTE(review): Python subset test `set(a) <= set(b)` does not
      // translate — JS `<=` compares string coercions of the Sets, and
      // `-` below yields NaN, not a set difference. Needs explicit
      // isSubset/difference helpers.
      if (!(new Set(conf.ignore) <= new Set(terminals.map((t) => t.name)))) {
        throw new LexError(
          "Ignore terminals are not defined: %s".format(
            new Set(conf.ignore) - new Set(terminals.map((t) => t.name))
          )
        );
      }
    } // Init
    // Terminals whose pattern may match a newline; next_token uses this
    // to keep the line counter accurate.
    this.newline_types = frozenset(
      terminals
        .filter((t) => _regexp_has_newline(t.pattern.to_regexp()))
        .map((t) => t.name)
    );
    this.ignore_types = frozenset(conf.ignore);
    // NOTE(review): Array.prototype.sort takes a comparator function,
    // not a Python-style {key: ...} object — as written this sorts by
    // default string coercion. Intended order: descending priority,
    // then descending width, then name.
    terminals.sort({
      key: (x) => [
        -x.priority,
        -x.pattern.max_width,
        -x.pattern.value.length,
        x.name,
      ],
    });
    this.terminals = terminals;
    this.user_callbacks = conf.callbacks;
    this.g_regex_flags = conf.g_regex_flags;
    this.use_bytes = conf.use_bytes;
    this.terminals_by_name = conf.terminals_by_name;
    // Scanner is built lazily on first access (see `get scanner`).
    this._scanner = null;
  }
  // Build the Scanner and merge user callbacks into the generated
  // callback table (chaining when a callback already exists).
  _build_scanner() {
    // NOTE(review): `terminals` is assigned without declaration — an
    // implicit global (ReferenceError in strict mode / ES modules).
    [terminals, this.callback] = _create_unless(
      this.terminals,
      this.g_regex_flags,
      this.re,
      this.use_bytes
    );
    console.assert(all(this.callback.values()));
    // `.items()` relies on the Object.prototype.items monkey-patch
    // defined at the top of this file.
    for (let [type_, f] of this.user_callbacks.items()) {
      if (type_ in this.callback) {
        // Already a callback there, probably UnlessCallback
        this.callback[type_] = new CallChain(
          this.callback[type_],
          f,
          (t) => t.type === type_
        );
      } else {
        this.callback[type_] = f;
      }
    }
    this._scanner = new Scanner(
      terminals,
      this.g_regex_flags,
      this.re,
      this.use_bytes
    );
  }
  // Lazily-built scanner accessor.
  get scanner() {
    if (this._scanner === null) {
      this._build_scanner();
    }
    return this._scanner;
  }
  // Try to match any terminal at `pos`; delegates to the scanner.
  match(text, pos) {
    return this.scanner.match(text, pos);
  }
  // Token generator. Terminates when next_token signals end of input by
  // throwing EOFError.
  *lex(state, parser_state) {
    try {
      while (true) {
        // NOTE(review): next_token destructures `{lex_state, ...}` but
        // this call passes the key `state` — lex_state arrives
        // undefined.
        yield this.next_token({ state, parser_state });
      }
    } catch (e) {
      if (e instanceof EOFError) {
        // pass
      } else {
        throw e;
      }
    }
  }
  // Scan forward from the current position, skipping ignored terminals,
  // and return the next Token. Throws UnexpectedCharacters on a failed
  // match and EOFError at end of input.
  next_token({ lex_state, parser_state = null } = {}) {
    let allowed, t, t2, res;
    let line_ctr = lex_state.line_ctr;
    while (line_ctr.char_pos < lex_state.text.length) {
      res = this.match(lex_state.text, line_ctr.char_pos);
      if (!res) {
        // NOTE(review): Set `-` Set is NaN in JS; needs a real
        // set-difference helper.
        allowed = this.scanner.allowed_types - this.ignore_types;
        if (!allowed) {
          allowed = new Set(["<END-OF-FILE>"]);
        }
        // NOTE(review): four duplicate `unknown_param` keys — only the
        // last (line_ctr.column) survives; the transpiler lost the real
        // parameter names (presumably seq/lex_pos/line/column).
        throw new UnexpectedCharacters({
          unknown_param: lex_state.text,
          unknown_param: line_ctr.char_pos,
          unknown_param: line_ctr.line,
          unknown_param: line_ctr.column,
          allowed: allowed,
          token_history: lex_state.last_token && [lex_state.last_token],
          state: parser_state,
          terminals_by_name: this.terminals_by_name,
        });
      }
      // NOTE(review): undeclared destructuring — implicit globals.
      [value, type_] = res;
      // NOTE(review): `in` on a Set checks object properties, not
      // membership; should be `.has(type_)` (here and in the three
      // similar tests below).
      if (!(type_ in this.ignore_types)) {
        t = new Token(
          type_,
          value,
          line_ctr.char_pos,
          line_ctr.line,
          line_ctr.column
        );
        line_ctr.feed(value, type_ in this.newline_types);
        t.end_line = line_ctr.line;
        t.end_column = line_ctr.column;
        t.end_pos = line_ctr.char_pos;
        if (t.type in this.callback) {
          t = this.callback[t.type](t);
          if (!(t instanceof Token)) {
            throw new LexError(
              "Callbacks must return a token (returned %r)".format(t)
            );
          }
        }
        lex_state.last_token = t;
        return t;
      } else {
        // Ignored terminal: still run its callback (if any) and advance
        // the line counter, but emit no token.
        if (type_ in this.callback) {
          t2 = new Token(
            type_,
            value,
            line_ctr.char_pos,
            line_ctr.line,
            line_ctr.column
          );
          this.callback[type_](t2);
        }
        line_ctr.feed(value, type_ in this.newline_types);
      }
    } // EOF
    throw new EOFError(this);
  }
}
// Holds the mutable state of a lexing run: the input text, the
// line/column counter, and the last token emitted.
class LexerState {
  /**
   * @param {string} text - the input being lexed
   * @param {object} line_ctr - LineCounter tracking char_pos/line/column
   * @param {object|null} last_token - most recently emitted token
   */
  constructor({ text, line_ctr, last_token = null } = {}) {
    this.text = text;
    this.line_ctr = line_ctr;
    this.last_token = last_token;
  }
  // Structural equality (Python __eq__ port). Returns NotImplemented
  // for non-LexerState operands, mirroring the Python protocol.
  __eq__(other) {
    if (!(other instanceof LexerState)) {
      return NotImplemented;
    }
    return (
      this.text === other.text &&
      this.line_ctr === other.line_ctr &&
      this.last_token === other.last_token
    );
  }
  // Shallow copy: the line counter is cloned so the copy can advance
  // independently; text and last_token are shared.
  // (Fixed: the transpiled `type(this)(...)` called the constructor
  // positionally, but it expects a single options object.)
  __copy__() {
    return new this.constructor({
      text: this.text,
      line_ctr: copy(this.line_ctr),
      last_token: this.last_token,
    });
  }
}
// Lexer that selects a sub-lexer by parser state, so only the terminals
// acceptable in the current context are matched. Machine-ported from
// lark (Python); untranslated idioms are flagged with NOTE(review).
class ContextualLexer extends Lexer {
  // states: parser-state -> list of acceptable terminal names.
  // always_accept: terminal names valid in every state (from postlex).
  constructor({ conf, states, always_accept = [] } = {}) {
    super();
    let key, lexer, lexer_conf, accepts;
    // NOTE(review): `new Array.from` is a TypeError — Array.from is not
    // a constructor.
    let terminals = new Array.from(conf.terminals);
    let terminals_by_name = conf.terminals_by_name;
    let trad_conf = copy(conf);
    trad_conf.terminals = terminals;
    // Cache so states with identical accept-sets share one sub-lexer.
    let lexer_by_tokens = {};
    this.lexers = {};
    for (let [state, accepts] of states.items()) {
      key = frozenset(accepts);
      // NOTE(review): Python try/except-KeyError pattern — a missing JS
      // object key yields undefined instead of throwing, so the catch
      // branch that builds the lexer can never run.
      try {
        lexer = lexer_by_tokens[key];
      } catch (e) {
        if (e instanceof KeyError) {
          accepts = new Set(accepts).union([
            ...new Set(conf.ignore),
            ...new Set(always_accept),
          ]);
          lexer_conf = copy(trad_conf);
          // NOTE(review): `accepts` is a Set here — Sets have no
          // .filter/.map (needs [...accepts] first); and `n in
          // terminals_by_name` is a property check on what LexerConf
          // builds as a Map — should be .has(n)/.get(n).
          lexer_conf.terminals = accepts
            .filter((n) => n in terminals_by_name)
            .map((n) => terminals_by_name[n]);
          lexer = new TraditionalLexer(lexer_conf);
          lexer_by_tokens[key] = lexer;
        } else {
          throw e;
        }
      }
      this.lexers[state] = lexer;
    }
    console.assert(trad_conf.terminals === terminals);
    // Fallback lexer over the full terminal set; used for state
    // creation and for producing nicer error messages in lex().
    this.root_lexer = new TraditionalLexer(trad_conf);
  }
  make_lexer_state(text) {
    return this.root_lexer.make_lexer_state(text);
  }
  // Token generator: delegates to the sub-lexer for the current parser
  // position; on UnexpectedCharacters, retries with the root lexer to
  // distinguish "unknown terminal" from "terminal not valid here".
  *lex(lexer_state, parser_state) {
    let lexer, // Save last_token. Calling root_lexer.next_token will change this to the wrong token
      token,
      last_token;
    try {
      while (true) {
        lexer = this.lexers[parser_state.position];
        // NOTE(review): next_token expects the key `lex_state`; the
        // key `lexer_state` passed here will not be picked up.
        yield lexer.next_token({ lexer_state, parser_state });
      }
    } catch (e) {
      if (e instanceof EOFError) {
        // pass
      } else if (e instanceof UnexpectedCharacters) {
        // In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
        // This tests the input against the global context, to provide a nicer error.
        try {
          last_token = lexer_state.last_token; // Save last_token. Calling root_lexer.next_token will change this to the wrong token
          token = this.root_lexer.next_token({ lexer_state, parser_state });
          throw new UnexpectedToken({
            token,
            unknown_param: e.allowed,
            state: parser_state,
            token_history: [last_token],
            terminals_by_name: this.root_lexer.terminals_by_name,
          });
        } catch (e) {
          // NOTE(review): both branches rethrow the same `e`; the
          // Python source re-raised the ORIGINAL UnexpectedCharacters
          // here (see trailing comment below), but the catch parameter
          // shadows it, losing that distinction.
          if (e instanceof UnexpectedCharacters) {
            throw e;
          } else {
            throw e;
          }
        }
      } else {
        throw e;
      }
    }
  }
} // Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set.
class LexerThread {
  /*
    A thread that ties a lexer instance and a lexer state, to be used by the parser
  */
  constructor(lexer, text) {
    this.lexer = lexer;
    // Fresh lexer state for this input text.
    this.state = lexer.make_lexer_state(text);
  }
  // Delegate token generation to the lexer, feeding it our state.
  lex(parser_state) {
    return this.lexer.lex(this.state, parser_state);
  }
  // Copy that shares the lexer but clones the state, so the copy can be
  // advanced without disturbing the original.
  // (Fixed: `object.__new__(LexerThread)` is a Python-ism;
  // Object.create builds an instance without running the constructor.)
  __copy__() {
    let copied = Object.create(LexerThread.prototype);
    copied.lexer = this.lexer;
    copied.state = copy(this.state);
    return copied;
  }
}
// Configuration bundle for building a lexer.
class LexerConf extends Serialize {
  /**
   * @param {Array} terminals - TerminalDef list
   * @param {object} re_module - regex facade (compile/escapeRegExp/error)
   * @param {string[]} ignore - terminal names to drop from output
   * @param {object|null} postlex - post-lexing processor
   * @param {object|null} callbacks - per-terminal token callbacks
   * @param {number} g_regex_flags - flags applied to every pattern
   * @param {boolean} skip_validation - skip pattern sanity checks
   * @param {boolean} use_bytes - lex bytes instead of text
   */
  constructor({
    terminals,
    re_module,
    ignore = [],
    postlex = null,
    callbacks = null,
    g_regex_flags = 0,
    skip_validation = false,
    use_bytes = false,
  } = {}) {
    super();
    this.terminals = terminals;
    // Name -> TerminalDef lookup.
    this.terminals_by_name = new Map(this.terminals.map((t) => [t.name, t]));
    // Duplicate names would silently collapse in the Map.
    // (Fixed: a Map has `.size`, not `.length` — the old comparison was
    // number === undefined, i.e. always false.)
    console.assert(this.terminals.length === this.terminals_by_name.size);
    this.ignore = ignore;
    this.postlex = postlex;
    this.callbacks = callbacks || {};
    this.g_regex_flags = g_regex_flags;
    this.re_module = re_module;
    this.skip_validation = skip_validation;
    this.use_bytes = use_bytes;
    // Filled in later by the frontend factory.
    this.lexer_type = null;
  }
  // Rebuild the derived name lookup after deserialization.
  _deserialize() {
    this.terminals_by_name = new Map(this.terminals.map((t) => [t.name, t]));
  }
}
// Configuration bundle for building a parser.
class ParserConf extends Serialize {
  /**
   * @param {Array} rules - grammar rules
   * @param {object} callbacks - rule -> tree-building callback
   * @param {string[]} start - possible start-rule names
   */
  constructor(rules, callbacks, start) {
    super();
    // (Fixed: `start instanceof list` referenced the Python built-in
    // `list`, which does not exist in JS.)
    console.assert(Array.isArray(start));
    this.rules = rules;
    this.callbacks = callbacks;
    this.start = start;
    // Filled in later by the frontend factory.
    this.parser_type = null;
  }
}
// Node-builder wrapper: a rule that produced exactly one child is
// collapsed to that child instead of creating a new tree node.
class ExpandSingleChild {
  constructor(node_builder) {
    this.node_builder = node_builder;
  }
  __call__(children) {
    // Single child passes through untouched; anything else is built.
    return children.length === 1 ? children[0] : this.node_builder(children);
  }
}
// Node-builder wrapper that stamps the resulting Tree's meta with
// position info drawn from the first and last meaningful children.
class PropagatePositions {
  constructor({ node_builder, node_filter = null } = {}) {
    this.node_builder = node_builder;
    this.node_filter = node_filter;
  }
  __call__(children) {
    const res = this.node_builder(children);
    if (res instanceof Tree) {
      const meta = res.meta;
      // Start position comes from the first child carrying meta.
      const head = this._pp_get_meta(children);
      if (head != null) {
        meta.line = head.line;
        meta.column = head.column;
        meta.start_pos = head.start_pos;
        meta.empty = false;
      }
      // End position comes from the last such child (scan a reversed
      // copy so the original list is untouched).
      const tail = this._pp_get_meta(children.slice().reverse());
      if (tail != null) {
        meta.end_line = tail.end_line;
        meta.end_column = tail.end_column;
        meta.end_pos = tail.end_pos;
        meta.empty = false;
      }
    }
    return res;
  }
  // First child (in the given order) that can supply position meta:
  // a Tree with non-empty meta, or a Token itself. Returns undefined
  // when none qualifies.
  _pp_get_meta(children) {
    for (const child of children) {
      if (this.node_filter != null && !this.node_filter(child)) {
        continue;
      }
      if (child instanceof Tree) {
        if (!child.meta.empty) {
          return child.meta;
        }
      } else if (child instanceof Token) {
        return child;
      }
    }
  }
}
// Translate the `propagate_positions` option into a node-builder
// wrapper factory (or null when disabled):
// - callable: used as the node_filter of a PropagatePositions
// - true:     plain PropagatePositions
// - false:    no wrapper
function make_propagate_positions(option) {
  if (callable(option)) {
    // (Fixed: the transpiled call passed a single object literal
    // `{ PropagatePositions, node_filter: option }` to partial(),
    // losing the target class. partial must receive the class plus the
    // argument to bind.)
    return partial(PropagatePositions, { node_filter: option });
  } else if (option === true) {
    return PropagatePositions;
  } else if (option === false) {
    return null;
  }
  throw new ConfigurationError(
    "Invalid option for propagate_positions: %r".format(option)
  );
}
// Rebuilds a rule's child list: selected children are either kept or
// expanded in place (their own children spliced in), with placeholder
// `null`s inserted for empty optionals, then the node builder runs.
class ChildFilter {
  /**
   * @param {Array<[number, boolean, number]>} to_include -
   *   [child index, expand-in-place?, nulls to insert before it]
   * @param {number} append_none - trailing nulls to append
   * @param {Function} node_builder - receives the filtered child list
   */
  constructor(to_include, append_none, node_builder) {
    this.node_builder = node_builder;
    this.to_include = to_include;
    this.append_none = append_none;
  }
  __call__(children) {
    let filtered = [];
    for (let [i, to_expand, add_none] of this.to_include) {
      if (add_none) {
        // (Fixed: `filtered += [null] * n` is Python list arithmetic;
        // in JS it coerced the array to a string.)
        filtered.push(...new Array(add_none).fill(null));
      }
      if (to_expand) {
        // (Fixed: `+=` array concatenation — same string-coercion bug.)
        filtered.push(...children[i].children);
      } else {
        filtered.push(children[i]);
      }
    }
    if (this.append_none) {
      filtered.push(...new Array(this.append_none).fill(null));
    }
    return this.node_builder(filtered);
  }
}
class ChildFilterLALR extends ChildFilter {
  /*
    Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)
  */
  __call__(children) {
    let filtered = [];
    for (let [i, to_expand, add_none] of this.to_include) {
      if (add_none) {
        // (Fixed: Python `[null] * n` list arithmetic coerced the
        // array to a string in JS.)
        filtered.push(...new Array(add_none).fill(null));
      }
      if (to_expand) {
        // (Fixed: `if (filtered)` — an array is always truthy in JS;
        // Python tested for a non-empty list.)
        if (filtered.length) {
          filtered.push(...children[i].children);
        } else {
          // Optimize for left-recursion: reuse the child's own list
          // when nothing has been collected yet (safe per the class
          // docstring's no-duplication assumption).
          filtered = children[i].children;
        }
      } else {
        filtered.push(children[i]);
      }
    }
    if (this.append_none) {
      filtered.push(...new Array(this.append_none).fill(null));
    }
    return this.node_builder(filtered);
  }
}
class ChildFilterLALR_NoPlaceholders extends ChildFilter {
  /*
    Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)
  */
  // No placeholder handling: to_include entries are [index, expand?].
  constructor(to_include, node_builder) {
    super();
    this.node_builder = node_builder;
    this.to_include = to_include;
  }
  __call__(children) {
    let filtered = [];
    for (let [i, to_expand] of this.to_include) {
      if (to_expand) {
        // (Fixed: `if (filtered)` is always true for an array, and
        // `filtered += ...` coerced the array to a string.)
        if (filtered.length) {
          filtered.push(...children[i].children);
        } else {
          // Optimize for left-recursion
          filtered = children[i].children;
        }
      } else {
        filtered.push(children[i]);
      }
    }
    return this.node_builder(filtered);
  }
}
// A symbol is expanded inline when it is a rule (not a terminal) whose
// name starts with "_" (lark's anonymous/inline-rule convention).
// (Fixed: Python's `startswith` -> JS `startsWith`.)
function _should_expand(sym) {
  return !sym.is_term && sym.name.startsWith("_");
}
// Decide whether a rule needs a ChildFilter wrapper, and build the
// appropriate partial if so. Returns undefined when the children can
// be passed through unchanged.
// _empty_indices: boolean list marking positions of empty optionals
// (maybe_placeholders mode); used to compute how many nulls to
// re-insert at each surviving child slot.
function maybe_create_child_filter(
  expansion,
  keep_all_tokens,
  ambiguous,
  _empty_indices
) {
  let s, empty_indices; // Prepare empty_indices as: How many Nones to insert at each index?
  if (_empty_indices) {
    // (Fixed: Python-isms — list.count(false), str(int(b)), "".join()
    // and `[0] * n` were left untranslated.)
    console.assert(
      _empty_indices.filter((b) => !b).length === expansion.length
    );
    s = _empty_indices.map((b) => (b ? "1" : "0")).join("");
    // Each run of "1"s between "0"s = consecutive empties at that slot.
    empty_indices = s.split("0").map((ones) => ones.length);
    console.assert(empty_indices.length === expansion.length + 1, [
      empty_indices,
      expansion.length,
    ]);
  } else {
    empty_indices = new Array(expansion.length + 1).fill(0);
  }
  let to_include = [];
  let nones_to_add = 0;
  for (let [i, sym] of enumerate(expansion)) {
    nones_to_add += empty_indices[i];
    // Filtered-out terminals are dropped unless keep_all_tokens.
    if (keep_all_tokens || !(sym.is_term && sym.filter_out)) {
      to_include.push([i, _should_expand(sym), nones_to_add]);
      nones_to_add = 0;
    }
  }
  nones_to_add += empty_indices[expansion.length];
  // A filter is only needed when something is dropped, expanded, or
  // placeholders must be inserted.
  if (
    _empty_indices ||
    to_include.length < expansion.length ||
    any(to_include.map(([i, to_expand, _]) => to_expand))
  ) {
    if (_empty_indices || ambiguous) {
      return partial(
        ambiguous ? ChildFilter : ChildFilterLALR,
        to_include,
        nones_to_add
      );
    } else {
      // LALR without placeholders
      return partial(
        ChildFilterLALR_NoPlaceholders,
        to_include.map(([i, x, _]) => [i, x])
      );
    }
  }
}
class AmbiguousExpander {
  /*
    Deal with the case where we're expanding children ('_rule') into a parent but the children
    are ambiguous. i.e. (parent->_ambig->_expand_this_rule). In this case, make the parent itself
    ambiguous with as many copies as their are ambiguous children, and then copy the ambiguous children
    into the right parents in the right places, essentially shifting the ambiguity up the tree.
  */
  // to_expand: indices of children eligible for inline expansion.
  constructor(to_expand, tree_class, node_builder) {
    this.node_builder = node_builder;
    this.tree_class = tree_class;
    this.to_expand = to_expand;
  }
  __call__(children) {
    let to_expand;
    // A tree node is an ambiguity node when its data tag is "_ambig".
    function _is_ambig_tree(t) {
      return hasattr(t, "data") && t.data === "_ambig";
    }
    let // -- When we're repeatedly expanding ambiguities we can end up with nested ambiguities.
      // All children of an _ambig node should be a derivation of that ambig node, hence
      // it is safe to assume that if we see an _ambig node nested within an ambig node
      // it is safe to simply expand it into the parent _ambig node as an alternative derivation.
      ambiguous = [];
    // NOTE(review): several Python-isms survive below: `i in
    // this.to_expand` / `i in ambiguous` are property checks on arrays
    // (membership needs .includes(i)), and `if (!ambiguous)` is always
    // false since an array is truthy — Python meant "if empty".
    for (let [i, child] of enumerate(children)) {
      if (_is_ambig_tree(child)) {
        if (i in this.to_expand) {
          ambiguous.push(i);
        }
        to_expand = enumerate(child.children)
          .filter(([j, grandchild]) => _is_ambig_tree(grandchild))
          .map(([j, grandchild]) => j);
        child.expand_kids_by_index(...to_expand);
      }
    }
    if (!ambiguous) {
      return this.node_builder(children);
    }
    let expand = enumerate(children).map(([i, child]) =>
      i in ambiguous ? iter(child.children) : repeat(child)
    );
    // NOTE(review): `new Array.from(...)` throws (Array.from is not a
    // constructor); product/zip/iter/repeat are Python itertools names
    // that must be supplied elsewhere in the port.
    return this.tree_class(
      "_ambig",
      product(zip(...expand)).map((f) =>
        this.node_builder(new Array.from(f[0]))
      )
    );
  }
}
// Wrap with AmbiguousExpander only when the expansion actually contains
// inline-expandable symbols; returns undefined otherwise.
function maybe_create_ambiguous_expander(
  tree_class,
  expansion,
  keep_all_tokens
) {
  // Indices of children that would be expanded into the parent.
  let to_expand = enumerate(expansion)
    .filter(
      ([i, sym]) =>
        keep_all_tokens ||
        (!(sym.is_term && sym.filter_out) && _should_expand(sym))
    )
    .map(([i, sym]) => i);
  // (Fixed: `if (to_expand)` is always true for an array; the Python
  // source tested for a non-empty list.)
  if (to_expand.length) {
    return partial(AmbiguousExpander, to_expand, tree_class);
  }
}
class AmbiguousIntermediateExpander {
  /*
    Propagate ambiguous intermediate nodes and their derivations up to the
    current rule.
    In general, converts
    rule
    _iambig
    _inter
    someChildren1
    ...
    _inter
    someChildren2
    ...
    someChildren3
    ...
    to
    _ambig
    rule
    someChildren1
    ...
    someChildren3
    ...
    rule
    someChildren2
    ...
    someChildren3
    ...
    rule
    childrenFromNestedIambigs
    ...
    someChildren3
    ...
    ...
    propagating up any nested '_iambig' nodes along the way.
  */
  constructor(tree_class, node_builder) {
    this.node_builder = node_builder;
    this.tree_class = tree_class;
  }
  __call__(children) {
    let processed_nodes, iambig_node, collapsed, result, new_tree;
    // Intermediate-ambiguity marker node test.
    function _is_iambig_tree(child) {
      return hasattr(child, "data") && child.data === "_iambig";
    }
    // NOTE(review): _collapse_iambig is a plain nested function, so
    // `this.tree_class` inside it does NOT refer to the instance
    // (undefined in strict mode) — it needs to be an arrow function or
    // be bound. Also `child.children += ...`, `result += collapsed`
    // and `a + b` on arrays below are Python list concatenations that
    // coerce arrays to strings in JS.
    function _collapse_iambig(children) {
      let iambig_node,
        collapsed,
        result,
        new_tree; /*
            Recursively flatten the derivations of the parent of an '_iambig'
            node. Returns a list of '_inter' nodes guaranteed not
            to contain any nested '_iambig' nodes, or None if children does
            not contain an '_iambig' node.
            */ // Due to the structure of the SPPF,
      // an '_iambig' node can only appear as the first child
      if (children && _is_iambig_tree(children[0])) {
        iambig_node = children[0];
        result = [];
        for (let grandchild of iambig_node.children) {
          // Recurse first so nested _iambig nodes are flattened before
          // the trailing siblings are appended.
          collapsed = _collapse_iambig(grandchild.children);
          if (collapsed) {
            for (let child of collapsed) {
              child.children += children.slice(1);
            }
            result += collapsed;
          } else {
            new_tree = this.tree_class(
              "_inter",
              grandchild.children + children.slice(1)
            );
            result.push(new_tree);
          }
        }
        return result;
      }
    }
    collapsed = _collapse_iambig(children);
    if (collapsed) {
      // One parent node per flattened derivation, wrapped in _ambig.
      processed_nodes = collapsed.map((c) => this.node_builder(c.children));
      return this.tree_class("_ambig", processed_nodes);
    }
    // No intermediate ambiguity: build normally.
    return this.node_builder(children);
  }
}
// Adapt a Transformer method (whose name is a rule name) into a rule
// callback: wrap the children in a Tree named after the rule and pass
// that tree to the method.
function inplace_transformer(func) {
  function f(children) {
    // function name in a Transformer is a rule name.
    // (Fixed: `func.__name__` is Python; JS exposes `func.name`. Also
    // removed a dead outer `let tree` the transpiler left behind.)
    let tree = new Tree(func.name, children);
    return func(tree);
  }
  f = wraps(func)(f);
  return f;
}
// Bind a transformer method together with its visit_wrapper, producing
// a plain rule callback. Meta-style wrappers are not supported by the
// internal transformer path.
function apply_visit_wrapper(func, name, wrapper) {
  if (wrapper === _vargs_meta || wrapper === _vargs_meta_inline) {
    throw new NotImplementedError(
      "Meta args not supported for internal transformer"
    );
  }
  let f = function (children) {
    // The wrapper receives (func, rule name, children, meta=null).
    return wrapper(func, name, children, null);
  };
  f = wraps(func)(f);
  return f;
}
// Builds the callback for every rule: a chain of wrappers (child
// filtering, position propagation, ambiguity handling) around the
// basic tree-construction callback.
class ParseTreeBuilder {
  constructor({
    rules,
    tree_class,
    propagate_positions = false,
    ambiguous = false,
    maybe_placeholders = false,
  } = {}) {
    this.tree_class = tree_class;
    this.propagate_positions = propagate_positions;
    this.ambiguous = ambiguous;
    this.maybe_placeholders = maybe_placeholders;
    // (Fixed: `new Array.from` throws — Array.from is not a
    // constructor.)
    this.rule_builders = Array.from(this._init_builders(rules));
  }
  // Yields [rule, wrapper_chain] pairs. Chain entries are wrapper
  // factories; falsy entries are stripped by `filter(null, ...)`.
  *_init_builders(rules) {
    let expand_single_child, wrapper_chain, options, keep_all_tokens;
    let propagate_positions = make_propagate_positions(
      this.propagate_positions
    );
    for (let rule of rules) {
      options = rule.options;
      keep_all_tokens = options.keep_all_tokens;
      expand_single_child = options.expand1;
      wrapper_chain = Array.from(
        filter(null, [
          expand_single_child && !rule.alias && ExpandSingleChild,
          maybe_create_child_filter(
            rule.expansion,
            keep_all_tokens,
            this.ambiguous,
            this.maybe_placeholders ? options.empty_indices : null
          ),
          propagate_positions,
          this.ambiguous &&
            maybe_create_ambiguous_expander(
              this.tree_class,
              rule.expansion,
              keep_all_tokens
            ),
          this.ambiguous &&
            partial(AmbiguousIntermediateExpander, this.tree_class),
        ])
      );
      // (Fixed: `yield rule, wrapper_chain` used the comma operator and
      // yielded only wrapper_chain; create_callback destructures a
      // [rule, chain] pair.)
      yield [rule, wrapper_chain];
    }
  }
  // Assemble the final per-rule callbacks, optionally routing through a
  // transformer's methods instead of plain tree construction.
  create_callback({ transformer = null } = {}) {
    let f, wrapper, user_callback_name;
    let callbacks = {};
    for (let [rule, wrapper_chain] of this.rule_builders) {
      user_callback_name =
        rule.alias || rule.options.template_source || rule.origin.name;
      if (transformer && transformer[user_callback_name]) {
        f = transformer[user_callback_name];
        wrapper = f["visit_wrapper"] || null;
        if (wrapper != null) {
          f = apply_visit_wrapper(f, user_callback_name, wrapper);
        } else if (transformer instanceof Transformer_InPlace) {
          f = inplace_transformer(f);
        }
      } else {
        f = partial(this.tree_class, user_callback_name);
      }
      // Apply wrappers innermost-first.
      for (let w of wrapper_chain) {
        f = w(f);
      }
      // NOTE(review): using a Rule object as a plain-object key relies
      // on its string coercion being unique per rule — TODO confirm; a
      // Map may be needed here.
      if (rule in callbacks) {
        throw new GrammarError("Rule '%s' already exists".format(rule));
      }
      callbacks[rule] = f;
    }
    return callbacks;
  }
}
// LALR(1) parser facade: computes the parse table via LALR_Analyzer
// and drives _Parser, with optional error recovery through on_error.
class LALR_Parser extends Serialize {
  constructor({ parser_conf, debug = false } = {}) {
    super();
    let analysis = new LALR_Analyzer({ parser_conf, debug: debug });
    analysis.compute_lalr();
    let callbacks = parser_conf.callbacks;
    this._parse_table = analysis.parse_table;
    this.parser_conf = parser_conf;
    // (Fixed: _Parser is a class and must be instantiated with `new`.)
    this.parser = new _Parser({
      parse_table: analysis.parse_table,
      callbacks,
      debug,
    });
  }
  // Rebuild a parser from serialized data without re-running analysis.
  static deserialize({ data, memo, callbacks, debug = false } = {}) {
    // (Fixed: Python's `cls.__new__(cls)` -> Object.create, and
    // IntParseTable.deserialize is a static factory — no `new`.)
    let inst = Object.create(LALR_Parser.prototype);
    inst._parse_table = IntParseTable.deserialize(data, memo);
    inst.parser = new _Parser({
      parse_table: inst._parse_table,
      callbacks,
      debug,
    });
    return inst;
  }
  serialize(memo) {
    return this._parse_table.serialize(memo);
  }
  parse_interactive(lexer, start) {
    return this.parser.parse({ lexer, start, start_interactive: true });
  }
  // Parse with optional recovery: on_error is called with each
  // UnexpectedInput; a truthy return resumes parsing.
  parse({ lexer, start, on_error = null } = {}) {
    let p, s;
    try {
      return this.parser.parse({ lexer, start });
    } catch (e) {
      if (e instanceof UnexpectedInput) {
        if (on_error === null) {
          // (Fixed: was `throw None` — a bare Python re-raise.)
          throw e;
        }
        while (true) {
          if (e instanceof UnexpectedCharacters) {
            // Remember the position so we can force progress below.
            s = e.interactive_parser.lexer_state.state;
            p = s.line_ctr.char_pos;
          }
          if (!on_error(e)) {
            throw e;
          }
          if (e instanceof UnexpectedCharacters) {
            // If user didn't change the character position, then we should
            // skip one character to guarantee forward progress.
            if (p === s.line_ctr.char_pos) {
              s.line_ctr.feed(s.text.slice(p, p + 1));
            }
          }
          try {
            return e.interactive_parser.resume_parse();
          } catch (e2) {
            // (Fixed: the inner catch previously shadowed `e`, so the
            // infinite-loop guard compared the new error with itself.)
            if (e2 instanceof UnexpectedToken) {
              if (
                e instanceof UnexpectedToken &&
                e.token.type === e2.token.type &&
                e2.token.type === "$END" &&
                e.interactive_parser === e2.interactive_parser
              ) {
                // Prevent infinite loop
                throw e2;
              }
              e = e2;
            } else if (e2 instanceof UnexpectedCharacters) {
              e = e2;
            } else {
              throw e2;
            }
          }
        }
      } else {
        throw e;
      }
    }
  }
}
// Immutable per-parse configuration: resolves the chosen start symbol
// into the table's entry/end states and carries the callback table.
class ParseConf {
  constructor(parse_table, callbacks, start) {
    const table = parse_table;
    this.parse_table = table;
    // Cache the entry and exit states for this start symbol.
    this.start_state = table.start_states[start];
    this.end_state = table.end_states[start];
    this.states = table.states;
    this.callbacks = callbacks;
    this.start = start;
  }
}
// Mutable LALR parser state: the state stack, value stack and lexer,
// plus the shared ParseConf. feed_token advances it by one token.
class ParserState {
  constructor({
    parse_conf,
    lexer,
    state_stack = null,
    value_stack = null,
  } = {}) {
    this.parse_conf = parse_conf;
    this.lexer = lexer;
    this.state_stack = state_stack || [this.parse_conf.start_state];
    this.value_stack = value_stack || [];
  }
  // Current (topmost) automaton state.
  // (Fixed: `stack[-1]` is Python negative indexing — in JS it reads a
  // property named "-1" and yields undefined.)
  get position() {
    return this.state_stack[this.state_stack.length - 1];
  } // Necessary for match_examples() to work
  __eq__(other) {
    if (!(other instanceof ParserState)) {
      return NotImplemented;
    }
    return (
      this.state_stack.length === other.state_stack.length &&
      this.position === other.position
    );
  }
  __copy__() {
    return type(this)(
      this.parse_conf,
      this.lexer, // XXX copy
      copy(this.state_stack),
      deepcopy(this.value_stack)
    );
  }
  copy() {
    return copy(this);
  }
  // Reduce as many times as the table dictates, then shift `token`.
  // When is_end is set and the end state is reached, returns the final
  // parse value.
  feed_token({ token, is_end = false } = {}) {
    let s,
      state,
      size,
      expected, // reduce+shift as many times as necessary
      rule,
      value,
      action,
      arg,
      _action,
      new_state;
    let state_stack = this.state_stack;
    let value_stack = this.value_stack;
    let states = this.parse_conf.states;
    let end_state = this.parse_conf.end_state;
    let callbacks = this.parse_conf.callbacks;
    while (true) {
      // (Fixed: Python negative indexing, here and below.)
      state = state_stack[state_stack.length - 1];
      try {
        // NOTE(review): mixes bracket indexing with the Map-style
        // .keys() used below — the states container type needs to be
        // pinned down; TODO confirm.
        [action, arg] = states[state][token.type];
      } catch (e) {
        if (e instanceof KeyError) {
          // Terminal names are all-uppercase by convention; collect
          // them for the error report.
          // (Fixed: Python `.isupper()` and calling .filter on a Map
          // keys iterator.)
          expected = new Set(
            [...states[state].keys()].filter((s) => s === s.toUpperCase())
          );
          throw new UnexpectedToken({
            token,
            expected,
            state: this,
            interactive_parser: null,
          });
        } else {
          throw e;
        }
      }
      console.assert(arg != end_state);
      if (action === Shift) {
        // shift once and return
        console.assert(!is_end);
        state_stack.push(arg);
        value_stack.push(
          !(token.type in callbacks) ? token : callbacks[token.type](token)
        );
        return;
      } else {
        // reduce+shift as many times as necessary
        rule = arg;
        size = rule.expansion.length;
        if (size) {
          s = value_stack.slice(-size);
          // (Fixed: the transpile lost Python's `del stack[-size:]` —
          // a reduce must pop |rhs| entries off both stacks.)
          state_stack.length -= size;
          value_stack.length -= size;
        } else {
          s = [];
        }
        value = callbacks[rule](s);
        [_action, new_state] =
          states[state_stack[state_stack.length - 1]][rule.origin.name];
        console.assert(_action === Shift);
        state_stack.push(new_state);
        value_stack.push(value);
        if (is_end && state_stack[state_stack.length - 1] === end_state) {
          return value_stack[value_stack.length - 1];
        }
      }
    }
  }
}
// Low-level LALR driver: owns the parse table and callbacks, and runs
// the main loop over the token stream.
class _Parser {
  constructor({ parse_table, callbacks, debug = false } = {}) {
    this.parse_table = parse_table;
    this.callbacks = callbacks;
    this.debug = debug;
  }
  // Build a fresh ParserState for `start`; either run it to completion
  // or hand it back wrapped in an InteractiveParser.
  parse({
    lexer,
    start,
    value_stack = null,
    state_stack = null,
    start_interactive = false,
  } = {}) {
    let parse_conf = new ParseConf(this.parse_table, this.callbacks, start);
    let parser_state = new ParserState({
      parse_conf,
      lexer,
      state_stack,
      value_stack,
    });
    if (start_interactive) {
      return new InteractiveParser(this, parser_state, parser_state.lexer);
    }
    return this.parse_from_state(parser_state);
  }
  // Main LALR-parser loop: feed every token, then a synthetic $END.
  parse_from_state(state) {
    let token, end_token;
    try {
      token = null;
      // (Fixed: `for (let token of ...)` shadowed the outer variable,
      // so end_token below never saw the last token; also feed_token
      // takes a destructured options object, not a bare token.)
      for (token of state.lexer.lex(state)) {
        state.feed_token({ token });
      }
      // (Fixed: new_borrow_pos is a static factory — no `new`.)
      end_token = token
        ? Token.new_borrow_pos("$END", "", token)
        : new Token("$END", "", 0, 1, 1);
      return state.feed_token({ token: end_token, is_end: true });
    } catch (e) {
      if (e instanceof UnexpectedInput) {
        try {
          // Attach an interactive parser so callers can resume.
          e.interactive_parser = new InteractiveParser(
            this,
            state,
            state.lexer
          );
        } catch (e) {
          if (e instanceof ReferenceError) {
            // pass
          } else {
            throw e;
          }
        }
        throw e;
      } else if (e instanceof Error) {
        if (this.debug) {
          console.log("");
          console.log("STATE STACK DUMP");
          console.log("----------------");
          for (let [i, s] of enumerate(state.state_stack)) {
            console.log("%d)".format(i), s);
          }
          console.log("");
        }
        // (Fixed: was `throw None` — a bare Python re-raise.)
        throw e;
      } else {
        throw e;
      }
    }
  }
}
// Named marker used in parse tables to distinguish Shift from Reduce.
class Action {
  constructor(name) {
    this.name = name;
  }
  // Python __str__ hook: an Action displays as its name.
  __str__() {
    return this.name;
  }
  // Python __repr__ hook: delegates to the global str() helper.
  __repr__() {
    return str(this);
  }
}
// Singleton action markers referenced throughout the parse tables.
// (Fixed: declared with `var` — the originals were implicit globals,
// which throw in strict mode / ES modules.)
var Shift = new Action("Shift");
var Reduce = new Action("Reduce");
// Serializable LALR parse table: `states` maps state -> {token:
// [action, arg]}, plus per-start-symbol entry and end states.
class ParseTable {
  constructor(states, start_states, end_states) {
    this.states = states;
    this.start_states = start_states;
    this.end_states = end_states;
  }
  // Compact the table for serialization: tokens are replaced by
  // enumerated ids and Reduce args (rules) serialized recursively.
  // `.items()` relies on the Object.prototype.items patch at the top
  // of this file.
  serialize(memo) {
    let tokens = new Enumerator();
    let rules = new Enumerator();
    let states = new Map(
      this.states
        .items()
        .map(([state, actions]) => [
          state,
          new Map(
            actions
              .items()
              .map(([token, [action, arg]]) => [
                tokens.get(token),
                action === Reduce ? [1, arg.serialize(memo)] : [0, arg],
              ])
          ),
        ])
    );
    return {
      tokens: tokens.reversed(),
      states: states,
      start_states: this.start_states,
      end_states: this.end_states,
    };
  }
  // Inverse of serialize().
  static deserialize(data, memo) {
    // (Fixed: `cls` was an implicit global, and Rule.deserialize is a
    // static factory that must not be called with `new`.)
    const cls = ParseTable;
    let tokens = data["tokens"];
    let states = new Map(
      data["states"]
        .items()
        .map(([state, actions]) => [
          state,
          new Map(
            actions
              .items()
              .map(([token, [action, arg]]) => [
                tokens[token],
                action === 1
                  ? [Reduce, Rule.deserialize(arg, memo)]
                  : [Shift, arg],
              ])
          ),
        ])
    );
    return new cls(states, data["start_states"], data["end_states"]);
  }
}
// Rewrites a ParseTable so states are identified by dense integers
// instead of state objects.
class IntParseTable extends ParseTable {
  static from_ParseTable(parse_table) {
    // (Fixed: implicit-global `cls`, `new Array.from` (TypeError), and
    // bracket-indexing into the state_to_idx Map, which requires
    // .get().)
    const cls = IntParseTable;
    let la;
    let enum_ = Array.from(parse_table.states);
    let state_to_idx = new Map(enumerate(enum_).map(([i, s]) => [s, i]));
    let int_states = {};
    // NOTE(review): `.items()` relies on the Object.prototype patch and
    // will not work if the per-state action tables are Maps — TODO
    // confirm the container type.
    for (let [s, la] of parse_table.states.items()) {
      la = new Map(
        la
          .items()
          .map(([k, v]) => [
            k,
            v[0] === Shift ? [v[0], state_to_idx.get(v[1])] : v,
          ])
      );
      int_states[state_to_idx.get(s)] = la;
    }
    let start_states = new Map(
      parse_table.start_states
        .items()
        .map(([start, s]) => [start, state_to_idx.get(s)])
    );
    let end_states = new Map(
      parse_table.end_states
        .items()
        .map(([start, s]) => [start, state_to_idx.get(s)])
    );
    return new cls(int_states, start_states, end_states);
  }
}
// Adapt a user-supplied lexer class to the internal (lexer_state,
// parser_state) interface, unless it already opts into the future
// interface.
function _wrap_lexer(lexer_class) {
  let future_interface = lexer_class["__future_interface__"] || false;
  if (future_interface) {
    return lexer_class;
  } else {
    // Old-style custom lexers receive only the raw text.
    class CustomLexerWrapper extends Lexer {
      constructor(lexer_conf) {
        super();
        // (Fixed: a JS class must be instantiated with `new`; calling
        // it as a function throws a TypeError.)
        this.lexer = new lexer_class(lexer_conf);
      }
      lex(lexer_state, parser_state) {
        return this.lexer.lex(lexer_state.text);
      }
    }
    return CustomLexerWrapper;
  }
}
// Factory: records the parser/lexer kind, then builds a
// ParsingFrontend when called with the configuration objects.
class MakeParsingFrontend {
  constructor(parser_type, lexer_type) {
    this.parser_type = parser_type;
    this.lexer_type = lexer_type;
  }
  __call__(lexer_conf, parser_conf, options) {
    console.assert(lexer_conf instanceof LexerConf);
    console.assert(parser_conf instanceof ParserConf);
    parser_conf.parser_type = this.parser_type;
    lexer_conf.lexer_type = this.lexer_type;
    return new ParsingFrontend({ lexer_conf, parser_conf, options });
  }
  // Rebuild a frontend from serialized data.
  static deserialize(data, memo, lexer_conf, callbacks, options) {
    // (Fixed: static deserialize factories must not be called with
    // `new`, and LALR_Parser.deserialize takes a destructured options
    // object rather than positional arguments.)
    let parser_conf = ParserConf.deserialize(data["parser_conf"], memo);
    let parser = LALR_Parser.deserialize({
      data: data["parser"],
      memo,
      callbacks,
      debug: options.debug,
    });
    parser_conf.callbacks = callbacks;
    return new ParsingFrontend({
      lexer_conf,
      parser_conf,
      options,
      parser: parser,
    });
  }
}
// Ties a parser and a lexer together behind one parse()/
// parse_interactive() interface. Machine-ported; broken idioms are
// flagged with NOTE(review).
class ParsingFrontend extends Serialize {
  constructor({ lexer_conf, parser_conf, options, parser = null } = {}) {
    super();
    let create_parser, create_lexer;
    this.parser_conf = parser_conf;
    this.lexer_conf = lexer_conf;
    this.options = options; // Set-up parser
    if (parser) {
      // From cache
      this.parser = parser;
    } else {
      create_parser = {
        lalr: create_lalr_parser,
        earley: create_earley_parser,
        cyk: CYK_FrontEnd,
      }[parser_conf.parser_type];
      this.parser = create_parser({ lexer_conf, parser_conf, options });
    }
    let // Set-up lexer
      lexer_type = lexer_conf.lexer_type;
    this.skip_lexer = false;
    // NOTE(review): `in` on an array checks indices, not values —
    // should be ["dynamic", "dynamic_complete"].includes(lexer_type).
    if (lexer_type in ["dynamic", "dynamic_complete"]) {
      console.assert(lexer_conf.postlex === null);
      this.skip_lexer = true;
      return;
    }
    // NOTE(review): `exc` is an implicit global (throws in strict
    // mode). Also a missing object key below yields undefined rather
    // than throwing KeyError, so the custom-lexer fallback in the
    // catch can never run this way.
    exc = null;
    try {
      create_lexer = {
        standard: create_traditional_lexer,
        contextual: create_contextual_lexer,
      }[lexer_type];
    } catch (e) {
      exc = e;
      if (e instanceof KeyError) {
        console.assert(issubclass(lexer_type, Lexer), lexer_type);
        this.lexer = _wrap_lexer(lexer_type)(lexer_conf);
      } else {
        throw e;
      }
    }
    if (!exc) {
      this.lexer = create_lexer(lexer_conf, this.parser, lexer_conf.postlex);
    }
    if (lexer_conf.postlex) {
      this.lexer = new PostLexConnector(this.lexer, lexer_conf.postlex);
    }
  }
  // Resolve which start symbol to use, validating it against the
  // configured start rules.
  _verify_start({ start = null } = {}) {
    let start_decls;
    if (start === null) {
      start_decls = this.parser_conf.start;
      if (start_decls.length > 1) {
        throw new ConfigurationError(
          "Lark initialized with more than 1 possible start rule. Must specify which start rule to parse",
          start_decls
        );
      }
      [start] = start_decls;
    } else if (!(start in this.parser_conf.start)) {
      // NOTE(review): `in` on the start array tests indices; membership
      // needs .includes(start).
      throw new ConfigurationError(
        "Unknown start rule %s. Must be one of %r".format(
          start,
          this.parser_conf.start
        )
      );
    }
    return start;
  }
  // Parse `text` from the chosen start symbol.
  parse({ text, start = null, on_error = null } = {}) {
    let chosen_start = this._verify_start({ start });
    let stream = this.skip_lexer ? text : new LexerThread(this.lexer, text);
    let kw = on_error === null ? {} : { on_error: on_error };
    // NOTE(review): the parser's parse() destructures keys
    // `lexer`/`start`; the keys `stream`/`chosen_start` passed here
    // will not be picked up — TODO confirm against LALR_Parser.parse.
    return this.parser.parse({ stream, chosen_start, ...kw });
  }
  // Interactive (step-by-step) parsing; LALR only.
  parse_interactive({ text = null, start = null } = {}) {
    // NOTE(review): _verify_start destructures an options object, but
    // is called here with the bare start value.
    let chosen_start = this._verify_start(start);
    if (this.parser_conf.parser_type != "lalr") {
      throw new ConfigurationError(
        "parse_interactive() currently only works with parser='lalr' "
      );
    }
    let stream = this.skip_lexer ? text : new LexerThread(this.lexer, text);
    return this.parser.parse_interactive(stream, chosen_start);
  }
}
function get_frontend(parser, lexer) {
  // Validate the (parser, lexer) combination, then build the parsing frontend.
  assert_config(parser, ["lalr", "earley", "cyk"]);
  if (typeof lexer !== "object") {
    // not custom lexer?
    const expected = {
      lalr: ["standard", "contextual"],
      earley: ["standard", "dynamic", "dynamic_complete"],
      cyk: ["standard"],
    }[parser];
    assert_config(
      lexer,
      expected,
      "Parser %r does not support lexer %%r, expected one of %%s".format(parser)
    );
  }
  return new MakeParsingFrontend(parser, lexer);
}
function _get_lexer_callbacks(transformer, terminals) {
  // Collect the transformer's per-terminal callback methods, keyed by
  // terminal name. Terminals with no matching method are skipped.
  // BUGFIX: `transformer` may legitimately be null (options.transformer
  // defaults to null); indexing null threw a TypeError. The Python
  // original used getattr(transformer, name, None), which tolerates None.
  let callback;
  let result = {};
  for (let terminal of terminals) {
    callback = (transformer && transformer[terminal.name]) || null;
    if (callback != null) {
      result[terminal.name] = callback;
    }
  }
  return result;
}
class PostLexConnector {
  // Adapter that chains a lexer with a post-lexer: tokens produced by
  // `lexer` are piped through `postlexer.process` before being returned.
  constructor(lexer, postlexer) {
    this.lexer = lexer;
    this.postlexer = postlexer;
  }

  make_lexer_state(text) {
    // State creation is delegated to the wrapped lexer unchanged.
    return this.lexer.make_lexer_state(text);
  }

  lex(lexer_state, parser_state) {
    // Lex first, then let the post-lexer rewrite the token stream.
    const tokens = this.lexer.lex(lexer_state, parser_state);
    return this.postlexer.process(tokens);
  }
}
function create_traditional_lexer(lexer_conf, parser, postlex) {
  // Factory for the standard (non-contextual) lexer. The `parser` and
  // `postlex` arguments exist only to match the other lexer factories'
  // signatures; they are intentionally unused here.
  const lexer = new TraditionalLexer(lexer_conf);
  return lexer;
}
function create_contextual_lexer(lexer_conf, parser, postlex) {
  // Factory for the contextual lexer: for every LALR parse-table state,
  // record the set of token types that state can accept, so the lexer can
  // restrict matching by parser context.
  // BUGFIX: `Array.from` is a plain function, not a constructor —
  // `new Array.from(...)` throws TypeError. Call it directly.
  let states = new Map(
    parser._parse_table.states
      .items()
      .map(([idx, t]) => [idx, Array.from(t.keys())])
  );
  let always_accept = postlex ? postlex.always_accept : [];
  return new ContextualLexer({
    lexer_conf,
    states,
    always_accept: always_accept,
  });
}
function create_lalr_parser({ lexer_conf, parser_conf, options = null } = {}) {
  // Factory for the LALR(1) parser; forwards the debug flag when options
  // were supplied (defaults to false otherwise).
  const debug = options ? options.debug : false;
  return new LALR_Parser({ parser_conf, debug: debug });
}
// Placeholders: the Earley and CYK parser frontends have not been ported yet.
// BUGFIX: these were implicit globals (sloppy-mode leaks); declare them
// explicitly at module scope. `let` (not `const`) so the WIP port can
// assign real implementations later.
let create_earley_parser = NotImplemented;
let CYK_FrontEnd = NotImplemented;
class LarkOptions extends Serialize {
  /*
    Specifies the options for Lark
    */
  get OPTIONS_DOC() {
    return `
    **===  General Options  ===**
    start
        The start symbol. Either a string, or a list of strings for multiple possible starts (Default: "start")
    debug
        Display debug information and extra warnings. Use only when debugging (default: False)
        When used with Earley, it generates a forest graph as "sppf.png", if 'dot' is installed.
    transformer
        Applies the transformer to every parse tree (equivalent to applying it after the parse, but faster)
    propagate_positions
        Propagates (line, column, end_line, end_column) attributes into all tree branches.
        Accepts ````False````, ````True````, or a callable, which will filter which nodes to ignore when propagating.
    maybe_placeholders
        When ````True````, the ````[]```` operator returns ````None```` when not matched.
        When ````False````,  ````[]```` behaves like the ````?```` operator, and returns no value at all.
        (default= ````False````. Recommended to set to ````True````)
    cache
        Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. LALR only for now.
        - When ````False````, does nothing (default)
        - When ````True````, caches to a temporary file in the local directory
        - When given a string, caches to the path pointed by the string
    regex
        When True, uses the ````regex```` module instead of the stdlib ````re````.
    g_regex_flags
        Flags that are applied to all terminals (both regex and strings)
    keep_all_tokens
        Prevent the tree builder from automagically removing "punctuation" tokens (default: False)
    tree_class
        Lark will produce trees comprised of instances of this class instead of the default ````lark.Tree````.
    **=== Algorithm Options ===**
    parser
        Decides which parser engine to use. Accepts "earley" or "lalr". (Default: "earley").
        (there is also a "cyk" option for legacy)
    lexer
        Decides whether or not to use a lexer stage
        - "auto" (default): Choose for me based on the parser
        - "standard": Use a standard lexer
        - "contextual": Stronger lexer (only works with parser="lalr")
        - "dynamic": Flexible and powerful (only with parser="earley")
        - "dynamic_complete": Same as dynamic, but tries *every* variation of tokenizing possible.
    ambiguity
        Decides how to handle ambiguity in the parse. Only relevant if parser="earley"
        - "resolve": The parser will automatically choose the simplest derivation
          (it chooses consistently: greedy for tokens, non-greedy for rules)
        - "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest).
        - "forest": The parser will return the root of the shared packed parse forest.
    **=== Misc. / Domain Specific Options ===**
    postlex
        Lexer post-processing (Default: None) Only works with the standard and contextual lexers.
    priority
        How priorities should be evaluated - auto, none, normal, invert (Default: auto)
    lexer_callbacks
        Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
    use_bytes
        Accept an input of type ````bytes```` instead of ````str```` (Python 3 only).
    edit_terminals
        A callback for editing the terminals before parse.
    import_paths
        A List of either paths or loader functions to specify from where grammars are imported
    source_path
        Override the source of from where the grammar was loaded. Useful for relative imports and unconventional grammar loading
    **=== End Options ===**
    `;
  }

  // Adding a new option needs to be done in multiple places:
  // - In the dictionary below. This is the primary truth of which options `Lark.__init__` accepts
  // - In the docstring above. It is used both for the docstring of `LarkOptions` and `Lark`, and in readthedocs
  // - In `lark-stubs/lark.pyi`:
  //   - As attribute to `LarkOptions`
  //   - As parameter to `Lark.__init__`
  // - Potentially in `_LOAD_ALLOWED_OPTIONS` below this class, when the option doesn't change how the grammar is loaded
  // - Potentially in `lark.tools.__init__`, if it makes sense, and it can easily be passed as a cmd argument
  get _defaults() {
    return {
      debug: false,
      keep_all_tokens: false,
      tree_class: null,
      cache: false,
      postlex: null,
      parser: "earley",
      lexer: "auto",
      transformer: null,
      start: "start",
      priority: "auto",
      ambiguity: "auto",
      regex: false,
      propagate_positions: false,
      lexer_callbacks: {},
      maybe_placeholders: false,
      edit_terminals: null,
      g_regex_flags: 0,
      use_bytes: false,
      import_paths: [],
      source_path: null,
    };
  }

  // Build the validated options dictionary from `options_dict`.
  // Unknown keys cause a ConfigurationError; boolean defaults coerce their
  // value (except for options that also accept non-boolean values).
  constructor(options_dict) {
    super();
    let value;
    let o = dict(options_dict);
    let options = {};
    for (let [name, default_] of this._defaults.items()) {
      // BUGFIX: `name in o` tested the Map *object's* properties, never its
      // keys — every option fell back to its default. Use Map.has().
      if (o.has(name)) {
        value = o.pop(name);
        // BUGFIX: `default_ instanceof bool` / `bool(value)` were Python-isms
        // (ReferenceError in JS); use typeof / Boolean instead. `name in [...]`
        // tested array indices — use includes().
        if (
          typeof default_ === "boolean" &&
          !["cache", "use_bytes", "propagate_positions"].includes(name)
        ) {
          value = Boolean(value);
        }
      } else {
        value = default_;
      }
      options[name] = value;
    }
    // A single start symbol is normalized to a one-element list.
    if (typeof options["start"] === "string") {
      options["start"] = [options["start"]];
    }
    this["options"] = options;
    // BUGFIX: `this.parser` / `this.transformer` relied on Python's
    // __getattr__, which JavaScript never invokes — they were always
    // undefined here. Read from the local options object instead.
    assert_config(options["parser"], ["earley", "lalr", "cyk", null]);
    if (options["parser"] === "earley" && options["transformer"]) {
      throw new ConfigurationError(
        "Cannot specify an embedded transformer when using the Earley algorithm. " +
          "Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. LALR)"
      );
    }
    // BUGFIX: a Map is always truthy, so `if (o)` threw "Unknown options"
    // on every construction. Check the number of leftover (unconsumed) keys.
    if (o.size > 0) {
      throw new ConfigurationError(
        "Unknown options: %s".format(Array.from(o.keys()))
      );
    }
  }

  // NOTE(review): Python dunder; JS does not call this automatically.
  // Callers must invoke it explicitly (or a Proxy must route to it).
  __getattr__(name) {
    try {
      return this["options"][name];
    } catch (e) {
      if (e instanceof KeyError) {
        throw new AttributeError(e);
      } else {
        throw e;
      }
    }
  }

  // NOTE(review): Python dunder; JS does not call this automatically.
  __setattr__(name, value) {
    assert_config(
      name,
      this.options.keys(),
      "%r isn't a valid option. Expected one of: %s"
    );
    this.options[name] = value;
  }

  serialize(memo) {
    return this.options;
  }

  static deserialize(data, memo) {
    // BUGFIX: `cls` was an implicit global; declare it.
    const cls = LarkOptions;
    return new cls(data);
  }
} // Options that can be passed to the Lark parser, even when it was loaded from cache/standalone.
// These options are only used outside of `load_grammar`, so they remain
// legal when a parser is loaded from cache/standalone.
// BUGFIX: these were implicit globals; declare them explicitly as constants.
const _LOAD_ALLOWED_OPTIONS = new Set([
  "postlex",
  "transformer",
  "lexer_callbacks",
  "use_bytes",
  "debug",
  "g_regex_flags",
  "regex",
  "propagate_positions",
  "tree_class",
]);
// Legal values for the `priority` option (null disables priorities entirely).
const _VALID_PRIORITY_OPTIONS = ["auto", "normal", "invert", null];
// Legal values for the `ambiguity` option (Earley only).
const _VALID_AMBIGUITY_OPTIONS = ["auto", "resolve", "explicit", "forest"];
class PostLex extends ABC {
  /*
    Abstract base for lexer post-processors. The default implementation is
    a no-op: the token stream passes through untouched and no extra
    terminal types are unconditionally accepted.
  */
  process(stream) {
    // Identity transform — subclasses override to rewrite the stream.
    return stream;
  }

  get always_accept() {
    // No terminal types are always accepted by default.
    return [];
  }
}
class Lark extends Serialize {
  /*
    Main interface for the library.
    It's mostly a thin wrapper for the many different parsers, and for the tree constructor.
    Parameters:
        grammar: a string or file-object containing the grammar spec (using Lark's ebnf syntax)
        options: a dictionary controlling various aspects of Lark.
    Example:
        >>> Lark(r'''start: "foo" ''')
        Lark(...)
    */
  // Builds the full pipeline: options -> grammar load (optionally cached) ->
  // lexer/ambiguity/priority resolution -> grammar compile -> parser build.
  // NOTE(review): this constructor is a WIP transliteration from Python; the
  // cache branch still references Python-only objects (pickle, hashlib,
  // tempfile, sys, an unopened file handle `f`) and is presumably
  // non-functional in JS as written — verify before enabling options.cache.
  constructor({ grammar, ...options } = {}) {
    super();
    let cached_parser_data,
      read,
      s,
      terminals_to_keep,
      file_md5,
      options_str,
      re_module,
      cached_used_files,
      old_options,
      unhashable;
    this.options = new LarkOptions(options);
    let // Set regex or re module
      use_regex = this.options.regex;
    if (use_regex) {
      if (regex) {
        re_module = regex;
      } else {
        throw new ImportError(
          "`regex` module must be installed if calling `Lark(regex=True)`."
        );
      }
    } else {
      re_module = re;
    } // Some, but not all file-like objects have a 'name' attribute
    if (this.options.source_path === null) {
      if (grammar && grammar.name) {
        this.source_path = grammar.name;
      } else {
        this.source_path = "<string>";
      }
    } else {
      this.source_path = this.options.source_path;
    } // Drain file-like objects to get their contents
    if (grammar && grammar.read) {
      // Drain file-like objects to get their contents
      read = grammar.read;
      grammar = read();
    } else {
      // pass
    }
    let cache_fn = null;
    let cache_md5 = null;
    if (typeof grammar === "string") {
      this.source_grammar = grammar;
      if (this.options.use_bytes) {
        if (!isascii(grammar)) {
          throw new ConfigurationError(
            "Grammar must be ascii only, when use_bytes=True"
          );
        }
      }
      // NOTE(review): everything inside this cache branch leans on Python
      // stdlib objects (hashlib, tempfile, sys, pickle) and an `f` that is
      // never opened in this port — likely dead/broken until reworked.
      if (this.options.cache) {
        if (this.options.parser != "lalr") {
          throw new ConfigurationError(
            "cache only works with parser='lalr' for now"
          );
        }
        unhashable = [
          "transformer",
          "postlex",
          "lexer_callbacks",
          "edit_terminals",
        ];
        // NOTE(review): `k in unhashable` tests array indices, not values —
        // a surviving Python-ism; confirm against the upstream intent.
        options_str = "".join(
          options
            .items()
            .filter(([k, v]) => !(k in unhashable))
            .map(([k, v]) => k + str(v))
        );
        s =
          grammar + options_str + __version__ + str(sys.version_info.slice(2));
        cache_md5 = hashlib.md5(s.encode("utf8")).hexdigest();
        if (typeof this.options.cache === "string") {
          cache_fn = this.options.cache;
        } else {
          if (this.options.cache != true) {
            throw new ConfigurationError("cache argument must be bool or str");
          } // Python2.7 doesn't support * syntax in tuples
          cache_fn =
            tempfile.gettempdir() +
            "/.lark_cache_%s_%s_%s.tmp".format(
              [cache_md5] + sys.version_info.slice(2)
            );
        }
        if (new FS.exists(cache_fn)) {
          logger.debug("Loading grammar from cache: %s", cache_fn); // Remove options that aren't relevant for loading from cache
          // NOTE(review): `Set - Set` is Python set subtraction; in JS it
          // evaluates to NaN, and the loop body is empty anyway — TODO port.
          for (let name of new Set(options) - _LOAD_ALLOWED_OPTIONS) {
          }
          old_options = this.options;
          try {
            // NOTE(review): `f` is undefined here (no file was opened) —
            // this try block cannot currently succeed.
            file_md5 = f.readline().rstrip("\n");
            cached_used_files = pickle.load(f);
            if (
              file_md5 === cache_md5.encode("utf8") &&
              verify_used_files(cached_used_files)
            ) {
              cached_parser_data = pickle.load(f);
              this._load({ cached_parser_data, ...options });
              return;
            }
          } catch (e) {
            if (e instanceof Error) {
              // We should probably narrow done which errors we catch here.
              logger.exception(
                "Failed to load Lark from cache: %r. We will try to carry on.".format(
                  cache_fn
                )
              ); // In theory, the Lark instance might have been messed up by the call to `_load`.
              // In practice the only relevant thing that might have been overriden should be `options`
              this.options = old_options;
            } else {
              throw e;
            }
          }
        }
      }
      // NOTE(review): `used_files` is assigned without declaration (implicit
      // global) — a transpilation leftover.
      [
        // Parse the grammar file and compose the grammars
        this.grammar,
        used_files,
      ] = load_grammar(
        grammar,
        this.source_path,
        this.options.import_paths,
        this.options.keep_all_tokens
      );
    } else {
      console.assert(grammar instanceof Grammar);
      this.grammar = grammar;
    }
    // Resolve lexer="auto" based on the chosen parser engine.
    if (this.options.lexer === "auto") {
      if (this.options.parser === "lalr") {
        this.options.lexer = "contextual";
      } else if (this.options.parser === "earley") {
        if (this.options.postlex != null) {
          logger.info(
            "postlex can't be used with the dynamic lexer, so we use standard instead. " +
              "Consider using lalr with contextual instead of earley"
          );
          this.options.lexer = "standard";
        } else {
          this.options.lexer = "dynamic";
        }
      } else if (this.options.parser === "cyk") {
        this.options.lexer = "standard";
      } else {
        console.assert(false, this.options.parser);
      }
    }
    let lexer = this.options.lexer;
    if (typeof lexer === "object") {
      // NOTE(review): in JS a custom lexer *class* has typeof "function",
      // not "object" — confirm how custom lexers are meant to be passed.
      console.assert(issubclass(lexer, Lexer));
    } else {
      // XXX Is this really important? Maybe just ensure interface compliance
      assert_config(lexer, [
        "standard",
        "contextual",
        "dynamic",
        "dynamic_complete",
      ]);
      // NOTE(review): `"dynamic" in lexer` on a string checks properties,
      // not substring containment — surviving Python-ism; TODO confirm.
      if (this.options.postlex != null && "dynamic" in lexer) {
        throw new ConfigurationError(
          "Can't use postlex with a dynamic lexer. Use standard or contextual instead"
        );
      }
    }
    // Resolve ambiguity="auto" (only Earley supports disambiguation).
    if (this.options.ambiguity === "auto") {
      if (this.options.parser === "earley") {
        this.options.ambiguity = "resolve";
      }
    } else {
      assert_config(
        this.options.parser,
        ["earley", "cyk"],
        "%r doesn't support disambiguation. Use one of these parsers instead: %s"
      );
    }
    if (this.options.priority === "auto") {
      this.options.priority = "normal";
    }
    // NOTE(review): `x in array` below checks indices, not membership —
    // these validations likely always fail/trivially pass; TODO port.
    if (!(this.options.priority in _VALID_PRIORITY_OPTIONS)) {
      throw new ConfigurationError(
        "invalid priority option: %r. Must be one of %r".format(
          this.options.priority,
          _VALID_PRIORITY_OPTIONS
        )
      );
    }
    console.assert(
      !(this.options.ambiguity in ["resolve__antiscore_sum"]),
      'resolve__antiscore_sum has been replaced with the option priority="invert"'
    );
    if (!(this.options.ambiguity in _VALID_AMBIGUITY_OPTIONS)) {
      throw new ConfigurationError(
        "invalid ambiguity option: %r. Must be one of %r".format(
          this.options.ambiguity,
          _VALID_AMBIGUITY_OPTIONS
        )
      );
    }
    // Terminals referenced by the postlexer must survive grammar compilation.
    if (this.options.postlex != null) {
      terminals_to_keep = new Set(this.options.postlex.always_accept);
    } else {
      terminals_to_keep = new Set();
    }
    [
      // Compile the EBNF grammar into BNF
      this.terminals,
      this.rules,
      this.ignore_tokens,
    ] = this.grammar.compile(this.options.start, terminals_to_keep);
    if (this.options.edit_terminals) {
      for (let t of this.terminals) {
        this.options.edit_terminals(t);
      }
    }
    this._terminals_dict = new Map(this.terminals.map((t) => [t.name, t])); // If the user asked to invert the priorities, negate them all here.
    // This replaces the old 'resolve__antiscore_sum' option.
    if (this.options.priority === "invert") {
      for (let rule of this.rules) {
        if (rule.options.priority != null) {
          rule.options.priority = -rule.options.priority;
        }
      }
    }
    // Else, if the user asked to disable priorities, strip them from the
    // rules. This allows the Earley parsers to skip an extra forest walk
    // for improved performance, if you don't need them (or didn't specify any).
    else if (this.options.priority === null) {
      for (let rule of this.rules) {
        if (rule.options.priority != null) {
          rule.options.priority = null;
        }
      }
    } // TODO Deprecate lexer_callbacks?
    // NOTE(review): the duplicate `unknown_param` keys are transpiler
    // placeholders — later keys silently overwrite earlier ones, so most
    // of these arguments never reach LexerConf. TODO: name them properly.
    this.lexer_conf = new LexerConf({
      unknown_param: this.terminals,
      re_module,
      unknown_param: this.ignore_tokens,
      unknown_param: this.options.postlex,
      unknown_param: this.options.lexer_callbacks,
      unknown_param: this.options.g_regex_flags,
      use_bytes: this.options.use_bytes,
    });
    if (this.options.parser) {
      this.parser = this._build_parser();
    } else if (lexer) {
      this.lexer = this._build_lexer();
    }
    // NOTE(review): `f` and `pickle` are undefined here — cache saving is
    // not yet ported.
    if (cache_fn) {
      logger.debug("Saving grammar to cache: %s", cache_fn);
      f.write(cache_md5.encode("utf8") + "\n");
      pickle.dump(used_files, f);
      this.save(f);
    }
  }

  // Build a TraditionalLexer from this.lexer_conf; with dont_ignore=true a
  // copy of the conf is used with the %ignore list cleared.
  _build_lexer({ dont_ignore = false } = {}) {
    let lexer_conf = this.lexer_conf;
    if (dont_ignore) {
      lexer_conf = copy(lexer_conf);
      lexer_conf.ignore = [];
    }
    return new TraditionalLexer(lexer_conf);
  }

  // Prepare the rule callbacks used during tree construction, plus
  // per-terminal callbacks taken from the transformer.
  // NOTE(review): `this._callbacks.update(...)` assumes a Python dict-style
  // .update on a plain object — not defined here; TODO confirm/port.
  _prepare_callbacks() {
    this._callbacks = {}; // we don't need these callbacks if we aren't building a tree
    if (this.options.ambiguity != "forest") {
      this._parse_tree_builder = new ParseTreeBuilder(
        this.rules,
        this.options.tree_class || Tree,
        this.options.propagate_positions,
        this.options.parser != "lalr" && this.options.ambiguity === "explicit",
        this.options.maybe_placeholders
      );
      this._callbacks = this._parse_tree_builder.create_callback(
        this.options.transformer
      );
    }
    this._callbacks.update(
      _get_lexer_callbacks(this.options.transformer, this.terminals)
    );
  }

  // Construct the parsing frontend for the configured parser+lexer combo.
  _build_parser() {
    this._prepare_callbacks();
    let parser_class = get_frontend(this.options.parser, this.options.lexer);
    let parser_conf = new ParserConf(
      this.rules,
      this._callbacks,
      this.options.start
    );
    return parser_class({
      unknown_param: this.lexer_conf,
      parser_conf,
      options: this.options,
    });
  }

  save(f) {
    /*
    Saves the instance into the given file object
    Useful for caching and multiprocessing.
    */
    // NOTE(review): `data`/`m` are implicit globals and `pickle` is a
    // Python object — serialization is not yet ported to JS.
    [data, m] = this.memo_serialize([TerminalDef, Rule]);
    pickle.dump({
      unknown_param: { data: data, memo: m },
      f,
      protocol: pickle.HIGHEST_PROTOCOL,
    });
  }

  // Loads an instance previously written by save(); see _load for details.
  static load(f) {
    cls = Lark; /*
    Loads an instance from the given file object
    Useful for caching and multiprocessing.
    */
    let inst = cls.__new__(cls);
    return inst._load(f);
  }

  // Rebuild a LexerConf from serialized data, then overlay the runtime-only
  // options (callbacks, regex module, flags, postlex) that aren't serialized.
  _deserialize_lexer_conf(data, memo, options) {
    let lexer_conf = new LexerConf.deserialize(data["lexer_conf"], memo);
    lexer_conf.callbacks = options.lexer_callbacks || {};
    lexer_conf.re_module = options.regex ? regex : re;
    lexer_conf.use_bytes = options.use_bytes;
    lexer_conf.g_regex_flags = options.g_regex_flags;
    lexer_conf.skip_validation = true;
    lexer_conf.postlex = options.postlex;
    return lexer_conf;
  }

  // Populate this instance from serialized data (`f` may be a dict-like or
  // a file to unpickle). Only _LOAD_ALLOWED_OPTIONS may be overridden here.
  _load({ f, ...kwargs } = {}) {
    let d;
    if (f instanceof dict) {
      d = f;
    } else {
      d = pickle.load(f);
    }
    let memo_json = d["memo"];
    let data = d["data"];
    console.assert(memo_json);
    let memo = new SerializeMemoizer.deserialize(
      memo_json,
      { Rule: Rule, TerminalDef: TerminalDef },
      {}
    );
    let options = dict(data["options"]);
    // NOTE(review): `Set - Set` / `&` below are Python set operations that
    // don't work on JS Sets — this validation is not yet ported.
    if (
      (new Set(kwargs) - _LOAD_ALLOWED_OPTIONS) &
      new Set(LarkOptions._defaults)
    ) {
      throw new ConfigurationError(
        "Some options are not allowed when loading a Parser: {}".format(
          new Set(kwargs) - _LOAD_ALLOWED_OPTIONS
        )
      );
    }
    options.update(kwargs);
    this.options = new LarkOptions.deserialize(options, memo);
    this.rules = data["rules"].map((r) => new Rule.deserialize(r, memo));
    this.source_path = "<deserialized>";
    let parser_class = get_frontend(this.options.parser, this.options.lexer);
    this.lexer_conf = this._deserialize_lexer_conf(
      data["parser"],
      memo,
      this.options
    );
    this.terminals = this.lexer_conf.terminals;
    this._prepare_callbacks();
    this._terminals_dict = new Map(this.terminals.map((t) => [t.name, t]));
    this.parser = parser_class.deserialize(
      data["parser"],
      memo,
      this.lexer_conf,
      this._callbacks,
      this.options
    );
    return this;
  }

  // Like load(), but from an already-deserialized {data, memo} pair.
  static _load_from_dict({ data, memo, ...kwargs } = {}) {
    cls = Lark;
    let inst = cls.__new__(cls);
    return inst._load({ unknown_param: { data: data, memo: memo }, ...kwargs });
  }

  static open({ grammar_filename, rel_to = null, ...options } = {}) {
    cls = Lark;
    let basepath; /*
    Create an instance of Lark with the grammar given by its filename
    If ``rel_to`` is provided, the function will find the grammar filename in relation to it.
    Example:
        >>> Lark.open("grammar_file.lark", rel_to=__file__, parser="lalr")
        Lark(...)
    */
    if (rel_to) {
      basepath = os.path.dirname(rel_to);
      grammar_filename = os.path.join(basepath, grammar_filename);
    }
    // NOTE(review): `f` is undefined and grammar_filename is never opened —
    // file loading is not yet ported.
    return new cls({ f, ...options });
  }

  static open_from_package({
    package_,
    grammar_path,
    search_paths = [""],
    ...options
  } = {}) {
    cls = Lark; /*
    Create an instance of Lark with the grammar loaded from within the package `package`.
    This allows grammar loading from zipapps.
    Imports in the grammar will use the `package` and `search_paths` provided, through `FromPackageLoader`
    Example:
        Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...)
    */
    let package_loader = new FromPackageLoader(package_, search_paths);
    [full_path, text] = package_loader(null, grammar_path);
    options.setdefault("source_path", full_path);
    options.setdefault("import_paths", []);
    options["import_paths"].push(package_loader);
    return new cls({ text, ...options });
  }

  __repr__() {
    return "Lark(open(%r), parser=%r, lexer=%r, ...)".format(
      this.source_path,
      this.options.parser,
      this.options.lexer
    );
  }

  lex({ text, dont_ignore = false } = {}) {
    let lexer; /*
    Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard'
    When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore.
    */
    if (!hasattr(this, "lexer") || dont_ignore) {
      lexer = this._build_lexer({ dont_ignore });
    } else {
      lexer = this.lexer;
    }
    let lexer_thread = new LexerThread(lexer, text);
    let stream = lexer_thread.lex(null);
    if (this.options.postlex) {
      return this.options.postlex.process(stream);
    }
    return stream;
  }

  get_terminal(name) {
    /*
    Get information about a terminal
    */
    // NOTE(review): _terminals_dict is a Map; bracket access reads an
    // object property and returns undefined — presumably should be .get(name).
    return this._terminals_dict[name];
  }

  parse_interactive({ text = null, start = null } = {}) {
    /*
    Start an interactive parsing session.
    Parameters:
        text (str, optional): Text to be parsed. Required for ``resume_parse()``.
        start (str, optional): Start symbol
    Returns:
        A new InteractiveParser instance.
    See Also: ``Lark.parse()``
    */
    return this.parser.parse_interactive({ text, start: start });
  }

  parse({ text, start = null, on_error = null } = {}) {
    /*
    Parse the given text, according to the options provided.
    Parameters:
        text (str): Text to be parsed.
        start (str, optional): Required if Lark was given multiple possible start symbols (using the start option).
        on_error (function, optional): if provided, will be called on UnexpectedToken error. Return true to resume parsing.
            LALR only. See examples/advanced/error_handling.py for an example of how to use on_error.
    Returns:
        If a transformer is supplied to ``__init__``, returns whatever is the
        result of the transformation. Otherwise, returns a Tree instance.
    */
    return this.parser.parse({ text, start: start, on_error: on_error });
  }
}
class DedentError extends LarkError {
  // Raised by the Indenter when a dedent doesn't line up with any
  // previously seen indentation level.
}
class Indenter extends PostLex {
  /*
    Post-lexer that converts newline tokens into INDENT/DEDENT tokens,
    Python-style. Subclasses must define: NL_type, OPEN_PAREN_types,
    CLOSE_PAREN_types, INDENT_type, DEDENT_type and tab_len.
  */
  constructor() {
    super();
    // Both counters are reset by process(); null until then.
    this.paren_level = null;
    this.indent_level = null;
    console.assert(this.tab_len > 0);
  }

  // Handle a newline token: emit it, then emit INDENT/DEDENT tokens as the
  // indentation after the newline grows/shrinks. Throws DedentError when a
  // dedent doesn't match any open indentation level.
  *handle_NL(token) {
    // Newlines inside parentheses/brackets are not significant.
    if (this.paren_level > 0) {
      return;
    }
    yield token;
    // BUGFIX: token.rsplit / .count are Python string methods that don't
    // exist in JS. indent_str = text after the last newline in the token.
    // (Token is assumed string-like — String() makes that explicit.)
    let text = String(token);
    let indent_str = text.slice(text.lastIndexOf("\n") + 1);
    // Tabs and spaces: count occurrences via split.
    let indent =
      indent_str.split(" ").length -
      1 +
      (indent_str.split("\t").length - 1) * this.tab_len;
    // BUGFIX: JS arrays don't support negative indexing — `arr[-1]` was
    // always undefined; index the last element explicitly.
    if (indent > this.indent_level[this.indent_level.length - 1]) {
      this.indent_level.push(indent);
      // BUGFIX: class shorthand methods are not constructors; calling
      // `new Token.new_borrow_pos(...)` throws. Call the static directly.
      yield Token.new_borrow_pos(this.INDENT_type, indent_str, token);
    } else {
      while (indent < this.indent_level[this.indent_level.length - 1]) {
        this.indent_level.pop();
        yield Token.new_borrow_pos(this.DEDENT_type, indent_str, token);
      }
      if (indent != this.indent_level[this.indent_level.length - 1]) {
        throw new DedentError(
          "Unexpected dedent to column %s. Expected dedent to %s".format(
            indent,
            this.indent_level[this.indent_level.length - 1]
          )
        );
      }
    }
  }

  // Generator over the post-processed stream: expands newlines via
  // handle_NL, tracks paren depth, and flushes pending DEDENTs at EOF.
  *_process(stream) {
    for (let token of stream) {
      if (token.type === this.NL_type) {
        for (let t of this.handle_NL(token)) {
          yield t;
        }
      } else {
        yield token;
      }
      // BUGFIX: `x in array` tests indices, not membership — the paren
      // counters never moved. Use includes().
      if (this.OPEN_PAREN_types.includes(token.type)) {
        this.paren_level += 1;
      } else if (this.CLOSE_PAREN_types.includes(token.type)) {
        this.paren_level -= 1;
        console.assert(this.paren_level >= 0);
      }
    }
    while (this.indent_level.length > 1) {
      this.indent_level.pop();
      yield new Token(this.DEDENT_type, "");
    }
    // BUGFIX: `=== [0]` compared object identity and was always false;
    // check the contents instead.
    console.assert(
      this.indent_level.length === 1 && this.indent_level[0] === 0,
      this.indent_level
    );
  }

  process(stream) {
    // Reset bookkeeping, then lazily post-process the stream.
    this.paren_level = 0;
    this.indent_level = [0];
    return this._process(stream);
  }

  get always_accept() {
    // The parser must always accept newline tokens so this post-lexer sees them.
    return [this.NL_type];
  }
}
// Public API of this module: re-export every class and function defined above
// (errors, tree/transformer machinery, lexers, parser frontends, and Lark itself).
module.exports = {
  LarkError,
  ConfigurationError,
  assert_config,
  GrammarError,
  ParseError,
  LexError,
  UnexpectedInput,
  UnexpectedEOF,
  UnexpectedCharacters,
  UnexpectedToken,
  VisitError,
  Meta,
  Discard,
  Transformer,
  Transformer_InPlace,
  Transformer_NonRecursive,
  Transformer_InPlaceRecursive,
  VisitorBase,
  Visitor,
  Visitor_Recursive,
  Interpreter,
  Symbol,
  Terminal,
  NonTerminal,
  RuleOptions,
  Rule,
  Pattern,
  PatternStr,
  PatternRE,
  Token,
  LineCounter,
  UnlessCallback,
  CallChain,
  Lexer,
  TraditionalLexer,
  LexerState,
  ContextualLexer,
  LexerThread,
  LexerConf,
  ParserConf,
  ExpandSingleChild,
  PropagatePositions,
  make_propagate_positions,
  ChildFilter,
  ChildFilterLALR,
  ChildFilterLALR_NoPlaceholders,
  _should_expand,
  maybe_create_child_filter,
  AmbiguousExpander,
  maybe_create_ambiguous_expander,
  AmbiguousIntermediateExpander,
  inplace_transformer,
  apply_visit_wrapper,
  ParseTreeBuilder,
  LALR_Parser,
  ParseConf,
  ParserState,
  _Parser,
  Action,
  ParseTable,
  IntParseTable,
  _wrap_lexer,
  MakeParsingFrontend,
  ParsingFrontend,
  get_frontend,
  _get_lexer_callbacks,
  PostLexConnector,
  create_traditional_lexer,
  create_contextual_lexer,
  create_lalr_parser,
  LarkOptions,
  PostLex,
  Lark,
  DedentError,
  Indenter,
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment