A template-based BNF parser inspired by @kach/nearly.
I'm starting by cleaning up the nearly/nearly.js source code.
export class Rule {
static highestId = 0;
id = ++Rule.highestId;
constructor(name, symbols, postprocess) {
this.name = name;
this.symbols = symbols;
this.postprocess = postprocess;
}
toString(withCursorAt) {
return `${this.name} → ${
typeof withCursorAt === "undefined"
? this.symbols.map(Parser.getSymbolShortDisplay).join(" ")
: `${this.symbols
.slice(0, withCursorAt)
.map(Parser.getSymbolShortDisplay)
.join(" ")} ● ${this.symbols
.slice(withCursorAt)
.map(Parser.getSymbolShortDisplay)
.join(" ")}`
}`;
}
static from(rule) {
return new this(rule.name, rule.symbol, rule.postprocess);
}
}
class State {
constructor(rule, dot, reference, wantedBy) {
this.rule = rule;
this.dot = dot;
this.reference = reference;
this.data = [];
this.wantedBy = wantedBy;
this.isComplete = this.dot === rule.symbols.length;
}
toString() {
return `{${this.rule.toString(this.dot)}}, from: ${this.reference || 0}`;
}
nextState(child) {
const state = new State(
this.rule,
this.dot + 1,
this.reference,
this.wantedBy
);
state.left = this;
state.right = child;
if (state.isComplete) {
state.data = state.build();
// Having right set here will prevent the right state and its children form being garbage collected
state.right = undefined;
}
return state;
}
build() {
const children = [];
for (let node = this; node.left; node = node.left)
children.push(node.right.data);
return children;
}
finish() {
if (this.rule.postprocess)
this.data = this.rule.postprocess(this.data, this.reference, Parser.fail);
}
}
class Column {
constructor(grammar, index) {
this.grammar = grammar;
this.index = index;
this.states = [];
this.wants = {}; // states indexed by the non-terminal they expect
this.scannable = []; // list of states that expect a token
this.completed = {}; // states that are nullable
}
process(nextColumn) {
const [states, wants, completed] = this;
for (let w = 0, state; w < states.length; w++) {
// nb. we push() during iteration
state = states[w];
if (state.isComplete) {
state.finish();
if (state.data !== Parser.fail) {
// complete
let wantedBy = state.wantedBy;
for (
let i = wantedBy.length;
i--;
this.complete(/* left */ wantedBy[i], state)
); // this line is hot
// special-case nullables
if (state.reference === this.index) {
// make sure future predictors of this rule get completed.
const exp = state.rule.name;
(this.completed[exp] = this.completed[exp] || []).push(state);
}
}
} else {
// queue scannable states
const exp = state.rule.symbols[state.dot];
if (typeof exp !== "string") {
this.scannable.push(state);
continue;
}
// predict
if (wants[exp]) {
wants[exp].push(state);
if (completed.hasOwnProperty(exp))
for (
let nulls = completed[exp], i = 0;
i < nulls.length;
this.complete(state, /* right */ nulls[i++])
);
} else {
wants[exp] = [state];
this.predict(exp);
}
}
}
}
predict(ext) {
const rules = this.grammar.byName[exp];
if (rules)
for (
let i = 0;
i < rules.length;
this.states.push(
new State(rules[i++], 0, this.index, /* wantedBy */ this.wants[exp])
)
);
}
complete(left, right) {
this.states.push(/* copy */ left.nextState(right));
}
}
export class Grammar {
constructor(rules, start) {
this.rules = rules;
this.start = start || rules[0].name;
const byName = {};
for (const rule of rules)
(byName[rule.name] || (byName[rule.name] = [])).push(rule);
this.byName = byName;
}
fromCompiled(rules, start) {
// So we can allow passing (rules, start) directly to Parser for backwards compatibility
const grammar = rules.ParserStart
? new Grammar(rules.ParserRules.map(Rule.from, Rule), rules.ParserStart)
: new Grammar(rules.map(Rule.from, Rule), start);
// nb. storing lexer on Grammar is iffy, but unavoidable
grammar.lexer = rules.Lexer;
return grammar;
}
}
class StreamLexer {
buffer = "";
index = 0;
line = 1;
lastLineBreak = 0;
reset(data, state) {
this.buffer = data;
this.index = 0;
this.line = state ? state.line : 1;
this.lastLineBreak = state ? -state.col : 0;
}
next() {
if (this.index < this.buffer.length) {
const ch = this.buffer[this.index++];
if (ch === "\n") {
this.line += 1;
this.lastLineBreak = this.index;
}
return { value: ch };
}
}
save() {
return {
line: this.line,
col: this.index - this.lastLineBreak,
};
}
formatError(token, message) {
// nb. this gets called after consuming the offending token, so the culprit is index-1
const { buffer, line, index, lastLineBreak } = this;
if (typeof buffer === "string") {
const col = index - lastLineBreak;
const lastLineDigits = `${line}`.length;
const lines = buffer.split("\n").slice(Math.max(0, line - 5), line);
// let nextLineBreak = buffer.indexOf("\n", index);
// if (nextLineBreak === -1) nextLineBreak = buffer.length;
for (
let i = 0;
i < lines.length;
lines[i] = `${`${line - lines.length + i + 1}`.padStart(
lastLineDigits
)}${lines[i]}`
);
return `${message} at line ${line} col ${col}:\n\n${lines.join(
"\n"
)}\n${" ".repeat(lastLineDigits + col)}^\n`;
return message;
} else {
return `${message} at index ${index - 1}`;
}
}
}
export class Parser {
static fail = {};
constructor(grammar, options) {
this.grammar = grammar;
this.options = {
keepHistory: false,
lexer: grammar.lexer || new StreamLexer(),
...options,
};
// Setup lexer
this.lexerState = undefined;
// Setup a table
const column = new Column(grammar, 0);
// I could be expecting anything.
column.wants[grammar.start] = [];
column.predict(grammar.start);
// TODO what if start rule is nullable?
column.process();
this.table = [column];
this.current = 0; // token index
}
get lexer() {
return this.options.lexer;
}
feed(chunk) {
const { lexer, table } = this;
lexer.reset(chunk, this.lexerState);
let token,
current,
error,
column,
nextColumn,
literal,
value,
scannable,
state,
nextState,
expected;
while (true) {
token =
current =
error =
column =
nextColumn =
literal =
value =
scannable =
state =
nextState =
expected =
undefined;
try {
token = lexer.next();
if (!token) break;
} catch (exception) {
// Create the next column so that the error reporter
// can display the correctly predicted states.
this.table.push(
(nextColumn = new Column(this.grammar, this.current + 1))
);
error = new Error(this.reportLexerError(exception));
error.offset = this.current;
error.token = exception.token;
throw error;
}
// We add new states to table[current+1]
column = this.table[this.current];
current = this.current + 1;
nextColumn = new Column(this.grammar, current);
// GC unused states
if (!this.options.keepHistory) delete this.table[this.current - 1];
this.table.push(nextColumn);
// Advance all tokens that expect the symbol
literal = token.text !== undefined ? token.text : token.value;
value = lexer.constructor === StreamLexer ? token.value : token;
scannable = column.scannable;
for (let w = scannable.length; w--; ) {
state = scannable[w];
expect = state.rule.symbols[state.dot];
// Try to consume the token
// either regex or literal
if (
expect.test
? expect.test(value)
: expect.type
? expect.type === token.type
: expect.literal === literal
) {
// Add it
nextState = state.nextState({
data: value,
token: token,
isToken: true,
reference: current - 1,
});
nextColumn.states.push(nextState);
}
}
// Next, for each of the rules, we either
// (a) complete it, and try to see if the reference row expected that
// rule
// (b) predict the next nonterminal it expects by adding that
// nonterminal's start state
// To prevent duplication, we also keep track of rules we have already
// added
nextColumn.process();
// If needed, throw an error:
if (nextColumn.states.length === 0) {
// No states at all! This is not good.
error = new Error(this.reportError(token));
error.offset = this.current;
error.token = token;
throw error;
}
// maybe save lexer state
if (this.options.keepHistory) {
column.lexerState = lexer.save();
}
this.current++;
}
if (column) {
this.lexerState = lexer.save();
}
// Incrementally keep track of results
this.results = this.finish();
// Allow chaining, for whatever it's worth
return this;
}
save() {
const column = this.table[this.current];
column.lexerState = this.lexerState;
return column;
}
restore(column) {
const index = column.index;
this.current = index;
this.table[index] = column;
this.table.splice(index + 1);
this.lexerState = column.lexerState;
// Incrementally keep track of results
this.results = this.finish();
}
rewind(index) {
if (!this.options.keepHistory)
throw new Error("set option `keepHistory` to enable rewinding");
// nb. recall column (table) indicies fall between token indicies.
// col 0 -- token 0 -- col 1
this.restore(this.table[index]);
}
finish() {
// Return the possible parsings
const considerations = [];
const data = [];
const start = this.grammar.start;
for (const state of this.table[this.table.length - 1].states)
state.rule.name === start &&
state.dot === state.rule.symbols.length &&
state.reference === 0 &&
state.data !== Parser.fail &&
(considerations.push(state), data.push(state.data));
return data;
}
reportLexerError(lexerError) {
// Planning to add a token property to moo's thrown error
// even on erroring tokens to be used in error display below
const token = lexerError.token;
return token
? this.reportErrorCommon(
this.lexer.formatError(token, "Syntax error"),
`input ${JSON.stringify(token.text[0])} (lexer error)`
)
: this.reportErrorCommon(lexerError.message, "input (lexer error)");
}
reportError(token) {
return this.reportErrorCommon(
this.lexer.formatError(token, "Syntax error"),
`${token.type ? token.type + " token: " : ""}${JSON.stringify(
token.value !== undefined ? token.value : token
)}`
);
}
reportErrorCommon(lexerMessage, tokenDisplay) {
const lines = [],
expectantStates = [],
stateStacks = [],
lastColumnIndex = this.table.length - 2,
lastColumn = this.table[lastColumnIndex];
let stateStack;
lines.push(lexerMessage);
for (const state of lastColumn.states)
state.rule.symbols[state.dot] &&
typeof state.rule.symbols[state.dot] !== "string" &&
expectantStates.push(state);
if (expectantStates.length === 0) {
lines.push(
`Unexpected ${tokenDisplay}. I did not expect any more input. Here is the state of my parse table:\n`
);
this.displayStateStack(lastColumn.states, lines);
} else {
lines.push(
`Unexpected ${tokenDisplay}. Instead, I was expecting to see one of the following:\n`
);
for (const state of expectantStates) {
// Display a "state stack" for each expectant state
// - which shows you how this state came to be, step by step.
// If there is more than one derivation, we only display the first one.
stateStack = this.buildFirstStateStack(state, []) || [state];
stateStacks.push(stateStack);
// Display each state that is expecting a terminal symbol next.
lines.push(
`A ${
/* symbolDisplay */ Parser.getSymbolDisplay(
/* nextSymbol */ stateStack[0].rule.symbols[stateStack[0].dot]
)
} based on:`
);
this.displayStateStack(stateStack, lines);
}
}
lines.push("");
return lines.join("\n");
}
displayStateStack(stateStack, lines) {
for (
let j = 0, lastDisplay, sameDisplayCount = 0, state, display;
j < stateStack.length;
j++
) {
(state = stateStack[j]), (display = state.rule.toString(state.dot));
if (display === lastDisplay) {
sameDisplayCount++;
} else {
if (sameDisplayCount > 0) {
lines.push(
" ^ " + sameDisplayCount + " more lines identical to this"
);
}
sameDisplayCount = 0;
lines.push(" " + display);
}
lastDisplay = display;
}
}
buildFirstStateStack(state, visited) {
// Found cycle, return null
// to eliminate this path from the results, because
// we don't know how to display it meaningfully
if (visited.indexOf(state) !== -1) return null;
if (state.wantedBy.length === 0) return [state];
const prevState = state.wantedBy[0];
const childVisited = [state].concat(visited);
const childResult = this.buildFirstStateStack(prevState, childVisited);
if (childResult === null) return null;
return [state].concat(childResult);
}
static getSymbolLongDisplay(symbol) {
if (typeof symbol === "string") return symbol;
else if (typeof symbol === "object")
if (symbol.literal) return JSON.stringify(symbol.literal);
else if (symbol instanceof RegExp) return `character matching ${symbol}`;
else if (symbol.type) return `${symbol.type} token`;
else if (symbol.test) return `token matching ${symbol.test}`;
throw new Error(`Unknown symbol type: ${symbol}`);
}
static getSymbolShortDisplay(symbol) {
if (typeof symbol === "string") return symbol;
else if (typeof symbol === "object")
if (symbol.literal) return JSON.stringify(symbol.literal);
else if (symbol instanceof RegExp) return `${symbol}`;
else if (symbol.type) return `﹪${symbol.type}`;
else if (symbol.test) return `‹${symbol.test}›`;
throw new Error(`Unknown symbol type: ${symbol}`);
}
}