Skip to content

Instantly share code, notes, and snippets.

@SMotaal
Last active January 6, 2022 15:55
Show Gist options
  • Save SMotaal/d6c46d0cada8ec5d11db116177f24ab9 to your computer and use it in GitHub Desktop.
Save SMotaal/d6c46d0cada8ec5d11db116177f24ab9 to your computer and use it in GitHub Desktop.
The Farley BNF Parser

Farley

A template-based BNF parser inspired by @kach/nearly.

Nearley

I'm starting by cleaning up the nearly/nearly.js source code.

export class Rule {
  static highestId = 0;

  id = ++Rule.highestId;

  constructor(name, symbols, postprocess) {
    this.name = name;
    this.symbols = symbols;
    this.postprocess = postprocess;
  }

  toString(withCursorAt) {
    return `${this.name}${
      typeof withCursorAt === "undefined"
        ? this.symbols.map(Parser.getSymbolShortDisplay).join(" ")
        : `${this.symbols
            .slice(0, withCursorAt)
            .map(Parser.getSymbolShortDisplay)
            .join(" ")}${this.symbols
            .slice(withCursorAt)
            .map(Parser.getSymbolShortDisplay)
            .join(" ")}`
    }`;
  }

  static from(rule) {
    return new this(rule.name, rule.symbol, rule.postprocess);
  }
}

class State {
  constructor(rule, dot, reference, wantedBy) {
    this.rule = rule;
    this.dot = dot;
    this.reference = reference;
    this.data = [];
    this.wantedBy = wantedBy;
    this.isComplete = this.dot === rule.symbols.length;
  }

  toString() {
    return `{${this.rule.toString(this.dot)}}, from: ${this.reference || 0}`;
  }

  nextState(child) {
    const state = new State(
      this.rule,
      this.dot + 1,
      this.reference,
      this.wantedBy
    );
    state.left = this;
    state.right = child;
    if (state.isComplete) {
      state.data = state.build();
      // Having right set here will prevent the right state and its children form being garbage collected
      state.right = undefined;
    }
    return state;
  }

  build() {
    const children = [];
    for (let node = this; node.left; node = node.left)
      children.push(node.right.data);
    return children;
  }

  finish() {
    if (this.rule.postprocess)
      this.data = this.rule.postprocess(this.data, this.reference, Parser.fail);
  }
}

class Column {
  constructor(grammar, index) {
    this.grammar = grammar;
    this.index = index;
    this.states = [];
    this.wants = {}; // states indexed by the non-terminal they expect
    this.scannable = []; // list of states that expect a token
    this.completed = {}; // states that are nullable
  }

  process(nextColumn) {
    const [states, wants, completed] = this;

    for (let w = 0, state; w < states.length; w++) {
      // nb. we push() during iteration

      state = states[w];

      if (state.isComplete) {
        state.finish();
        if (state.data !== Parser.fail) {
          // complete
          let wantedBy = state.wantedBy;
          for (
            let i = wantedBy.length;
            i--;
            this.complete(/* left */ wantedBy[i], state)
          ); // this line is hot

          // special-case nullables
          if (state.reference === this.index) {
            // make sure future predictors of this rule get completed.
            const exp = state.rule.name;
            (this.completed[exp] = this.completed[exp] || []).push(state);
          }
        }
      } else {
        // queue scannable states
        const exp = state.rule.symbols[state.dot];
        if (typeof exp !== "string") {
          this.scannable.push(state);
          continue;
        }

        // predict
        if (wants[exp]) {
          wants[exp].push(state);
          if (completed.hasOwnProperty(exp))
            for (
              let nulls = completed[exp], i = 0;
              i < nulls.length;
              this.complete(state, /* right */ nulls[i++])
            );
        } else {
          wants[exp] = [state];
          this.predict(exp);
        }
      }
    }
  }

  predict(ext) {
    const rules = this.grammar.byName[exp];
    if (rules)
      for (
        let i = 0;
        i < rules.length;
        this.states.push(
          new State(rules[i++], 0, this.index, /* wantedBy */ this.wants[exp])
        )
      );
  }

  complete(left, right) {
    this.states.push(/* copy */ left.nextState(right));
  }
}

export class Grammar {
  constructor(rules, start) {
    this.rules = rules;
    this.start = start || rules[0].name;
    const byName = {};
    for (const rule of rules)
      (byName[rule.name] || (byName[rule.name] = [])).push(rule);
    this.byName = byName;
  }

  fromCompiled(rules, start) {
    // So we can allow passing (rules, start) directly to Parser for backwards compatibility
    const grammar = rules.ParserStart
      ? new Grammar(rules.ParserRules.map(Rule.from, Rule), rules.ParserStart)
      : new Grammar(rules.map(Rule.from, Rule), start);
    // nb. storing lexer on Grammar is iffy, but unavoidable
    grammar.lexer = rules.Lexer;
    return grammar;
  }
}

class StreamLexer {
  buffer = "";
  index = 0;
  line = 1;
  lastLineBreak = 0;

  reset(data, state) {
    this.buffer = data;
    this.index = 0;
    this.line = state ? state.line : 1;
    this.lastLineBreak = state ? -state.col : 0;
  }

  next() {
    if (this.index < this.buffer.length) {
      const ch = this.buffer[this.index++];
      if (ch === "\n") {
        this.line += 1;
        this.lastLineBreak = this.index;
      }
      return { value: ch };
    }
  }

  save() {
    return {
      line: this.line,
      col: this.index - this.lastLineBreak,
    };
  }

  formatError(token, message) {
    // nb. this gets called after consuming the offending token, so the culprit is index-1
    const { buffer, line, index, lastLineBreak } = this;

    if (typeof buffer === "string") {
      const col = index - lastLineBreak;
      const lastLineDigits = `${line}`.length;
      const lines = buffer.split("\n").slice(Math.max(0, line - 5), line);

      // let nextLineBreak = buffer.indexOf("\n", index);
      // if (nextLineBreak === -1) nextLineBreak = buffer.length;

      for (
        let i = 0;
        i < lines.length;
        lines[i] = `${`${line - lines.length + i + 1}`.padStart(
          lastLineDigits
        )}${lines[i]}`
      );

      return `${message} at line ${line} col ${col}:\n\n${lines.join(
        "\n"
      )}\n${" ".repeat(lastLineDigits + col)}^\n`;
      return message;
    } else {
      return `${message} at index ${index - 1}`;
    }
  }
}

export class Parser {
  static fail = {};

  constructor(grammar, options) {
    this.grammar = grammar;

    this.options = {
      keepHistory: false,
      lexer: grammar.lexer || new StreamLexer(),
      ...options,
    };

    // Setup lexer
    this.lexerState = undefined;

    // Setup a table
    const column = new Column(grammar, 0);

    // I could be expecting anything.
    column.wants[grammar.start] = [];
    column.predict(grammar.start);

    // TODO what if start rule is nullable?
    column.process();

    this.table = [column];

    this.current = 0; // token index
  }

  get lexer() {
    return this.options.lexer;
  }

  feed(chunk) {
    const { lexer, table } = this;

    lexer.reset(chunk, this.lexerState);

    let token,
      current,
      error,
      column,
      nextColumn,
      literal,
      value,
      scannable,
      state,
      nextState,
      expected;

    while (true) {
      token =
        current =
        error =
        column =
        nextColumn =
        literal =
        value =
        scannable =
        state =
        nextState =
        expected =
          undefined;

      try {
        token = lexer.next();
        if (!token) break;
      } catch (exception) {
        // Create the next column so that the error reporter
        // can display the correctly predicted states.
        this.table.push(
          (nextColumn = new Column(this.grammar, this.current + 1))
        );

        error = new Error(this.reportLexerError(exception));
        error.offset = this.current;
        error.token = exception.token;

        throw error;
      }

      // We add new states to table[current+1]
      column = this.table[this.current];
      current = this.current + 1;
      nextColumn = new Column(this.grammar, current);

      // GC unused states
      if (!this.options.keepHistory) delete this.table[this.current - 1];

      this.table.push(nextColumn);

      // Advance all tokens that expect the symbol
      literal = token.text !== undefined ? token.text : token.value;
      value = lexer.constructor === StreamLexer ? token.value : token;
      scannable = column.scannable;
      for (let w = scannable.length; w--; ) {
        state = scannable[w];
        expect = state.rule.symbols[state.dot];
        // Try to consume the token
        // either regex or literal
        if (
          expect.test
            ? expect.test(value)
            : expect.type
            ? expect.type === token.type
            : expect.literal === literal
        ) {
          // Add it
          nextState = state.nextState({
            data: value,
            token: token,
            isToken: true,
            reference: current - 1,
          });
          nextColumn.states.push(nextState);
        }
      }

      // Next, for each of the rules, we either
      // (a) complete it, and try to see if the reference row expected that
      //     rule
      // (b) predict the next nonterminal it expects by adding that
      //     nonterminal's start state
      // To prevent duplication, we also keep track of rules we have already
      // added

      nextColumn.process();

      // If needed, throw an error:
      if (nextColumn.states.length === 0) {
        // No states at all! This is not good.
        error = new Error(this.reportError(token));
        error.offset = this.current;
        error.token = token;
        throw error;
      }

      // maybe save lexer state
      if (this.options.keepHistory) {
        column.lexerState = lexer.save();
      }

      this.current++;
    }
    if (column) {
      this.lexerState = lexer.save();
    }

    // Incrementally keep track of results
    this.results = this.finish();

    // Allow chaining, for whatever it's worth
    return this;
  }

  save() {
    const column = this.table[this.current];
    column.lexerState = this.lexerState;
    return column;
  }

  restore(column) {
    const index = column.index;
    this.current = index;
    this.table[index] = column;
    this.table.splice(index + 1);
    this.lexerState = column.lexerState;

    // Incrementally keep track of results
    this.results = this.finish();
  }

  rewind(index) {
    if (!this.options.keepHistory)
      throw new Error("set option `keepHistory` to enable rewinding");

    // nb. recall column (table) indicies fall between token indicies.
    //        col 0   --   token 0   --   col 1
    this.restore(this.table[index]);
  }

  finish() {
    // Return the possible parsings
    const considerations = [];
    const data = [];
    const start = this.grammar.start;
    for (const state of this.table[this.table.length - 1].states)
      state.rule.name === start &&
        state.dot === state.rule.symbols.length &&
        state.reference === 0 &&
        state.data !== Parser.fail &&
        (considerations.push(state), data.push(state.data));

    return data;
  }

  reportLexerError(lexerError) {
    // Planning to add a token property to moo's thrown error
    // even on erroring tokens to be used in error display below
    const token = lexerError.token;
    return token
      ? this.reportErrorCommon(
          this.lexer.formatError(token, "Syntax error"),
          `input ${JSON.stringify(token.text[0])} (lexer error)`
        )
      : this.reportErrorCommon(lexerError.message, "input (lexer error)");
  }

  reportError(token) {
    return this.reportErrorCommon(
      this.lexer.formatError(token, "Syntax error"),
      `${token.type ? token.type + " token: " : ""}${JSON.stringify(
        token.value !== undefined ? token.value : token
      )}`
    );
  }

  reportErrorCommon(lexerMessage, tokenDisplay) {
    const lines = [],
      expectantStates = [],
      stateStacks = [],
      lastColumnIndex = this.table.length - 2,
      lastColumn = this.table[lastColumnIndex];

    let stateStack;

    lines.push(lexerMessage);

    for (const state of lastColumn.states)
      state.rule.symbols[state.dot] &&
        typeof state.rule.symbols[state.dot] !== "string" &&
        expectantStates.push(state);

    if (expectantStates.length === 0) {
      lines.push(
        `Unexpected ${tokenDisplay}. I did not expect any more input. Here is the state of my parse table:\n`
      );
      this.displayStateStack(lastColumn.states, lines);
    } else {
      lines.push(
        `Unexpected ${tokenDisplay}. Instead, I was expecting to see one of the following:\n`
      );
      for (const state of expectantStates) {
        // Display a "state stack" for each expectant state
        // - which shows you how this state came to be, step by step.
        // If there is more than one derivation, we only display the first one.
        stateStack = this.buildFirstStateStack(state, []) || [state];
        stateStacks.push(stateStack);

        // Display each state that is expecting a terminal symbol next.
        lines.push(
          `A ${
            /* symbolDisplay */ Parser.getSymbolDisplay(
              /* nextSymbol */ stateStack[0].rule.symbols[stateStack[0].dot]
            )
          } based on:`
        );
        this.displayStateStack(stateStack, lines);
      }
    }
    lines.push("");
    return lines.join("\n");
  }

  displayStateStack(stateStack, lines) {
    for (
      let j = 0, lastDisplay, sameDisplayCount = 0, state, display;
      j < stateStack.length;
      j++
    ) {
      (state = stateStack[j]), (display = state.rule.toString(state.dot));
      if (display === lastDisplay) {
        sameDisplayCount++;
      } else {
        if (sameDisplayCount > 0) {
          lines.push(
            "    ^ " + sameDisplayCount + " more lines identical to this"
          );
        }
        sameDisplayCount = 0;
        lines.push("    " + display);
      }
      lastDisplay = display;
    }
  }

  buildFirstStateStack(state, visited) {
    // Found cycle, return null
    // to eliminate this path from the results, because
    // we don't know how to display it meaningfully
    if (visited.indexOf(state) !== -1) return null;
    if (state.wantedBy.length === 0) return [state];

    const prevState = state.wantedBy[0];
    const childVisited = [state].concat(visited);
    const childResult = this.buildFirstStateStack(prevState, childVisited);

    if (childResult === null) return null;

    return [state].concat(childResult);
  }

  static getSymbolLongDisplay(symbol) {
    if (typeof symbol === "string") return symbol;
    else if (typeof symbol === "object")
      if (symbol.literal) return JSON.stringify(symbol.literal);
      else if (symbol instanceof RegExp) return `character matching ${symbol}`;
      else if (symbol.type) return `${symbol.type} token`;
      else if (symbol.test) return `token matching ${symbol.test}`;

    throw new Error(`Unknown symbol type: ${symbol}`);
  }

  static getSymbolShortDisplay(symbol) {
    if (typeof symbol === "string") return symbol;
    else if (typeof symbol === "object")
      if (symbol.literal) return JSON.stringify(symbol.literal);
      else if (symbol instanceof RegExp) return `${symbol}`;
      else if (symbol.type) return `﹪${symbol.type}`;
      else if (symbol.test) return `‹${symbol.test}›`;

    throw new Error(`Unknown symbol type: ${symbol}`);
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment