-
-
Save atg/8bc53b27ebcdee7d98ce7993ad2e763b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import assert from 'assert'; | |
import fs from 'fs'; | |
// Alias to document intent: values are character offsets / counts.
type int = number;
// One tokenizer rule. Leaf rules match a single lexeme; branch rules open a
// nested region (comment, string, interpolation) closed by endRegex.
interface Rule {
  tt: TT,             // token type produced on a match
  branch: boolean;    // true → endRegex and subrules are present
  regex: RegExp;      // sticky+unicode regex that matches/opens the token (asserted in leaf()/branch())
  endRegex?: RegExp;  // closes the region (branch rules only)
  subrules?: Ruleset; // rules applied inside the region (branch rules only)
}
// Ordered rule list: apply() tries rules in array order and takes the first match.
type Ruleset = Rule[];
export const enum TT { | |
Cl = 11, // line comment | |
Cn = 12, // nesting comment | |
N = 21, // numeric | |
S1 = 31, S2 = 32, S3 = 33, // strings | |
I = 41, // ident | |
SY = 51, // symbol | |
NL = 61, // newline | |
Z = 71, // whitespace | |
Misc = 81, // misc | |
} | |
function leaf(tt: TT, regex: RegExp): Rule { | |
assert(regex.sticky); | |
assert(regex.unicode); | |
return { | |
tt, | |
branch: false, | |
regex, | |
}; | |
} | |
function branch(tt: TT, regex: RegExp, endRegex: RegExp, subrules: Ruleset): Rule { | |
assert(regex.sticky); | |
assert(regex.unicode); | |
assert(endRegex.sticky); | |
assert(endRegex.unicode); | |
return { | |
tt, | |
branch: true, | |
regex, | |
endRegex, | |
subrules, | |
}; | |
} | |
// === Syntax Rules ===
// All regexes are sticky+unicode (asserted by leaf()/branch()) so apply() can
// anchor them at the current scan offset through `lastIndex`.
let nl = leaf(TT.NL, /\r?\n/uy); // form feed, unicode whitespace and classic macos linebreaks are explicitly rejected as deviant
let rootRuleset: Ruleset = []; // defined later — filled by the push() below; branch() captures the array by reference, enabling the recursive grammar
let lineComment = leaf(TT.Cl, /\/\/[^\n]+/uy); // does not match end \n
let nestComment = branch(TT.Cn, /\/\*/uy, /\*\//uy, [ nl, leaf(TT.Misc, /[^*\n]+|\*/uy), ]); // nl goes here so that line counting works
let numeric = leaf(TT.N, /\d+(?:\.\d+)?/uy);
// See: https://en.wikipedia.org/wiki/Template:General_Category_(Unicode)
// '1c-_ may be expanded in the future but cannot include ] } ) , ; . etc
let string1 = leaf(TT.S1, /'[\p{L}\p{M}\p{N}\p{Pc}\p{Pd}\-_]+/uy);
// string2 TODO
let string2 = branch(TT.S2, /"/uy, /"/uy, [
  nl,
  leaf(TT.Misc, /\\./uy), // TODO: unicode etc escapes
  branch(TT.Misc, /\{/uy, /\}/uy, rootRuleset), // interpolation: `{...}` re-enters the root grammar
  leaf(TT.Misc, /[^\\\{\n]+|./uy), // plain text run; trailing `|.` consumes any single leftover char
]);
let string3 = branch(TT.S3, /«/uy, /»/uy, [ nl, leaf(TT.Misc, /[^»\n]/uy) ]); // fancy strings
let ident = leaf(TT.I, /[\p{L}\p{M}_][\p{L}\p{M}_0-9]*/uy);
// Necessary to make string interpolation work
let curlies = branch(TT.Misc, /\{/uy, /\}/uy, rootRuleset);
// The rationale here is that some symbols can be mixed e.g. +=
// and some symbols must be kept separate e.g. ((
// but it's not perfect yet. TODO: mixes of unary operators will coalese e.g. !!
let symbol1 = leaf(TT.SY, /[\p{Ps}\p{Pe}<>,;:!?]/uy); // match one by one
let symbolN = leaf(TT.SY, /[\p{S}\-@#$%^&*+=\\|~\/.]+/uy); // take care not to match _
let whitespace = leaf(TT.Z, /[ \t]+/uy);
// Order matters: apply() takes the FIRST rule that matches at the current offset.
rootRuleset.push(...[
  nl,
  curlies,
  lineComment,
  nestComment,
  numeric,
  string1,
  string2,
  string3,
  ident,
  symbol1,
  symbolN,
  whitespace,
]);
// One node of the token tree produced by Tokenizer.
export interface Token {
  tt: TT,
  ss: int; se: int; // start start and start end (span of the opening lexeme)
  es: int; ee: int; // end start and end end (for leaf tokens these equal se; for unterminated branch tokens, source length)
  subs?: Token[], // nested tokens inside a branch token's region
  endToken?: Token, // the closing-delimiter token, if the region was terminated
  line: int; // the line number that this token is on
  lineStart: int; // the start offset of the line
}
export function dumpTokens(source: string, tokens: Token[]) { | |
for (let token of tokens) { | |
console.log(token.tt, source.slice(token.ss, token.se)); | |
if (token.subs) dumpTokens(source, token.subs); | |
if (token.endToken) { | |
console.log(token.tt, source.slice(token.endToken.ss, token.endToken.se)); | |
} | |
} | |
} | |
/// Generates a token tree from input | |
export class Tokenizer { | |
static run(source: string): Token[] { | |
let tokenizer = new Tokenizer(source, rootRuleset); | |
let tokens = tokenizer.applyRoot(); | |
dumpTokens(source, tokens); | |
return tokens; | |
} | |
index: int = 0; | |
line: int = 1; | |
lineStart: int = 0; | |
constructor(readonly source: string, readonly rootRuleset: Ruleset) {} | |
applyRoot(): Token[] { | |
return this.apply(0, this.rootRuleset)[0]; | |
} | |
apply(atIndex: int, ruleset: Ruleset, parentRule?: Rule): [Token[], Token|undefined] { | |
const n = this.source.length; | |
let tokens: Token[] = []; | |
let endToken: Token|undefined; | |
while (1) { | |
console.log('atIndex', atIndex); | |
if (atIndex >= this.source.length) { | |
break; // reached the end of the source file | |
} | |
if (parentRule) { | |
// Try to match an end token | |
const endRegex = parentRule.endRegex!; | |
endRegex.lastIndex = atIndex; | |
const m = endRegex.exec(this.source); | |
if (m) { | |
const ss = m.index; | |
const se = ss + m[0].length; | |
assert(se > ss); | |
endToken = { | |
tt: TT.Misc, | |
ss, | |
se, | |
es: se, | |
ee: se, | |
line: this.line, | |
lineStart: this.lineStart, | |
}; | |
break; // stop parsing | |
} | |
} | |
let foundMatch = false; | |
for (const rule of ruleset) { | |
let re = rule.regex; | |
re.lastIndex = atIndex; | |
const m = re.exec(this.source); | |
assert(!m || m.index === atIndex); | |
if (!m) { | |
continue; // no match, no problem | |
} | |
console.log('Success', atIndex, rule); | |
foundMatch = true; | |
const ss = m.index; | |
const se = ss + m[0].length; | |
assert(se > ss); | |
let tok: Token; | |
if (rule.branch) { | |
const [subs, innerEndToken] = this.apply(se, rule.subrules!, rule); | |
tok = { | |
tt: rule.tt, | |
ss, | |
se, | |
es: innerEndToken ? innerEndToken.ss : n, | |
ee: innerEndToken ? innerEndToken.se : n, | |
subs, | |
endToken: innerEndToken, | |
line: this.line, | |
lineStart: this.lineStart, | |
}; | |
} else { | |
tok = { | |
tt: rule.tt, | |
ss, | |
se, | |
es: se, | |
ee: se, | |
line: this.line, | |
lineStart: this.lineStart, | |
}; | |
} | |
if (rule.tt === TT.NL) { | |
this.line++; | |
this.lineStart = tok.ee; | |
} | |
atIndex = tok.ee; | |
tokens.push(tok); | |
break; // parse again | |
} | |
if (!foundMatch) { | |
throw new Error(`Unknown symbol in source at character #${atIndex}`); | |
} | |
} | |
return [tokens, endToken]; | |
} | |
} | |
// Smoke test: tokenize a tiny two-line input and print the resulting token tree.
console.log(Tokenizer.run('foo = 42\nbar = 3.14'));
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment