atg/tok.ts
Created March 5, 2019
import assert from 'assert';
import fs from 'fs';
type int = number;
interface Rule {
    tt: TT;
    branch: boolean;
    regex: RegExp;
    endRegex?: RegExp;
    subrules?: Ruleset;
}
type Ruleset = Rule[];
export const enum TT {
    Cl = 11, // line comment
    Cn = 12, // nesting comment
    N = 21, // numeric
    S1 = 31, S2 = 32, S3 = 33, // strings
    I = 41, // ident
    SY = 51, // symbol
    NL = 61, // newline
    Z = 71, // whitespace
    Misc = 81, // misc
}
function leaf(tt: TT, regex: RegExp): Rule {
    assert(regex.sticky);
    assert(regex.unicode);
    return {
        tt,
        branch: false,
        regex,
    };
}
function branch(tt: TT, regex: RegExp, endRegex: RegExp, subrules: Ruleset): Rule {
    assert(regex.sticky);
    assert(regex.unicode);
    assert(endRegex.sticky);
    assert(endRegex.unicode);
    return {
        tt,
        branch: true,
        regex,
        endRegex,
        subrules,
    };
}
// === Syntax Rules ===
let nl = leaf(TT.NL, /\r?\n/uy); // form feed, Unicode whitespace, and classic macOS line breaks are explicitly rejected as deviant
let rootRuleset: Ruleset = []; // defined later
let lineComment = leaf(TT.Cl, /\/\/[^\n]+/uy); // does not match end \n
let nestComment = branch(TT.Cn, /\/\*/uy, /\*\//uy, [ nl, leaf(TT.Misc, /[^*\n]+|\*/uy), ]); // nl goes here so that line counting works
let numeric = leaf(TT.N, /\d+(?:\.\d+)?/uy);
// See: https://en.wikipedia.org/wiki/Template:General_Category_(Unicode)
// The character class for '-strings (e.g. '1c-_) may be expanded in the future but cannot include ] } ) , ; . etc
let string1 = leaf(TT.S1, /'[\p{L}\p{M}\p{N}\p{Pc}\p{Pd}\-_]+/uy);
// string2 TODO
let string2 = branch(TT.S2, /"/uy, /"/uy, [
    nl,
    leaf(TT.Misc, /\\./uy), // TODO: unicode etc escapes
    branch(TT.Misc, /\{/uy, /\}/uy, rootRuleset),
    leaf(TT.Misc, /[^\\\{\n]+|./uy),
]);
let string3 = branch(TT.S3, /«/uy, /»/uy, [ nl, leaf(TT.Misc, /[^»\n]/uy) ]); // fancy strings
let ident = leaf(TT.I, /[\p{L}\p{M}_][\p{L}\p{M}_0-9]*/uy);
// Necessary to make string interpolation work
let curlies = branch(TT.Misc, /\{/uy, /\}/uy, rootRuleset);
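// Illustrative note (an addition, not from the original gist): because string2's subrules
// branch back into rootRuleset on `{`, an input like "a{foo}b" yields an S2 token whose subs
// contain a Misc curly branch holding an I token for `foo`.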
// The rationale here is that some symbols can be mixed e.g. +=
// and some symbols must be kept separate e.g. ((
// but it's not perfect yet. TODO: mixes of unary operators will coalesce e.g. !!
let symbol1 = leaf(TT.SY, /[\p{Ps}\p{Pe}<>,;:!?]/uy); // match one by one
let symbolN = leaf(TT.SY, /[\p{S}\-@#$%^&*+=\\|~\/.]+/uy); // take care not to match _
let whitespace = leaf(TT.Z, /[ \t]+/uy);
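// Note (added comment): rule order matters below, since apply() takes the first rule that
// matches. curlies must precede symbol1 (otherwise `{` would match \p{Ps}), and the comment
// rules must precede symbolN (otherwise `//` and `/*` would be eaten as symbols).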
rootRuleset.push(
    nl,
    curlies,
    lineComment,
    nestComment,
    numeric,
    string1,
    string2,
    string3,
    ident,
    symbol1,
    symbolN,
    whitespace,
);
export interface Token {
    tt: TT;
    ss: int; se: int; // start and end offsets of the opening match
    es: int; ee: int; // start and end offsets of the closing match (for leaves, es === ee === se)
    subs?: Token[];
    endToken?: Token;
    line: int; // the line number that this token is on
    lineStart: int; // the start offset of the line
}
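// Illustrative example (an addition, not from the original gist): for the input "/* hi */",
// the nesting comment token has ss = 0, se = 2 (the opening "/*"), es = 6, ee = 8 (the
// closing "*/"), and its subs hold one Misc token covering " hi ".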
export function dumpTokens(source: string, tokens: Token[]) {
    for (let token of tokens) {
        console.log(token.tt, source.slice(token.ss, token.se));
        if (token.subs) dumpTokens(source, token.subs);
        if (token.endToken) {
            console.log(token.tt, source.slice(token.endToken.ss, token.endToken.se));
        }
    }
}
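// Illustrative helper (an addition, not part of the original gist): because ss marks the start
// of a token's opening match and ee marks the end of its closing match (or of the token itself
// for leaves), the full text covered by a token, including any children, is a single slice.
export function tokenSpan(source: string, token: Token): string {
    return source.slice(token.ss, token.ee);
}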
/// Generates a token tree from input
export class Tokenizer {
    static run(source: string): Token[] {
        let tokenizer = new Tokenizer(source, rootRuleset);
        let tokens = tokenizer.applyRoot();
        dumpTokens(source, tokens);
        return tokens;
    }
    index: int = 0;
    line: int = 1;
    lineStart: int = 0;
    constructor(readonly source: string, readonly rootRuleset: Ruleset) {}
    applyRoot(): Token[] {
        return this.apply(0, this.rootRuleset)[0];
    }
    apply(atIndex: int, ruleset: Ruleset, parentRule?: Rule): [Token[], Token|undefined] {
        const n = this.source.length;
        let tokens: Token[] = [];
        let endToken: Token|undefined;
        while (1) {
            console.log('atIndex', atIndex);
            if (atIndex >= this.source.length) {
                break; // reached the end of the source file
            }
            if (parentRule) {
                // Try to match an end token
                const endRegex = parentRule.endRegex!;
                endRegex.lastIndex = atIndex;
                const m = endRegex.exec(this.source);
                if (m) {
                    const ss = m.index;
                    const se = ss + m[0].length;
                    assert(se > ss);
                    endToken = {
                        tt: TT.Misc,
                        ss,
                        se,
                        es: se,
                        ee: se,
                        line: this.line,
                        lineStart: this.lineStart,
                    };
                    break; // stop parsing
                }
            }
            let foundMatch = false;
            for (const rule of ruleset) {
                let re = rule.regex;
                re.lastIndex = atIndex;
                const m = re.exec(this.source);
                assert(!m || m.index === atIndex);
                if (!m) {
                    continue; // no match, no problem
                }
                console.log('Success', atIndex, rule);
                foundMatch = true;
                const ss = m.index;
                const se = ss + m[0].length;
                assert(se > ss);
                let tok: Token;
                if (rule.branch) {
                    const [subs, innerEndToken] = this.apply(se, rule.subrules!, rule);
                    tok = {
                        tt: rule.tt,
                        ss,
                        se,
                        es: innerEndToken ? innerEndToken.ss : n,
                        ee: innerEndToken ? innerEndToken.se : n,
                        subs,
                        endToken: innerEndToken,
                        line: this.line,
                        lineStart: this.lineStart,
                    };
                } else {
                    tok = {
                        tt: rule.tt,
                        ss,
                        se,
                        es: se,
                        ee: se,
                        line: this.line,
                        lineStart: this.lineStart,
                    };
                }
                if (rule.tt === TT.NL) {
                    this.line++;
                    this.lineStart = tok.ee;
                }
                atIndex = tok.ee;
                tokens.push(tok);
                break; // parse again
            }
            if (!foundMatch) {
                throw new Error(`Unknown symbol in source at character #${atIndex}`);
            }
        }
        return [tokens, endToken];
    }
}
console.log(Tokenizer.run('foo = 42\nbar = 3.14'));
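// Illustrative usage (an addition; the variable names below are hypothetical): drop whitespace
// and newline tokens before handing the stream to a parser.
const demoTokens = Tokenizer.run('answer = 6 * 7 // the answer\n');
const significant = demoTokens.filter(t => t.tt !== TT.Z && t.tt !== TT.NL);
console.log(significant.map(t => t.tt));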