-
-
Save atg/8bc53b27ebcdee7d98ce7993ad2e763b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import assert from 'assert'; | |
import fs from 'fs'; | |
// Alias to document intent: values are character offsets / counts.
type int = number;
// One tokenizer rule. Leaf rules match a single lexeme; branch rules open a
// nested region (comment, string, interpolation) closed by endRegex.
interface Rule {
  tt: TT,             // token type produced on a match
  branch: boolean;    // true → endRegex and subrules are present
  regex: RegExp;      // sticky+unicode regex that matches/opens the token (asserted in leaf()/branch())
  endRegex?: RegExp;  // closes the region (branch rules only)
  subrules?: Ruleset; // rules applied inside the region (branch rules only)
}
// Ordered rule list: apply() tries rules in array order and takes the first match.
type Ruleset = Rule[];
export const enum TT { | |
Cl = 11, // line comment | |
Cn = 12, // nesting comment | |
N = 21, // numeric | |
S1 = 31, S2 = 32, S3 = 33, // strings | |
I = 41, // ident | |
SY = 51, // symbol | |
NL = 61, // newline | |
Z = 71, // whitespace | |
Misc = 81, // misc | |
} | |
function leaf(tt: TT, regex: RegExp): Rule { | |
assert(regex.sticky); | |
assert(regex.unicode); | |
return { | |
tt, | |
branch: false, | |
regex, | |
}; | |
} | |
function branch(tt: TT, regex: RegExp, endRegex: RegExp, subrules: Ruleset): Rule { | |
assert(regex.sticky); | |
assert(regex.unicode); | |
assert(endRegex.sticky); | |
assert(endRegex.unicode); | |
return { | |
tt, | |
branch: true, | |
regex, | |
endRegex, | |
subrules, | |
}; | |
} | |
// === Syntax Rules ===
// All regexes are sticky+unicode (asserted by leaf()/branch()) so apply() can
// anchor them at the current scan offset through `lastIndex`.
let nl = leaf(TT.NL, /\r?\n/uy); // form feed, unicode whitespace and classic macos linebreaks are explicitly rejected as deviant
let rootRuleset: Ruleset = []; // defined later — filled by the push() below; branch() captures the array by reference, enabling the recursive grammar
let lineComment = leaf(TT.Cl, /\/\/[^\n]+/uy); // does not match end \n
let nestComment = branch(TT.Cn, /\/\*/uy, /\*\//uy, [ nl, leaf(TT.Misc, /[^*\n]+|\*/uy), ]); // nl goes here so that line counting works
let numeric = leaf(TT.N, /\d+(?:\.\d+)?/uy);
// See: https://en.wikipedia.org/wiki/Template:General_Category_(Unicode)
// '1c-_ may be expanded in the future but cannot include ] } ) , ; . etc
let string1 = leaf(TT.S1, /'[\p{L}\p{M}\p{N}\p{Pc}\p{Pd}\-_]+/uy);
// string2 TODO
let string2 = branch(TT.S2, /"/uy, /"/uy, [
  nl,
  leaf(TT.Misc, /\\./uy), // TODO: unicode etc escapes
  branch(TT.Misc, /\{/uy, /\}/uy, rootRuleset), // interpolation: `{...}` re-enters the root grammar
  leaf(TT.Misc, /[^\\\{\n]+|./uy), // plain text run; trailing `|.` consumes any single leftover char
]);
let string3 = branch(TT.S3, /«/uy, /»/uy, [ nl, leaf(TT.Misc, /[^»\n]/uy) ]); // fancy strings
let ident = leaf(TT.I, /[\p{L}\p{M}_][\p{L}\p{M}_0-9]*/uy);
// Necessary to make string interpolation work
let curlies = branch(TT.Misc, /\{/uy, /\}/uy, rootRuleset);
// The rationale here is that some symbols can be mixed e.g. +=
// and some symbols must be kept separate e.g. ((
// but it's not perfect yet. TODO: mixes of unary operators will coalese e.g. !!
let symbol1 = leaf(TT.SY, /[\p{Ps}\p{Pe}<>,;:!?]/uy); // match one by one
let symbolN = leaf(TT.SY, /[\p{S}\-@#$%^&*+=\\|~\/.]+/uy); // take care not to match _
let whitespace = leaf(TT.Z, /[ \t]+/uy);
// Order matters: apply() takes the FIRST rule that matches at the current offset.
rootRuleset.push(...[
  nl,
  curlies,
  lineComment,
  nestComment,
  numeric,
  string1,
  string2,
  string3,
  ident,
  symbol1,
  symbolN,
  whitespace,
]);
// One node of the token tree produced by Tokenizer.
export interface Token {
  tt: TT,
  ss: int; se: int; // start start and start end (span of the opening lexeme)
  es: int; ee: int; // end start and end end (for leaf tokens these equal se; for unterminated branch tokens, source length)
  subs?: Token[], // nested tokens inside a branch token's region
  endToken?: Token, // the closing-delimiter token, if the region was terminated
  line: int; // the line number that this token is on
  lineStart: int; // the start offset of the line
}
export function dumpTokens(source: string, tokens: Token[]) { | |
for (let token of tokens) { | |
console.log(token.tt, source.slice(token.ss, token.se)); | |
if (token.subs) dumpTokens(source, token.subs); | |
if (token.endToken) { | |
console.log(token.tt, source.slice(token.endToken.ss, token.endToken.se)); | |
} | |
} | |
} | |
/// Generates a token tree from input | |
export class Tokenizer { | |
static run(source: string): Token[] { | |
let tokenizer = new Tokenizer(source, rootRuleset); | |
let tokens = tokenizer.applyRoot(); | |
dumpTokens(source, tokens); | |
return tokens; | |
} | |
index: int = 0; | |
line: int = 1; | |
lineStart: int = 0; | |
constructor(readonly source: string, readonly rootRuleset: Ruleset) {} | |
applyRoot(): Token[] { | |
return this.apply(0, this.rootRuleset)[0]; | |
} | |
apply(atIndex: int, ruleset: Ruleset, parentRule?: Rule): [Token[], Token|undefined] { | |
const n = this.source.length; | |
let tokens: Token[] = []; | |
let endToken: Token|undefined; | |
while (1) { | |
console.log('atIndex', atIndex); | |
if (atIndex >= this.source.length) { | |
break; // reached the end of the source file | |
} | |
if (parentRule) { | |
// Try to match an end token | |
const endRegex = parentRule.endRegex!; | |
endRegex.lastIndex = atIndex; | |
const m = endRegex.exec(this.source); | |
if (m) { | |
const ss = m.index; | |
const se = ss + m[0].length; | |
assert(se > ss); | |
endToken = { | |
tt: TT.Misc, | |
ss, | |
se, | |
es: se, | |
ee: se, | |
line: this.line, | |
lineStart: this.lineStart, | |
}; | |
break; // stop parsing | |
} | |
} | |
let foundMatch = false; | |
for (const rule of ruleset) { | |
let re = rule.regex; | |
re.lastIndex = atIndex; | |
const m = re.exec(this.source); | |
assert(!m || m.index === atIndex); | |
if (!m) { | |
continue; // no match, no problem | |
} | |
console.log('Success', atIndex, rule); | |
foundMatch = true; | |
const ss = m.index; | |
const se = ss + m[0].length; | |
assert(se > ss); | |
let tok: Token; | |
if (rule.branch) { | |
const [subs, innerEndToken] = this.apply(se, rule.subrules!, rule); | |
tok = { | |
tt: rule.tt, | |
ss, | |
se, | |
es: innerEndToken ? innerEndToken.ss : n, | |
ee: innerEndToken ? innerEndToken.se : n, | |
subs, | |
endToken: innerEndToken, | |
line: this.line, | |
lineStart: this.lineStart, | |
}; | |
} else { | |
tok = { | |
tt: rule.tt, | |
ss, | |
se, | |
es: se, | |
ee: se, | |
line: this.line, | |
lineStart: this.lineStart, | |
}; | |
} | |
if (rule.tt === TT.NL) { | |
this.line++; | |
this.lineStart = tok.ee; | |
} | |
atIndex = tok.ee; | |
tokens.push(tok); | |
break; // parse again | |
} | |
if (!foundMatch) { | |
throw new Error(`Unknown symbol in source at character #${atIndex}`); | |
} | |
} | |
return [tokens, endToken]; | |
} | |
} | |
// Smoke test: tokenize a tiny two-line input and print the resulting token tree.
console.log(Tokenizer.run('foo = 42\nbar = 3.14'));
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment