@buhichan
Created November 30, 2020 11:31
An attempt to implement ANTLR-like lexing and parsing in TypeScript.
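To get a feel for the API before the full listing, here is a minimal usage sketch. It assumes the Lexer, Parser, and, and zeroOrMore definitions from the code below; the token and rule names are illustrative only.

// A tiny grammar for comma-separated numbers, e.g. "1,2,3":
// list: Number ("," Number)*
const demoLexer = Lexer("1,2,3")
const DemoNum = demoLexer.token("Number", /^\d+/)
const DemoComma = demoLexer.token("Comma", /^,/)
const demoParser = Parser(demoLexer.tokenize())
const list = demoParser.rule("list", () => and(DemoNum, zeroOrMore(and(DemoComma, DemoNum))))
const demoMatch = demoParser.match(list) // => { nodes: [listNode], tokenLength: 5 }, or null on failure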
const EOF = Symbol("EOF")
type TokenStream = {
    token: Token
    start: number
    end: number
}[]
type Token = {
    kind: "token"
    name: string
    regex: RegExp
}
type ParserRule = {
    kind: "rule"
    name: string
    tokens: () => SubRule
}
type SubRule = Token | ParserRule | OrSubRule | AndSubRule | OneOrMoreRule | ZeroOrMoreRule | ZeroOrOneRule
type AndSubRule = ["and", ...SubRule[]]
type OrSubRule = ["or", ...SubRule[]]
type OneOrMoreRule = ["+", SubRule]
type ZeroOrOneRule = ["?", SubRule]
type ZeroOrMoreRule = ["*", SubRule]
function and(...r: SubRule[]) {
    return ["and", ...r] as AndSubRule
}
function or(...r: SubRule[]) {
    return ["or", ...r] as OrSubRule
}
function oneOrMore(r: SubRule) {
    return ["+", r] as OneOrMoreRule
}
function zeroOrMore(r: SubRule) {
    return ["*", r] as ZeroOrMoreRule
}
function zeroOrOne(r: SubRule) {
    return ["?", r] as ZeroOrOneRule
}
type AST = {
    kind?: string
    tokenStart: number
    tokenEnd: number
    start: number
    end: number
    children?: AST[]
}
function Lexer(source: string) {
    const knownTokens: (Token & { skip?: boolean })[] = []
    function token(name: string, regex: RegExp, skip?: boolean) {
        const def = {
            name,
            regex,
            skip,
            kind: "token" as const,
        }
        knownTokens.push(def)
        return def
    }
    return {
        token,
        tokenize(): TokenStream {
            let start = 0
            const res: TokenStream = []
            matchSource: while (start < source.length) {
                // Try each known token in registration order; the first match wins.
                for (const token of knownTokens) {
                    const match = source.slice(start).match(token.regex)
                    if (match) {
                        const tokenLength = match[0].length
                        const end = start + tokenLength
                        if (!token.skip) {
                            res.push({
                                token,
                                start,
                                end,
                            })
                        }
                        start += tokenLength
                        continue matchSource
                    }
                }
                throw new Error("unknown token at " + start + ": " + source.slice(start, start + 5) + "...")
            }
            return res
        },
    }
}
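// Tokenize sketch (hypothetical token names): for Lexer("1+2") with
// token("Num", /^\d+/) and token("Plus", /^\+/) registered, tokenize() returns
// [{ token: Num, start: 0, end: 1 }, { token: Plus, start: 1, end: 2 }, { token: Num, start: 2, end: 3 }]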
function Parser(tokenStream: TokenStream) {
    function read(readOffset: number) {
        // Return the token at readOffset, or EOF when past the end of the stream.
        return tokenStream[readOffset] ?? EOF
    }
    function rule(name: string, tokens: () => SubRule): ParserRule {
        return {
            name,
            tokens,
            kind: "rule" as const,
        }
    }
    function matchRule(rule: SubRule, offset: number, matchAll: boolean): null | { nodes: AST[]; tokenLength: number } {
        if (Array.isArray(rule)) {
            const [op, ...tokenRules] = rule
            switch (op) {
                case "and": {
                    // Every sub-rule must match in sequence.
                    const subMatches: AST[] = []
                    let childOffset = 0
                    for (const subRule of tokenRules) {
                        const subMatch = matchRule(subRule, offset + childOffset, false)
                        if (subMatch) {
                            childOffset += subMatch.tokenLength
                            subMatches.push(...subMatch.nodes)
                        } else {
                            return null
                        }
                    }
                    return {
                        nodes: subMatches,
                        tokenLength: childOffset,
                    }
                }
                case "or": {
                    // The first matching alternative wins; when matchAll is set,
                    // an alternative must also consume the rest of the token stream.
                    for (const subRule of tokenRules) {
                        const subMatch = matchRule(subRule, offset, matchAll)
                        if (subMatch && !(matchAll && subMatch.tokenLength + offset < tokenStream.length)) {
                            return subMatch
                        }
                    }
                    return null
                }
                case "?":
                case "+":
                case "*": {
                    // Match the sub-rule greedily; "?" stops after a single match.
                    const subRule = tokenRules[0]
                    let childOffset = offset
                    const subMatches: AST[] = []
                    while (childOffset < tokenStream.length) {
                        const subMatch = matchRule(subRule, childOffset, false)
                        if (subMatch) {
                            childOffset += subMatch.tokenLength
                            subMatches.push(...subMatch.nodes)
                            if (op === "?") {
                                break
                            }
                        } else {
                            break
                        }
                    }
                    if (!subMatches.length) {
                        // "*" and "?" may match nothing; "+" needs at least one match.
                        if (op === "*" || op === "?") {
                            return {
                                nodes: [],
                                tokenLength: 0,
                            }
                        } else {
                            return null
                        }
                    } else {
                        return {
                            nodes: subMatches,
                            tokenLength: childOffset - offset,
                        }
                    }
                }
            }
        } else if (rule.kind === "rule") {
            const subRules = rule.tokens()
            const subMatch = matchRule(subRules, offset, matchAll)
            if (subMatch) {
                if (!subMatch.nodes.length) {
                    return {
                        nodes: [],
                        tokenLength: 0,
                    }
                }
                const sourceLocation = sourceCode.slice(subMatch.nodes[0].start, subMatch.nodes[subMatch.nodes.length - 1].end)
                console.log(`matched rule ${rule.name} at ${sourceLocation}`)
                // Wrap the matched children in a single node named after the rule.
                return {
                    nodes: [
                        {
                            kind: rule.name,
                            tokenStart: offset,
                            tokenEnd: offset + subMatch.tokenLength,
                            start: subMatch.nodes[0].start,
                            end: subMatch.nodes[subMatch.nodes.length - 1].end,
                            children: subMatch.nodes,
                        },
                    ],
                    tokenLength: subMatch.tokenLength,
                }
            } else {
                return null
            }
        } else {
            // A plain token: match if the next token in the stream is of this kind.
            const nextToken = read(offset)
            if (nextToken === EOF) {
                return null
            } else if (nextToken.token === rule) {
                return {
                    nodes: [
                        {
                            kind: rule.name,
                            tokenStart: offset,
                            tokenEnd: offset + 1,
                            start: nextToken.start,
                            end: nextToken.end,
                        },
                    ],
                    tokenLength: 1,
                }
            } else {
                return null
            }
        }
    }
    function match(expr: ParserRule) {
        // Match expr from token 0; matchAll biases "or" alternatives toward
        // consuming the whole token stream.
        return matchRule(expr, 0, true)
    }
    return {
        rule,
        read,
        match,
    }
}
const sourceCode = `
let a = 3
let b = (10 + a) * 3 / 2
let c = a * b / 2
`
const lexer = Lexer(sourceCode)
const Let = lexer.token("LetKeyword", /^let/)
const Equal = lexer.token("Equal", /^\=/)
const Num = lexer.token("Number", /^\d+/)
const Add = lexer.token("Add", /^\+/)
const Id = lexer.token("Identifier", /^[a-z_A-Z]+/)
const Mul = lexer.token("Multiply", /^\*/)
const Div = lexer.token("Divide", /^\//)
const WS = lexer.token("WhiteSpace", /^\s+/, true)
const LBracket = lexer.token("LeftBracket", /^\(/)
const RBracket = lexer.token("RightBracket", /^\)/)
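// Note: tokenize() tries tokens in registration order, so LetKeyword is registered
// before Identifier; otherwise "let" would be lexed as an Identifier.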
const tokens = lexer.tokenize()
const someLangParser = Parser(tokens)
const mul = someLangParser.rule("mul", () => and(Mul, expr))
const div = someLangParser.rule("div", () => and(Div, expr))
const add = someLangParser.rule("add", () => and(Add, expr))
const expr = someLangParser.rule("expr", () =>
    //prettier-ignore
    or(
        and(
            or(
                Id,
                Num,
                and(LBracket, expr, RBracket)
            ),
            or(mul, div, add)
        ),
        and(LBracket, expr, RBracket),
        Id,
        Num
    )
)
const decl = someLangParser.rule("decl", () => {
    return and(Let, Id, Equal, expr)
})
const file = someLangParser.rule("file", () => {
    return zeroOrMore(decl)
})
const ast = someLangParser.match(file)
ast && printAst(ast.nodes[0])
function printAst(ast: AST, indent = 0) {
    if (ast.kind) {
        const tokenStart = tokens[ast.tokenStart]
        const tokenEnd = tokens[ast.tokenEnd - 1]
        console.log(`${" ".repeat(indent)} ${ast.kind} "${sourceCode.slice(tokenStart.start, tokenEnd.end)}"`)
    }
    if (ast.children) {
        ast.children.forEach(x => printAst(x, indent + 1))
    }
}
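// Running this file should log each matched rule during parsing and then print an
// indented tree: a "file" node containing one "decl" per let-statement, each with
// LetKeyword/Identifier/Equal children and a nested "expr" subtree, alongside the
// slice of source text each node spans.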