Skip to content

Instantly share code, notes, and snippets.

@JakeCoxon
Created December 2, 2022 15:56
Show Gist options
  • Save JakeCoxon/51f72be4eb278e1450c3aa2e5772492b to your computer and use it in GitHub Desktop.
Save JakeCoxon/51f72be4eb278e1450c3aa2e5772492b to your computer and use it in GitHub Desktop.
Tokenizer for a Python-like, indentation-sensitive language
/**
 * Splits source text of a Python-like, indentation-sensitive language into a flat token list.
 *
 * Every token has the shape `{ value, type, lineNumber, tokenIndex }`:
 * `value` is the matched text, `lineNumber` is the zero-based line the token starts on and
 * `tokenIndex` is its character offset in `input`.
 *
 * Token types: KEYWORD, IDENTIFIER, LITERAL, SPECIALNUMBER, NUMBER, OPENPAREN, CLOSEPAREN,
 * PUNCTUATION, NEWLINE, INDENT and OUTDENT. Whitespace and comments are consumed but not emitted.
 *
 * Layout rules:
 * - Only leading spaces count as indentation; tabs are treated as ordinary whitespace.
 * - A line indented deeper than the enclosing block emits one INDENT (carrying the spaces).
 * - A shallower line emits one OUTDENT per block it closes; the first OUTDENT carries the
 *   leading spaces, the others are empty. Blocks still open at end of input are closed with
 *   empty OUTDENT tokens.
 * - Blank lines and comment-only lines never change the indentation level.
 * - Inside `(`, `[` or `{` newlines and indentation are insignificant and emit nothing.
 *
 * @param input Source text to tokenize.
 * @returns The tokens in source order.
 * @throws Error when no token pattern matches at the current position, or when a line dedents
 *   to a column that does not match any enclosing block. The message contains the offending
 *   line and a caret under the failing column.
 */
function tokenize(input: string) {
  type Token = { value: string; type: string; lineNumber: number; tokenIndex: number };

  // Sticky (`y`) patterns match exactly at `lastIndex`, so the input never has to be re-sliced.
  // Insertion order is the match priority.
  const regexes = {
    // Trailing `\b` keeps identifiers such as `format` or `input` from being split at `for`/`in`.
    KEYWORD:
      /(?:and|assert|as|break|class|continue|def|elif|else|false|for|if|import|in|is|lambda|null|not|or|pass|return|try|while|with)\b/y,
    IDENTIFIER: /[a-zA-Z_][a-zA-Z_0-9]*/y,
    LITERAL: /(?:"(?:[^"\\]|\\.)*"|'(?:[^'\\]|\\.)*')/y,
    SPECIALNUMBER: /0[xXbB][0-9a-zA-Z_]+/y,
    NUMBER: /-?[0-9][0-9_]*(\.[0-9_]+)?/y,
    // Runs to the end of the line; may be empty and may be the last thing in the input.
    COMMENT: /#[^\n]*/y,
    OPENPAREN: /[\[\{\(]/y,
    CLOSEPAREN: /[\]\}\)]/y,
    PUNCTUATION: /(?:==|!=|[:,=])/y,
    NEWLINE: /\n/y,
    // Any whitespace except a newline, so line breaks are always seen as NEWLINE.
    WHITESPACE: /[^\S\n]+/y,
  };
  const leadingSpaces = / +/y;

  const tokens: Token[] = [];
  const indentStack: number[] = [0]; // column of every open block; the base 0 is never popped
  let lineNumber = 0;
  let lineStart = 0;
  let pos = 0;
  let parentheses = 0;

  const currentIndent = (): number => indentStack[indentStack.length - 1] ?? 0;

  // Tries `regex` at the cursor. On success advances the cursor, keeps the line bookkeeping
  // in sync with any newline inside the match, and returns the (not yet emitted) token.
  const match = (regex: RegExp, type: string): Token | undefined => {
    regex.lastIndex = pos;
    const found = regex.exec(input);
    if (!found) return undefined;
    const value = found[0];
    const token: Token = { value, type, lineNumber, tokenIndex: pos };
    pos += value.length;
    for (let nl = value.indexOf("\n"); nl !== -1; nl = value.indexOf("\n", nl + 1)) {
      lineNumber++;
      lineStart = token.tokenIndex + nl + 1;
    }
    return token;
  };

  // Builds an error that quotes the current line with a caret under the cursor.
  const errorAt = (reason: string): Error => {
    const lineEnd = input.indexOf("\n", lineStart);
    const text = input.substring(lineStart, lineEnd === -1 ? input.length : lineEnd);
    const caret = " ".repeat(pos - lineStart);
    return new Error(`${reason} ${lineNumber} \n${text}\n${caret}^-- here`);
  };

  while (pos < input.length) {
    // A newline at the start of a logical line: emit it and start over on the next line.
    const lineBreak = match(regexes.NEWLINE, "NEWLINE");
    if (lineBreak) {
      tokens.push(lineBreak);
      continue;
    }

    // Indentation is only meaningful on lines that carry code.
    const leading = match(leadingSpaces, "");
    const next = input[pos];
    const carriesCode = next !== undefined && next !== "\n" && next !== "#";
    if (carriesCode) {
      const indentText = leading?.value ?? "";
      const indentIndex = leading?.tokenIndex ?? pos;
      const numSpaces = indentText.length;
      if (numSpaces > currentIndent()) {
        indentStack.push(numSpaces);
        tokens.push({ value: indentText, type: "INDENT", lineNumber, tokenIndex: indentIndex });
      } else if (numSpaces < currentIndent()) {
        let value = indentText;
        let at = indentIndex;
        while (currentIndent() > numSpaces) {
          indentStack.pop();
          tokens.push({ value, type: "OUTDENT", lineNumber, tokenIndex: at });
          value = "";
          at = pos;
        }
        if (currentIndent() !== numSpaces) throw errorAt("Inconsistent dedent on line");
      }
    }

    // Tokens after the indentation, or within a grouped expression
    while (pos < input.length && (parentheses > 0 || input[pos] !== "\n")) {
      let token: Token | undefined;
      for (const [type, regex] of Object.entries(regexes)) {
        token = match(regex, type);
        if (token) break;
      }
      if (!token) throw errorAt("Unable to tokenize line");

      switch (token.type) {
        case "WHITESPACE":
        case "COMMENT":
        case "NEWLINE": // only reachable inside a group, where line breaks are insignificant
          break;
        case "OPENPAREN":
          parentheses++;
          tokens.push(token);
          break;
        case "CLOSEPAREN":
          // Clamp so a stray closer cannot make later groups lose their newline suppression.
          parentheses = Math.max(0, parentheses - 1);
          tokens.push(token);
          break;
        default:
          tokens.push(token);
      }
    }
  }

  // Close every block that is still open at end of input.
  while (indentStack.length > 1) {
    indentStack.pop();
    tokens.push({ value: "", type: "OUTDENT", lineNumber, tokenIndex: pos });
  }
  return tokens;
}
// Demo: tokenizes a sample program and renders it into the `#app` element, first as the
// source text with every token highlighted by type, then as the raw token list in JSON.
(() => {
  const input = `
assert false, "unexp\\"ected thing"
if something == true:
do(something) # this is some cool
foo(
bar, baz, baw)
foo(1,2, 0xff)
`;
  const out = tokenize(input);

  // Single-pass escaping so an already produced entity is never escaped a second time.
  const entities: Record<string, string> = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    "'": "&#39;",
    '"': "&quot;",
  };
  const escapeHtml = (text: string): string =>
    text.replace(/[&<>'"]/g, (ch) => entities[ch] ?? ch);

  const colors: Record<string, string> = {
    KEYWORD: "#ff6767",
    IDENTIFIER: "pink",
    LITERAL: "#8b8bff",
    SPECIALNUMBER: "#b0ffb0",
    NUMBER: "#b0ffb0",
    COMMENT: "lightgreen",
    OPENPAREN: "#ff6eff",
    CLOSEPAREN: "#ff6eff",
    PUNCTUATION: "#ff91ff",
    NEWLINE: "white",
    WHITESPACE: "lightgrey",
    INDENT: "lightgrey",
  };

  const html = (() => {
    let markup = "";
    let last = 0;
    for (const token of out) {
      // A token without a position is rendered where the previous one ended.
      const start: number = token.tokenIndex ?? last;
      // Text the tokenizer skipped (whitespace, comments) is copied through unhighlighted.
      if (start > last) markup += escapeHtml(input.substring(last, start));
      const color = colors[token.type] ?? "transparent";
      markup += `<span style="background-color: ${color}">${escapeHtml(token.value)}</span>`;
      last = start + token.value.length;
    }
    // Anything after the last token, e.g. a trailing comment.
    return markup + escapeHtml(input.substring(last));
  })();

  const str = escapeHtml(JSON.stringify(out, null, 2));

  const app = document.querySelector("#app");
  if (!app) throw new Error('Demo mount point "#app" was not found in the document');
  app.innerHTML = `<pre>${html}</pre><br><pre>${str}</pre>`;
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment