Skip to content

Instantly share code, notes, and snippets.

@samkcarlile
Last active February 10, 2023 21:54
Show Gist options
  • Save samkcarlile/4d5c88716f7fd425d2d1d8f735844ad4 to your computer and use it in GitHub Desktop.
Save samkcarlile/4d5c88716f7fd425d2d1d8f735844ad4 to your computer and use it in GitHub Desktop.
TypeScript Tokenizer
/** A token definition that also converts the matched text into a value. */
type TokenSpec<T = unknown> = { match: RegExp; value: (s: string) => T };

/** Maps each token type name to its pattern, or to a pattern plus transformer. */
type TokenConfig = Record<string, RegExp | TokenSpec>;

/**
 * Union of the token shapes produced for a given config: the `value` is the
 * transformer's return type for `TokenSpec` entries and `string` otherwise.
 */
type TokenTypes<T extends TokenConfig> = {
  [Type in keyof T]: T[Type] extends TokenSpec
    ? { type: Type; value: ReturnType<T[Type]['value']> }
    : { type: Type; value: string };
}[keyof T];

/**
 * Splits `str` into tokens using the supplied patterns.
 *
 * All patterns are combined into a single alternation of named groups, so:
 * - every key of `tokens` must be a valid regex group name;
 * - earlier entries win when several patterns match at the same position;
 * - only each pattern's `source` is used — its own flags are ignored and the
 *   combined expression always runs with the `g` and `m` flags;
 * - numbered backreferences inside a pattern refer to the combined expression.
 *
 * Zero-length matches never produce a token.
 *
 * @param str The input string
 * @param tokens An object of token types and their regex
 * @param strict If true, throws an error on unmatched text (including text
 *   after the last token). By default unmatched text has a token type of
 *   `undefined`
 * @returns The tokens in input order. Unmatched text is emitted as
 *   `{ type: undefined, value }`, which the declared return type does not
 *   describe.
 * @throws Error in strict mode when any part of the input matches no token.
 */
export function tokenize<T extends TokenConfig>(
  str: string,
  tokens: T,
  strict?: boolean
): TokenTypes<T>[] {
  const specs = Object.entries(tokens);
  const names = specs.map(([name]) => name);
  const results: { type: string | undefined; value: unknown }[] = [];

  // Records (or, in strict mode, rejects) a run of text no pattern matched.
  const pushUnmatched = (text: string, position: number): void => {
    if (text === '') return;
    if (strict) {
      throw new Error(`unknown token at position ${position}: "${text}"`);
    }
    results.push({ type: undefined, value: text });
  };

  // A Map (not a plain object) so token names such as "toString" or
  // "constructor" cannot resolve to inherited Object.prototype members.
  const transformers = new Map<string, (s: string) => unknown>();
  for (const [name, spec] of specs) {
    if (!(spec instanceof RegExp)) transformers.set(name, spec.value);
  }

  let lastIndex = 0;

  // With no patterns the combined expression would be empty and match
  // everywhere; the whole input is simply unmatched.
  if (specs.length > 0) {
    const parser = new RegExp(
      specs
        .map(([name, spec]) => {
          const pattern = spec instanceof RegExp ? spec : spec.match;
          return `(?<${name}>${pattern.source})`;
        })
        .join('|'),
      'gm'
    );

    let match: RegExpExecArray | null;
    while ((match = parser.exec(str)) !== null) {
      const text = match[0];
      if (text === '') {
        // exec() does not advance past a zero-length match on its own;
        // step forward manually to avoid looping forever.
        parser.lastIndex++;
        continue;
      }

      const groups = match.groups ?? {};
      const type = names.find((name) => groups[name] !== undefined);
      // Defensive: a non-empty match always fills exactly one outer group.
      if (type === undefined) continue;

      pushUnmatched(str.slice(lastIndex, match.index), lastIndex);

      const transform = transformers.get(type);
      results.push({ type, value: transform ? transform(text) : text });
      lastIndex = parser.lastIndex;
    }
  }

  pushUnmatched(str.slice(lastIndex), lastIndex);

  // The runtime shape is wider than TokenTypes<T> (unmatched tokens carry
  // `type: undefined`), so the public return type is asserted here.
  return results as unknown[] as TokenTypes<T>[];
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment