Skip to content

Instantly share code, notes, and snippets.

@klappy
Forked from borgar/Tiny JavaScript tokenizer.js
Last active September 16, 2019 20:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save klappy/7eb31af0c772031636b57e27fc9ad5e6 to your computer and use it in GitHub Desktop.
Save klappy/7eb31af0c772031636b57e27fc9ad5e6 to your computer and use it in GitHub Desktop.
A compact tokenizer written in JavaScript.
/**
 * Tiny tokenizer - https://gist.github.com/borgar/451393
 *
 * Splits a string into a flat sequence of classified tokens. On every step each
 * parser is run against the not-yet-consumed remainder of the input and the
 * match that starts closest to the current position wins (on a tie, the parser
 * listed first in `parsers` wins). Any text skipped over to reach that match is
 * emitted as a token of the default type. Concatenating the `token` fields of
 * the result reproduces the input string.
 *
 * Zero-length matches (e.g. from /\s*\/ or a bare lookahead) are ignored: they
 * consume no input and would otherwise keep the loop from ever advancing.
 * Parsers carrying the `g` or `y` flag have their `lastIndex` reset before each
 * use, so every search starts at the beginning of the remaining input.
 *
 * @param {String} string - string to be tokenized; any falsy value is treated as ''
 * @param {Object} parsers - map of token type to RegExp, e.g. { word:/\w+/, whitespace:/\s+/, punctuation:/[^\w\s]/ }
 * @param {String} deftok - type to label tokens that are not classified with the above parsers (defaults to 'unknown')
 * @return {Array} - array of objects => [{ token:"this", type:"word", matches:[] }, { token:" ", type:"whitespace", matches:[] }, ... ];
 *   `matches` holds the capture groups of the winning match and is absent on default-type tokens
 * @throws {Error} if string is truthy but not a String
 **/
export const classifyTokens = (string, parsers, deftok) => {
  // if string is undefined (or otherwise falsy), make it an empty string
  let remaining = (!string) ? '' : string;
  if (typeof remaining !== 'string') {
    throw new Error(`tokenizer.tokenize() string is not String: ${remaining}`);
  }

  const defaultType = deftok || 'unknown';
  // Own enumerable keys only; a missing parsers object simply yields no parsers.
  const types = Object.keys(parsers || {});
  const tokens = [];

  while (remaining) {
    let best = null;
    // Offset of the best match so far; "end of input" means nothing matched.
    let start = remaining.length;

    for (const type of types) {
      const parser = parsers[type];
      // Global/sticky regexes remember where the previous exec() stopped, but
      // that offset refers to an earlier, longer string - start over each time.
      if (parser.global || parser.sticky) {
        parser.lastIndex = 0;
      }
      const match = parser.exec(remaining);
      // Choose the best match if there are several, where "best" is the one
      // closest to the current starting point. Empty matches never qualify.
      if (match && match[0].length > 0 && match.index < start) {
        best = {
          token: match[0],
          type,
          matches: match.slice(1),
        };
        start = match.index;
      }
    }

    if (start > 0) {
      // There is text between the last token and the currently matched
      // token - push that out as default or "unknown".
      tokens.push({
        token: remaining.slice(0, start),
        type: defaultType,
      });
    }
    if (best) {
      // Push current token onto sequence.
      tokens.push(best);
    }
    remaining = remaining.slice(start + (best ? best.token.length : 0));
  }

  return tokens;
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment