Skip to content

Instantly share code, notes, and snippets.

Created September 22, 2013 21:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tmcw/6664186 to your computer and use it in GitHub Desktop.
Save tmcw/6664186 to your computer and use it in GitHub Desktop.
var EventEmitter = require('events').EventEmitter;
var util = require('util');
var assert = require('assert');
var Transform = require('stream').Transform;
var disect = require('disect');
/**
 * Do-nothing function; used as the default token checker when the caller
 * does not supply one.
 *
 * @returns {undefined}
 */
function noop() {
  return undefined;
}
/**
 * Transform stream that cuts incoming text into typed tokens.
 *
 * May be called with or without `new`.
 *
 * @param {Function} [check_token_cb] - invoked as check_token_cb(str, rule) for
 *   every matched token; a truthy return value replaces the rule's type.
 *   Defaults to a no-op.
 * @param {Object} [options] - forwarded unchanged to the stream.Transform
 *   constructor.
 */
function Tokenizer (check_token_cb, options) {
  if (!(this instanceof Tokenizer)) {
    // forward BOTH arguments so options are not lost when `new` is omitted
    return new Tokenizer(check_token_cb, options);
  }

  Transform.call(this, options);
  this._readableState.objectMode = true;
  this._buffered = ''; // we buffer untokenized data between writes
  this._regexes = [];  // should contain objects
                       // with regex[RegExp] and type[String]
  this._ignored = {};  // a hash of ignored token types
                       // these will be parsed but not emitted
  this._checkToken = check_token_cb || noop;
}
util.inherits(Tokenizer, Transform);
/**
 * stream.Transform hook: tokenizes one written chunk.
 *
 * The chunk is converted to a string and handed to `_tokenize` in slices of
 * at most 64 characters. `callback` is always invoked exactly once: with the
 * thrown error if tokenizing fails, with no argument otherwise. Without that
 * call the stream would never ask for the next chunk.
 *
 * @param {Buffer|string} chunk - data written to the stream.
 * @param {string} encoding - unused; the chunk is converted with toString().
 * @param {Function} callback - completion callback, called as callback(err).
 */
Tokenizer.prototype._transform = function _transform(chunk, encoding, callback) {
  var text = chunk.toString();
  var step = 64;
  console.log('got chunk');
  try {
    for (var index = 0; index < text.length; index += step) {
      this._tokenize(text.substring(index, index + step));
    }
  } catch (e) {
    // report tokenizer failures through the stream instead of swallowing them
    callback(e);
    return;
  }
  // kept outside the try block so an exception thrown by the callback itself
  // cannot trigger a second callback invocation
  callback();
};
/**
 * Finds the first registered rule that accepts `str`.
 *
 * Rules are tried in registration order. A rule's `regex` is normally a
 * RegExp; because addRule also accepts a function, a matcher without a
 * `test` method is called directly as a predicate.
 * NOTE(review): treating function matchers as (str) => boolean predicates is
 * an assumption — confirm against intended usage.
 *
 * @param {string} str - candidate token text.
 * @returns {Object|null} the matching rule object, or null if none matches.
 */
Tokenizer.prototype._getMatchingRule = function _getMatchingRule(str) {
  for (var i = 0; i < this._regexes.length; ++i) {
    var rule = this._regexes[i];
    var matcher = rule.regex;
    var matched = typeof matcher.test === 'function' ? matcher.test(str) : matcher(str);
    if (matched) {
      return rule;
    }
  }
  return null;
};
/**
 * Cuts `data` (prefixed with anything buffered by earlier calls) into tokens
 * and reports each one through `_gotToken`.
 *
 * Tokens are consumed in a loop rather than by recursion, so the whole input
 * is processed without growing the call stack.
 *
 * When the entire remaining text still matches a rule and `nobuffer` is
 * falsy, the text is kept in `_buffered` instead of being reported, since a
 * later write may extend the token.
 *
 * NOTE(review): assumes disect(min, max, fn) returns the smallest index in
 * [min, max) for which fn(index) is true, or max when there is none — confirm
 * against the disect package.
 *
 * @param {string} data - newly received text.
 * @param {boolean} [nobuffer] - when truthy, report a fully matching
 *   remainder instead of buffering it (used when flushing).
 * @throws {SyntaxError} when the start of the remaining text matches no rule.
 * @throws {Error} when no rule matches the prefix selected by the bisection.
 */
Tokenizer.prototype._tokenize = function _tokenize(data, nobuffer) {
  var self = this;
  // in case we buffered data on previous writes
  var pending = this._buffered + data;
  this._buffered = '';

  // true when the prefix ending at `index` (inclusive) matches no rule
  function prefixHasNoRule(index) {
    return self._getMatchingRule(pending.substring(0, index + 1)) === null;
  }

  while (pending.length) {
    var maxIndex = disect(0, pending.length, prefixHasNoRule);

    if (maxIndex === 0) {
      // no match found
      throw new SyntaxError('could not tokenize ' + JSON.stringify(pending));
    }
    if (maxIndex === pending.length && !nobuffer) {
      // the whole string is matching: wait for more data
      this._buffered = pending;
      return;
    }

    // some substring is matching
    var str = pending.substring(0, maxIndex);
    var rule = this._getMatchingRule(str);
    if (!rule) {
      // fail with a clear message rather than a TypeError further down
      throw new Error('no rule matches ' + JSON.stringify(str));
    }
    this._gotToken(str, rule);
    pending = pending.substring(maxIndex);
  }
};
/**
 * stream.Transform hook: called once all written data has been consumed.
 *
 * On the next tick, tokenizes whatever is still buffered with buffering
 * disabled, then invokes `callback` exactly once — with the thrown error if
 * tokenizing fails, with no argument otherwise.
 *
 * @param {Function} callback - completion callback, called as callback(err).
 */
Tokenizer.prototype._flush = function _flush(callback) {
  var self = this;
  process.nextTick(function () {
    try {
      self._tokenize('', true);
    } catch (e) {
      // surface the failure through the stream instead of swallowing it
      callback(e);
      return;
    }
    callback();
  });
};
/**
 * A token: the matched text plus the type assigned to it.
 *
 * Instances inherit from the built-in String prototype. `toString` is
 * installed on each instance and `valueOf` on the prototype so that string
 * conversion and concatenation yield the token's content.
 *
 * NOTE(review): the function expression is named `String`, presumably so
 * instances report that constructor name — confirm. Inside the function body
 * the identifier `String` therefore refers to this constructor, not to the
 * global String.
 *
 * @param {string} content - the matched text.
 * @param {string} type - the token type.
 */
var Token = function String (content, type) {
  this.content = content;
  this.type = type;
  this.toString = function () {
    return this.content.toString();
  };
};
util.inherits(Token, String);

/**
 * @returns {string} the token's content.
 */
Token.prototype.valueOf = function valueOf() {
  return this.content;
};
/**
 * Handles one matched token: resolves its type, drops it if that type is
 * ignored, otherwise wraps it in a Token and emits a 'token' event with
 * (token, type).
 *
 * NOTE(review): the token is only announced through the 'token' event;
 * nothing is pushed to the readable side of the stream here — confirm whether
 * a this.push(token) is intended.
 *
 * @param {string} str - the matched text.
 * @param {Object} rule - the rule that matched; its `type` is used unless the
 *   token checker returns a truthy replacement.
 */
Tokenizer.prototype._gotToken = function _gotToken(str, rule) {
  // notify the token checker; it may override the rule's type
  var type = this._checkToken(str, rule) || rule.type;
  if (this._ignored[type]) {
    return;
  }
  var token = new Token(str, type);
  this.emit('token', token, type);
};
/**
 * Registers a tokenizing rule.
 *
 * Accepted call shapes:
 *   addRule(matcher, type)   - matcher is a RegExp or a function
 *   addRule([matcher, type]) - the same, packed in an array
 *   addRule(name)            - looks up Tokenizer[name] (a built-in rule)
 *
 * The validated rule is appended to `_regexes` as { regex, type }, which is
 * the shape the matching code reads.
 *
 * @param {RegExp|Function|Array|string} regex - matcher, [matcher, type]
 *   pair, or built-in rule name.
 * @param {string} [type] - token type assigned to matches.
 * @throws {Error} 'No parameters specified' when nothing usable is given.
 * @throws {AssertionError} when matcher or type has the wrong kind.
 */
Tokenizer.prototype.addRule = function addRule(regex, type) {
  // this is useful for built-in rules
  if (!type) {
    if (Array.isArray(regex)) {
      return this.addRule(regex[0], regex[1]);
    }
    else if (regex) {
      return this.addRule(Tokenizer[regex]);
    }
    else {
      throw new Error('No parameters specified');
    }
  }
  assert.ok((regex instanceof RegExp) || (typeof regex === 'function'));
  assert.strictEqual(typeof type, 'string');
  // without this the rule would be validated and then discarded
  this._regexes.push({ regex: regex, type: type });
};
/**
 * Set some tokens to be ignored. These are still parsed but won't be emitted.
 *
 * @param {string|string[]} ignored - a token type, or an array of token types.
 */
Tokenizer.prototype.ignore = function ignore(ignored) {
  // `typeof` never yields 'array', and the check must look at the argument
  // (`ignored`), not at this function (`ignore`)
  if (Array.isArray(ignored)) {
    for (var i = 0; i < ignored.length; ++i) {
      this.ignore(ignored[i]);
    }
    return;
  }
  this._ignored[ignored] = true;
};
module.exports = Tokenizer;
// Built-in rules, exposed as static [pattern, type] pairs on the constructor.
// Every pattern is anchored at both ends, so it must match the whole
// candidate string.
Object.assign(Tokenizer, {
  // one or more whitespace characters
  whitespace: [/^(\s)+$/, 'whitespace'],
  // one or more word characters
  word: [/^\w+$/, 'word'],
  // digits with an optional fractional part
  number: [/^\d+(\.\d+)?$/, 'number']
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment