Created
September 22, 2013 21:46
-
-
Save tmcw/6664186 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var EventEmitter = require('events').EventEmitter; | |
var util = require('util'); | |
var assert = require('assert'); | |
var Transform = require('stream').Transform; | |
var disect = require('disect'); | |
// Default token-check callback: a no-op, so each rule's own type is used as-is.
function noop() {}
/**
 * A Transform stream that splits incoming text into typed tokens.
 * Readable side is in object mode and emits Token instances.
 *
 * @param {Function} [check_token_cb] - optional callback (str, rule) that may
 *   return a type string overriding the matched rule's type.
 * @param {Object} [options] - standard stream.Transform options.
 */
function Tokenizer (check_token_cb, options) {
  if(!(this instanceof Tokenizer)) {
    // BUG FIX: the no-`new` path previously dropped `options`;
    // forward both arguments so both call styles behave identically.
    return new Tokenizer(check_token_cb, options);
  }
  Transform.call(this, options);
  this._readableState.objectMode = true;
  this._buffered = ""; // we buffer untokenized data between writes
  this._regexes = []; // should contain objects
                      // with regex[RegExp] and type[String]
  this._ignored = {}; // a hash of ignored token types
                      // these will be parsed but not emitted
  this._checkToken = check_token_cb || noop;
}
util.inherits(Tokenizer, Transform);
/**
 * stream.Transform hook: tokenize an incoming chunk.
 * Feeds the chunk to _tokenize in small slices so the buffered prefix stays
 * short; tokenizing errors are reported through the stream callback.
 * (Removed: leftover `console.log` debug output and commented-out dead code.)
 */
Tokenizer.prototype._transform = function _transform(chunk, encoding, callback) {
  chunk = chunk.toString();
  try {
    var index = 0, step = 64;
    while(index < chunk.length) {
      this._tokenize(chunk.substr(index, step));
      index += step;
    }
    callback();
  } catch(e) {
    callback(e);
  }
};
/**
 * Return the first registered rule whose matcher accepts `str`, or null.
 * A rule's matcher may be a RegExp or a predicate function — addRule
 * explicitly accepts both, but the original implementation called `.test()`
 * unconditionally, which threw a TypeError for function matchers.
 */
Tokenizer.prototype._getMatchingRule = function _getMatchingRule(str) {
  for (var i = 0; i < this._regexes.length; ++i) {
    var rule = this._regexes[i];
    var matcher = rule.regex;
    var matches = (typeof matcher === 'function')
      ? matcher(str)
      : matcher.test(str);
    if (matches) {
      return rule;
    }
  }
  return null;
};
/**
 * Tokenize as much of `data` as possible. The longest prefix matching some
 * rule becomes a token; a string that a rule still matches entirely is kept
 * in this._buffered so a later chunk may extend the token (unless `nobuffer`,
 * used by _flush to force the remainder out).
 *
 * @throws {SyntaxError} when no rule matches even a one-character prefix.
 */
Tokenizer.prototype._tokenize = function _tokenize(data, nobuffer) {
  // in case we buffered data on previous writes
  data = this._buffered + data;
  this._buffered = '';
  if(!data.length) {
    return;
  }
  var self = this;
  // binary-search the largest index such that data[0..index) matches a rule
  var maxIndex = disect(0, data.length, function (index) {
    var buf = data.substring(0, index + 1);
    return self._getMatchingRule(buf) === null;
  });
  if(maxIndex === 0) {
    // no match found
    throw new SyntaxError('could not tokenize ' + JSON.stringify(data));
  }
  else if (maxIndex === data.length && !nobuffer) {
    // the whole string is matching
    this._buffered = data;
    return;
  }
  else {
    // some substring is matching: emit it, then tokenize the remainder.
    // BUG FIX: the recursive call was commented out, so everything after
    // the first emitted token was silently discarded.
    var str = data.substring(0, maxIndex);
    var rule = this._getMatchingRule(str);
    if(!rule) {
      // should be impossible: maxIndex > 0 implies some rule matched str.
      // BUG FIX: this case was previously swallowed silently.
      throw new Error('no rule matched ' + JSON.stringify(str));
    }
    this._gotToken(str, rule);
    this._tokenize(data.substring(maxIndex), nobuffer);
  }
};
/**
 * stream.Transform hook: once the writable side has ended, force out
 * whatever is still buffered (the `true` flag disables re-buffering).
 */
Tokenizer.prototype._flush = function _flush(callback) {
  var self = this;
  process.nextTick(function flushBuffered() {
    try {
      self._tokenize('', true);
      callback();
    } catch(err) {
      callback(err);
    }
  });
};
/**
 * A typed token. Inherits from String so tokens can be used in string
 * contexts; `toString`/`valueOf` yield the raw content.
 * (Fixed: the constructor was a function expression confusingly named
 * `String`, shadowing the global inside its own scope, and `toString` was
 * recreated as a per-instance closure instead of living on the prototype.)
 *
 * @param {*} content - the matched substring.
 * @param {string} type - the token type.
 */
function Token(content, type) {
  this.content = content;
  this.type = type;
}
util.inherits(Token, String);
Token.prototype.toString = function toString() {
  return this.content.toString();
};
Token.prototype.valueOf = function valueOf() {
  return this.content;
};
/**
 * Emit one matched token: wrap it in a Token, push it to the readable side,
 * and fire the 'token' event.
 */
Tokenizer.prototype._gotToken = function _gotToken(str, rule) {
  // the user callback may override the rule's default type
  var tokenType = this._checkToken(str, rule) || rule.type;
  if(this._ignored[tokenType]) {
    return; // parsed, but deliberately not emitted
  }
  var emitted = new Token(str, tokenType);
  this.push(emitted);
  this.emit('token', emitted, tokenType);
};
/**
 * Register a tokenizing rule. Call styles:
 *   addRule(regex, type)     — RegExp (or predicate function) plus type name
 *   addRule([regex, type])   — the same, as a pair (built-in rule format)
 *   addRule('word')          — name of a built-in rule (Tokenizer.word, …)
 *
 * @throws {Error} on missing parameters or an unknown built-in rule name
 *   (previously an unknown name fell through to the misleading
 *   'No parameters specified' error).
 */
Tokenizer.prototype.addRule = function addRule(regex, type) {
  if(!type) {
    if(Array.isArray(regex)) {
      return this.addRule(regex[0], regex[1]);
    }
    else if(regex) {
      var builtin = Tokenizer[regex];
      if(!builtin) {
        throw new Error('Unknown built-in rule ' + JSON.stringify(regex));
      }
      return this.addRule(builtin);
    }
    else {
      throw new Error('No parameters specified');
    }
  }
  assert.ok((regex instanceof RegExp) || (typeof regex === 'function'),
    'rule must be a RegExp or a function');
  assert.equal(typeof type, 'string', 'rule type must be a string');
  this._regexes.push({regex:regex,type:type});
};
/**
 * Set some token types to be ignored: they are parsed but not emitted.
 * Accepts a single type or an array of types.
 *
 * @param {string|string[]} ignored - token type(s) to suppress.
 */
Tokenizer.prototype.ignore = function ignore(ignored) {
  // BUG FIX: the original tested `typeof ignore === 'array'`, which is never
  // true (typeof never yields 'array', and it inspected the function itself,
  // not the argument), so arrays were stored as a single stringified key.
  if(Array.isArray(ignored)) {
    for (var i = 0; i < ignored.length; ++i) {
      this.ignore(ignored[i]);
    }
    return;
  }
  this._ignored[ignored] = true;
};
module.exports = Tokenizer;

// Built-in rules: [pattern, type] pairs in the format addRule accepts,
// reachable by name via addRule('whitespace') / addRule('word') / addRule('number').
Tokenizer.whitespace = [/^(\s)+$/, 'whitespace']; // runs of whitespace
Tokenizer.word = [/^\w+$/, 'word'];               // identifier-like words
Tokenizer.number = [/^\d+(\.\d+)?$/, 'number'];   // integers and decimals
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment