Skip to content

Instantly share code, notes, and snippets.

@zheplusplus
Last active December 20, 2015 08:39
Show Gist options
  • Save zheplusplus/6102238 to your computer and use it in GitHub Desktop.
Save zheplusplus/6102238 to your computer and use it in GitHub Desktop.
Automaton-based tokenizer
// Demo: configures a tokenizer for a tiny C-like language and prints the
// tokens of a sample program.
// Use the CommonJS module when `require` exists, otherwise fall back to the
// `window.tokenizer` global.
var tknz = typeof require === 'undefined' ? window.tokenizer
: require('./tokenizer');
var t = tknz.Tokenizer();
t
// Single-character tokens; every character of one call gets the same type.
.simpleSymbols('=+-*/%<>!.', 'operator')
.simpleSymbols('(', 'op_paren')
.simpleSymbols(')', 'cl_paren')
.simpleSymbols('{', 'op_brace')
.simpleSymbols('}', 'cl_brace')
.simpleSymbols(';', 'semicolon')
// Spaces, tabs and carriage returns are skipped; note that '\n' is not listed.
.ignore(' \t\r')
// integer: one or more digits (loop() starts the rule itself here).
.loop(tknz.DIGITS)
.accept('integer')
// identifier: a letter followed by any number of letters or digits.
.startWith(tknz.LETTERS)
.loop(tknz.DIGITS + tknz.LETTERS)
.accept('identifier')
// Keywords and two-character operators are registered last: fixed() overlays
// the states already present for their characters, whereas simpleSymbols() and
// startWith() reject a character that already has an entry.
// With no type argument the token type is the keyword text itself.
.fixed('if')
.fixed('for')
.fixed('<=', 'operator')
.fixed('>=', 'operator')
.fixed('==', 'operator')
.fixed('!=', 'operator')
;
console.log(t.tokenize('for (i = 0; i < 10; i = i + 1) { if (i % 3 == 0) { console.log(i); } }'));
-function(exports) {
/**
 * Calls `cb` once per element of `string`, in index order, passing only the
 * element itself.
 *
 * @param {string} string - indexable value with a `length` (strings here).
 * @param {function(string)} cb - receives each single-character string.
 */
function each(string, cb) {
  var index = 0;
  // Length is re-read on every pass, exactly like a classic for loop.
  while (index < string.length) {
    cb(string[index]);
    index += 1;
  }
}
// Character classes exported for use as arguments to the builder methods
// (startWith, loop, simpleSymbols, ignore). Each is a plain string of the
// characters it contains.
// Each assignment is its own statement; the original chained the first three
// with the comma operator.
exports.DIGITS_EXC_0 = '123456789';
exports.DIGITS = exports.DIGITS_EXC_0 + '0';
exports.LOWERCASES = 'qwertyuiopasdfghjklzxcvbnm';
exports.UPPERCASES = exports.LOWERCASES.toUpperCase();
exports.LETTERS = exports.LOWERCASES + exports.UPPERCASES;
exports.Tokenizer = function() {
var entryState = {};
var ignoreState = {
_ignore: true
};
var stateTrace = [];
function started() {
return 0 < stateTrace.length;
}
function currentState() {
if (!started()) {
throw 'not started';
}
return stateTrace[stateTrace.length - 1];
}
return {
simpleSymbols: function(symbols, name) {
var symbolState = {
_type: name || 'symbol'
};
each(symbols, function(ch) {
if (entryState[ch]) {
throw 'duplicate entry: ' + ch;
}
entryState[ch] = symbolState;
});
return this;
},
startWith: function(next) {
if (started()) {
throw 'already started';
}
stateTrace.push({});
each(next, function(ch) {
if (entryState[ch]) {
throw 'duplicate entry: ' + ch;
}
entryState[ch] = currentState();
});
return this;
},
fixed: function(image, type) {
if (!image) {
throw 'empty fixed token';
}
function cloneState(s) {
var clone = {}, key;
if (s) {
for (key in s) {
clone[key] = s[key];
}
}
return clone;
}
var passState = entryState;
var lastChar = image[image.length - 1];
type = type || image;
each(image.substr(0, image.length - 1), function(ch) {
var shadowState = cloneState(passState[ch]);
passState[ch] = shadowState;
passState = shadowState;
});
var shadowState = cloneState(passState[lastChar]);
shadowState._type = type;
passState[lastChar] = shadowState;
return this;
},
loop: function(next) {
if (!started()) {
this.startWith(next);
}
each(next, function(ch) {
currentState()[ch] = currentState();
});
return this;
},
ignore: function(next) {
each(next, function(ch) {
entryState[ch] = ignoreState;
ignoreState[ch] = ignoreState;
});
return this;
},
accept: function(type) {
currentState()._type = type;
stateTrace = [];
return this;
},
tokenize: function(input) {
var state = entryState;
var token = [];
var result = [];
var me = this;
function resetConsume(ch) {
token = [];
state = entryState;
nextChar(ch);
}
function nextChar(ch) {
if (state[ch]) {
state = state[ch];
if (!state._ignore) {
token.push(ch);
}
return;
}
if (state._type) {
result.push({
token: token.join(''),
type: state._type
});
return resetConsume(ch);
}
if (state._ignore) {
return resetConsume(ch);
}
me.error('unexpected character');
}
each(input, nextChar);
if (token.length !== 0 && state) {
result.push({
token: token.join(''),
type: state._type
});
}
return result;
},
error: function(error) {
throw error;
}
};
};
}(typeof exports === 'undefined' ? window.tokenizer = {} : exports);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment