Skip to content

Instantly share code, notes, and snippets.

@thejefflarson
Created January 13, 2010 17:04
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thejefflarson/276369 to your computer and use it in GitHub Desktop.
Save thejefflarson/276369 to your computer and use it in GitHub Desktop.
Tokenize JavaScript
function(script){
/**
 * Split a string of JavaScript source into a flat array of tokens.
 *
 * Whitespace is consumed but never emitted. Every other piece of input becomes
 * an object of the form
 *
 *     { type, value, from, to, line }
 *
 * - `type` is one of "identifier", "number", "string", "regex",
 *   "singleLineComment", "multiLineComment" or "operator". Keywords are
 *   reported as "identifier" and every punctuator as "operator".
 * - `value` is the exact source text of the token. A single-line comment
 *   includes its terminating line break when there is one.
 * - `from` / `to` are the start (inclusive) and end (exclusive) offsets of the
 *   token in the input.
 * - `line` is the 1-based line on which the token starts.
 *
 * Template literals are not recognised.
 *
 * @param {string} string JavaScript source text; null/undefined is treated as "".
 * @returns {Array<Object>} the tokens, in source order.
 */
var tokens = function(string){
    var source = string == null ? "" : String(string),
        rest = source, // unconsumed tail of the input; every matcher regex is anchored to its start
        i = 0,         // offset of `rest` inside `source`
        line = 1,      // 1-based line number at offset `i`
        tks = [],      // tokens collected so far
        matched = false,
        key = 0,
        // CRLF counts as one break. Only ever used with String#match, which
        // resets lastIndex, so sharing this global regex is safe.
        lineBreaks = /\r\n|[\r\n\u2028\u2029]/g,
        // Keywords after which a "/" starts a regex literal rather than a division.
        regexPrefixKeyword = /^(?:return|typeof|instanceof|in|new|delete|void|throw|case|do|else)$/,

        // Try `regex` at the current position. On a match, emit a token of
        // `type` (nothing is emitted when `type` is null), then advance the
        // position and the line counter past the matched text.
        tokenizer = function(regex, type){
            var found = regex.exec(rest),
                text,
                breaks;
            if (found === null || found[0].length === 0){
                return false;
            }
            text = found[0];
            if (type){
                tks.push({
                    type: type,
                    value: text,
                    from: i,
                    to: i + text.length,
                    line: line // line the token starts on
                });
            }
            breaks = text.match(lineBreaks);
            if (breaks !== null){
                line += breaks.length;
            }
            i += text.length;
            rest = source.slice(i);
            return true;
        },

        // Decide from the previous significant token whether a "/" here can
        // start a regex literal. After an operand (name, literal, ")", "]",
        // postfix "++"/"--") it is a division operator instead.
        regexAllowed = function(){
            var k, prev;
            for (k = tks.length - 1; k >= 0; k--){
                prev = tks[k];
                if (prev.type === "singleLineComment" || prev.type === "multiLineComment"){
                    continue; // comments do not influence the grammar
                }
                if (prev.type === "identifier"){
                    return regexPrefixKeyword.test(prev.value);
                }
                if (prev.type === "operator"){
                    return prev.value !== ")" && prev.value !== "]" &&
                           prev.value !== "++" && prev.value !== "--";
                }
                return false; // number, string or regex: an operand precedes us
            }
            return true; // start of input
        },

        matchers = {
            identifier : function(){
                return tokenizer(/^[a-zA-Z$_][\w$]*/, "identifier");
            },
            number : function(){
                // hex, or decimal with optional fraction / leading dot / exponent
                return tokenizer(/^(?:0x[0-9a-f]+|(?:\d+(?:\.\d*)?|\.\d+)(?:e[+\-]?\d+)?)/i, "number");
            },
            singleLineComment : function(){
                // `$` lets a comment on the last line match without a trailing newline
                return tokenizer(/^\/\/.*(?:\r\n|[\r\n\u2028\u2029]|$)/, "singleLineComment");
            },
            multiLineComment : function(){
                // [\s\S] instead of "." so the comment may span lines
                return tokenizer(/^\/\*[\s\S]*?\*\//, "multiLineComment");
            },
            regex : function(){
                // body = ordinary chars, escapes, or [...] classes (which may hold a bare "/")
                return regexAllowed() &&
                    tokenizer(/^\/(?![*\/])(?:[^\\\/\[\r\n]|\\.|\[(?:[^\]\\\r\n]|\\.)*\])+\/[a-z]*/, "regex");
            },
            string : function(){
                // a backslash always consumes the next char, so "a\\" ends at its own quote
                return tokenizer(/^(?:"(?:[^"\\\r\n]|\\(?:\r\n|[\s\S]))*"|'(?:[^'\\\r\n]|\\(?:\r\n|[\s\S]))*')/, "string");
            },
            whiteSpace : function(){
                return tokenizer(/^\s+/, null);
            },
            operator : function(){
                // multi-character punctuators, longest alternative first
                return tokenizer(/^(?:>>>=|>>>|>>=|>>|>=|<<=|<<|<=|===|==|=>|!==|!=|\*\*=|\*\*|\*=|&&=|&&|&=|\|\|=|\|\||\|=|\?\?=|\?\?|\?\.(?!\d)|\.\.\.|\+\+|\+=|--|-=|\/=|%=|\^=)/, "operator");
            },
            singleCharOperator : function(){
                return tokenizer(/^./, "operator");
            }
        },
        // Matchers are tried in this order; the first one that matches wins.
        precedence = ["identifier", "number", "singleLineComment", "multiLineComment", "regex",
                      "string", "whiteSpace", "operator", "singleCharOperator"];

    while (i < source.length){
        matched = false;
        for (key = 0; key < precedence.length; key++){
            if (matchers[precedence[key]]()){
                matched = true;
                break;
            }
        }
        if (!matched){
            // whiteSpace and singleCharOperator together cover every character,
            // so this is unreachable; fail loudly instead of spinning forever.
            throw new Error("tokens: unable to tokenize input at offset " + i);
        }
    }
    return tks;
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment