thejefflarson/Javascript Tokenizer

## JavaScript Tokenizer
  function(script){
    // Splits a string of JavaScript source into a flat array of tokens.
    //
    // Each token is a plain object:
    //   type  - "identifier", "number", "singleLineComment", "multiLineComment",
    //           "regex", "string" or "operator"
    //   value - the exact source text of the token
    //   from  - index of the token's first character in the input
    //   to    - index just past the token's last character
    //   line  - 1-based line number on which the token starts
    //
    // Whitespace is consumed (and counted for line numbers) but produces no token.
    // A null or undefined input is treated as an empty string; any other non-string
    // input is converted with String().
    var tokens  = function(string){

      var source = string == null ? "" : String(string),
          rest   = source,      // the not-yet-consumed tail of source
          i      = 0,           // current position in source
          line   = 1,           // line number we're currently on
          tks    = [],          // collection of tokens
          // Every line terminator sequence; "\r\n" counts as a single line break.
          lineBreak = /\r\n|[\r\n\u2028\u2029]/g,
          // Lexical rules in precedence order: the first pattern that matches at the
          // current position wins.
          // NOTE: a "/" is always tried as the start of a regex literal before it can
          // fall through to the single character operator rule, so division is not
          // distinguished from regex literals here.
          rules = [
            { type: "identifier",
              pattern: /^[a-zA-Z$_](\w|\$)*/ },
            { type: "number",
              pattern: /^((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?))/i },
            // The line terminator is optional so a comment ending the input still matches;
            // when present it is kept as part of the comment's value.
            { type: "singleLineComment",
              pattern: /^\/\/.*(?:\r\n|[\r\n\u2028\u2029])?/ },
            // [\s\S] crosses line breaks (a plain "." does not); the lazy quantifier stops
            // at the first closing "*/" instead of the last one in the input.
            { type: "multiLineComment",
              pattern: /^\/\*[\s\S]*?\*\// },
            { type: "regex",
              pattern: /^\/(.*?)([^\\]|\\\\)\/[imgy]{0,4}/ },
            // A backslash always consumes the character after it, so escaped quotes and
            // escaped backslashes cannot be mistaken for the end of the string.
            { type: "string",
              pattern: /^(?:"(?:[^"\\\r\n]|\\(?:\r\n|[\s\S]))*"|'(?:[^'\\\r\n]|\\(?:\r\n|[\s\S]))*')/ },
            { type: "whiteSpace",
              pattern: /^\s+/,
              skip: true },
            { type: "operator",
              pattern: /^[=<>!+\-\*&][=<>&|]*/ },
            // Catch-all: any single remaining character becomes an operator token.
            { type: "operator",
              pattern: /^./ }
          ],
          // Tries one rule at the current position. On a match it records the token
          // (unless the rule is marked skip), advances the position and line counter
          // past the matched text, and returns true. Returns false otherwise.
          consume = function(rule){
            var found = rule.pattern.exec(rest),
                text,
                breaks;
            if (!found) {
              return false;
            }
            text = found[0];
            if (!rule.skip) {
              tks.push({
                type:  rule.type,
                value: text,
                from:  i,
                to:    i + text.length,
                line:  line   // captured before counting breaks: the line the token starts on
              });
            }
            // Only line breaks inside the matched text count, never ones further ahead.
            breaks = text.match(lineBreak);
            if (breaks) {
              line += breaks.length;
            }
            i += text.length;
            rest = source.slice(i);
            return true;
          },
          key = 0,
          matched = false
      ;

      while (i < source.length) {
        matched = false;
        for (key = 0; key < rules.length && !matched; key++) {
          matched = consume(rules[key]);
        }
        if (!matched) {
          // Defensive: every character is expected to be consumed by some rule. Fail
          // loudly instead of looping forever if the rule table is ever changed.
          throw new Error("tokens: unable to tokenize input at position " + i);
        }
      }
      return tks;
    };
	function(script){
	// Splits a string of JavaScript source into a flat array of tokens.
	//
	// Each token is a plain object:
	//   type  - "identifier", "number", "singleLineComment", "multiLineComment",
	//           "regex", "string" or "operator"
	//   value - the exact source text of the token
	//   from  - index of the token's first character in the input
	//   to    - index just past the token's last character
	//   line  - 1-based line number on which the token starts
	//
	// Whitespace is consumed (and counted for line numbers) but produces no token.
	// A null or undefined input is treated as an empty string; any other non-string
	// input is converted with String().
	var tokens = function(string){

		var source = string == null ? "" : String(string),
			rest = source, // the not-yet-consumed tail of source
			i = 0, // current position in source
			line = 1, // line number we're currently on
			tks = [], // collection of tokens
			// Every line terminator sequence; "\r\n" counts as a single line break.
			lineBreak = /\r\n|[\r\n\u2028\u2029]/g,
			// Lexical rules in precedence order: the first pattern that matches at the
			// current position wins. "|" inside these patterns is alternation and must
			// not be backslash-escaped.
			// NOTE: a "/" is always tried as the start of a regex literal before it can
			// fall through to the single character operator rule, so division is not
			// distinguished from regex literals here.
			rules = [
				{ type: "identifier",
					pattern: /^[a-zA-Z$_](\w|\$)*/ },
				{ type: "number",
					pattern: /^((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?))/i },
				// The line terminator is optional so a comment ending the input still matches;
				// when present it is kept as part of the comment's value.
				{ type: "singleLineComment",
					pattern: /^\/\/.*(?:\r\n|[\r\n\u2028\u2029])?/ },
				// [\s\S] crosses line breaks (a plain "." does not); the lazy quantifier stops
				// at the first closing "*/" instead of the last one in the input.
				{ type: "multiLineComment",
					pattern: /^\/\*[\s\S]*?\*\// },
				{ type: "regex",
					pattern: /^\/(.*?)([^\\]|\\\\)\/[imgy]{0,4}/ },
				// A backslash always consumes the character after it, so escaped quotes and
				// escaped backslashes cannot be mistaken for the end of the string.
				{ type: "string",
					pattern: /^(?:"(?:[^"\\\r\n]|\\(?:\r\n|[\s\S]))*"|'(?:[^'\\\r\n]|\\(?:\r\n|[\s\S]))*')/ },
				{ type: "whiteSpace",
					pattern: /^\s+/,
					skip: true },
				{ type: "operator",
					pattern: /^[=<>!+\-\*&][=<>&|]*/ },
				// Catch-all: any single remaining character becomes an operator token.
				{ type: "operator",
					pattern: /^./ }
			],
			// Tries one rule at the current position. On a match it records the token
			// (unless the rule is marked skip), advances the position and line counter
			// past the matched text, and returns true. Returns false otherwise.
			consume = function(rule){
				var found = rule.pattern.exec(rest),
					text,
					breaks;
				if (!found) {
					return false;
				}
				text = found[0];
				if (!rule.skip) {
					tks.push({
						type: rule.type,
						value: text,
						from: i,
						to: i + text.length,
						line: line // captured before counting breaks: the line the token starts on
					});
				}
				// Only line breaks inside the matched text count, never ones further ahead.
				breaks = text.match(lineBreak);
				if (breaks) {
					line += breaks.length;
				}
				i += text.length;
				rest = source.slice(i);
				return true;
			},
			key = 0,
			matched = false
		;

		while (i < source.length) {
			matched = false;
			for (key = 0; key < rules.length && !matched; key++) {
				matched = consume(rules[key]);
			}
			if (!matched) {
				// Defensive: every character is expected to be consumed by some rule. Fail
				// loudly instead of looping forever if the rule table is ever changed.
				throw new Error("tokens: unable to tokenize input at position " + i);
			}
		}
		return tks;
	};