Skip to content

Instantly share code, notes, and snippets.

@BonsaiDen
Created February 9, 2012 19:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save BonsaiDen/1782410 to your computer and use it in GitHub Desktop.
Tokenizer
/**
* Copyright (c) 2012 Ivo Wetzel.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
/**
 * Tokenizer
 * - Splits `input` into a flat stream of typed tokens using the matcher
 *   data in `tokenData` (see generateTokenTree()).
 * - Afterwards nests the stream into a basic tree by pairing the
 *   {([ / ])} group tokens into `block.*` tokens holding a `values` array.
 *
 * @param {String} input - raw source text
 * @param {Object} tokenData - { list, map } produced by generateTokenTree()
 * @param {Object} whitespace - unused, kept for call compatibility
 * @returns {Array} the nested token stream ("a tree, actually")
 */
function tokenize(input, tokenData, whitespace) {
    var parsePos = 0,       // start offset of the token currently being built
        inputPos = 0,       // offset of the character being consumed
        searchPos = 0,      // offset used by the greedy regexp matcher
        inputLength = input.length,
        inputLineStart = 0, // offset of the current line start (column math)
        value = '',         // characters accumulated for the candidate token
        token = null,       // longest match found so far for `value`
        tokenStream = [],
        inputLine = 1;

    while(parsePos < inputLength) {
        var endOfInput = inputPos === inputLength,
            // FIX: guard against reading past the end of the input; the
            // original appended the string "undefined" to `value` here
            current = endOfInput ? '' : input[inputPos];

        value += current;

        // Match the next token
        // first we try the regexpressions since those are greedy
        var mToken = matchExp(input.substring(searchPos), tokenData.list, tokenData.map);
        if (mToken) {
            searchPos += mToken.value.length;

        // Then the simple tokens
        } else {
            searchPos++;
            mToken = matchToken(value, tokenData.list, tokenData.map);
        }

        if (token === null) {
            token = mToken;
        }

        // The previous match was the longest possible one, emit it
        if (mToken.value.length < token.value.length || endOfInput) {

            // Re-check the completed value against the plain string tokens;
            // a more specific simple token may override the current type
            var other = matchToken(token.value, tokenData.list, tokenData.map, true);
            if (other && other.type !== 'unknown' && other.type !== token.type) {
                token.type = other.type;
            }

            var inputColumn = parsePos - inputLineStart + 1;

            // Add unknown token and end parsing
            if (token.type === 'unknown') {
                var lastToken = tokenStream[tokenStream.length - 1];
                tokenStream.push({
                    // FIX: was misspelled as 'unkown', so no consumer
                    // (including the check above) could ever match it
                    type: 'unknown',
                    line: inputLine,
                    // FIX: guard against an empty stream; the original
                    // crashed when the very first token was unknown
                    start: lastToken ? lastToken.end : { line: inputLine, column: inputColumn },
                    end: -1
                });
                break;
            }

            // Add token to stream
            token.start = {
                line: inputLine,
                column: inputColumn
            };
            token.end = {
                line: inputLine,
                column: token.start.column + token.value.length
            };
            if (token.type !== 'whitespace' && token.type !== 'newline') {
                tokenStream.push(token);
            }

            // Advance parser
            parsePos += token.value.length;
            inputPos = parsePos;

            // Handle newlines inside multi line comments
            if (token.type === 'comment.multi') {
                var lines = token.value.split('\n');
                inputLine += lines.length - 1;
                token.end.line = inputLine;
                token.end.column = lines[lines.length - 1].length;
                inputLineStart = parsePos - token.end.column;
            }
            if (token.type === 'newline') {
                inputLine++;
                inputLineStart = parsePos;
            }

            value = '';
            token = null;
            continue;

        } else {
            token = mToken;
        }
        inputPos++;
    }

    // Assigns a nesting level (`lvl`) to every open/close token of the
    // given pair; `opens`/`closes` list ALL group types so the counter
    // is shared across {}, [] and ()
    function level(tokens, open, close, opens, closes) {
        var lvl = 0, openTokens = [];
        for(var i = 0, l = tokens.length; i < l; i++) {
            var token = tokens[i];
            if (opens.indexOf(token.type) !== -1) {
                lvl++;
            } else if (closes.indexOf(token.type) !== -1) {
                token.lvl = lvl;
                lvl--;
            }
            if (token.type === open) {
                openTokens.push(token);
                token.lvl = lvl;
            } else if (token.type === close) {
                openTokens.pop();
                if (lvl === -1) {
                    parseError('Unbalanced', token, 'missing opening token');
                }
            }
        }
        if (lvl > 0) {
            parseError('Unbalanced ', openTokens[0], 'missing closing token');
        }
    }

    // Converts each open/close pair into a single `type` token that
    // contains the enclosed tokens in its `values` array
    function group(tokens, open, close, type) {
        var count = tokens.length;
        for(var i = 0, l = count; i < l; i++) {
            var token = tokens[i];
            if (!token) {
                continue;
            }
            // Enter existing sub groups
            if (token.values) {
                group(token.values, open, close, type);
            }
            // Convert the token
            if (token.type === open) {
                token.type = type;
                token.values = [];
                for(var e = i + 1; e < count; e++) {
                    var otherToken = tokens[e];
                    // Remove token from stream
                    tokens.splice(e, 1);
                    e--;
                    i--;
                    count--;
                    // Check for end of group
                    // FIX: compare the `lvl` property assigned by level();
                    // the original read the never-set `level` property, so
                    // ANY close token ended the group regardless of nesting
                    if (otherToken.type === close
                        && otherToken.lvl === token.lvl) {

                        // Recurse through sub groups
                        group(token.values, open, close, type);
                        break;
                    }
                    token.values.push(otherToken);
                }
            }
        }
    }

    var opens = ['group.curly.open', 'group.brace.open', 'group.paren.open'],
        closes = ['group.curly.close', 'group.brace.close', 'group.paren.close'];

    // Leveling
    level(tokenStream, 'group.curly.open', 'group.curly.close', opens, closes);
    level(tokenStream, 'group.brace.open', 'group.brace.close', opens, closes);
    level(tokenStream, 'group.paren.open', 'group.paren.close', opens, closes);

    // Grouping
    group(tokenStream, 'group.curly.open', 'group.curly.close', 'block.curly');
    group(tokenStream, 'group.brace.open', 'group.brace.close', 'block.brace');
    group(tokenStream, 'group.paren.open', 'group.paren.close', 'block.paren');

    // Splits the values of `type` blocks at commas into sub lists
    // (currently unused, see the commented-out call below)
    function list(tokens, type) {
        var count = tokens.length;
        for(var i = 0, l = count; i < l; i++) {
            var token = tokens[i];
            if (!token) {
                continue;
            }
            // Enter existing sub groups
            if (token.values) {
                list(token.values, type);
            }
            if (token.type === type) {
                var subList = [], tokenList = [];
                for(var e = 0, el = token.values.length; e < el; e++) {
                    var subToken = token.values[e];
                    if (subToken.type === 'punct.comma') {
                        tokenList.push(subList);
                        subList = [];
                    } else {
                        subList.push(subToken);
                    }
                }
                if (subList.length > 0) {
                    tokenList.push(subList);
                }
                if (tokenList.length > 0) {
                    //console.log(tokenList);
                }
            }
        }
    }
    //list(tokenStream, 'block.paren');

    // Returns a tree, actually
    return tokenStream;
}
exports.getTokens = function(source) {
var tokenData = generateTokenTree(TOKEN_TREE);
return tokenize(source, tokenData);
};
// Helpers --------------------------------------------------------------------
// ----------------------------------------------------------------------------
/**
 * Reports a fatal parse error and terminates the process.
 *
 * @param {String} message - short description of the error class
 * @param {Object} token - offending token (must carry value and start info)
 * @param {String} expected - hint about what was expected instead
 */
function parseError(message, token, expected) {
    // FIX: report on stderr and exit non-zero so callers and shells can
    // detect the failure (the original used console.log and
    // process.exit(0), which signals success)
    console.error(message + ': "' + token.value + '" at line '
        + token.start.line + ' column ' + token.start.column + '; ' + expected);
    process.exit(1);
}
/**
 * Recursively flattens a nested token definition tree into `map`.
 *
 * Keys are '<kind><base>.<name>' where <kind> is:
 *   's' - plain string token
 *   'r' - regexp token
 *   'l' - "list" regexp token (wrapped in an array; matched greedily first)
 *
 * @param {Object} tokens - (sub-)tree of token definitions
 * @param {String} base - dotted path prefix of the current sub tree
 * @param {Object} map - accumulator, mutated in place
 */
function parseTokenTree(tokens, base, map) {
    for(var i in tokens) {
        if (typeof tokens[i] === 'string') {
            map['s' + base + '.' + i] = tokens[i];

        } else if (tokens[i] instanceof RegExp) {
            map['r' + base + '.' + i] = tokens[i];

        // FIX: Array.isArray() is the reliable array check; instanceof
        // fails for arrays created in another realm
        } else if (Array.isArray(tokens[i])) {
            map['l' + base + '.' + i] = tokens[i][0];

        } else {
            parseTokenTree(tokens[i], base + '.' + i, map);
        }
    }
}
/**
 * Builds matcher data from a token definition tree.
 *
 * Returns:
 *   list - all matchers (strings and regexps), sorted so longer matchers
 *          come first, plain regexps ('r') after the string tokens, and
 *          the greedy "list" regexps ('l') at the very end
 *   map  - matcher (stringified) -> its typed key from parseTokenTree()
 */
function generateTokenTree(tree) {
    var tokenMap = {},
        tokenList = [],
        typeMap = {},
        typeList = []; // NOTE(review): never used — candidate for removal

    parseTokenTree(tree, '', tokenMap);

    // Invert key -> matcher so a matcher can be looked up by its value
    for(var i in tokenMap) {
        tokenList.push(tokenMap[i]);
        typeMap[tokenMap[i]] = i;
    }

    tokenList.sort(function(a, b) {
        // Move the plain regexps ('r') behind the string tokens
        // (returning 1 sorts `a` after `b`); otherwise longer first
        if (typeMap[a][0] === 'r' && typeMap[b][0] !== 'r') {
            return 1;
        } else if (typeMap[b][0] === 'r' && typeMap[a][0] !== 'r') {
            return -1;
        } else {
            return b.toString().length - a.toString().length;
        }
    });
    tokenList.sort(function(a, b) {
        // Second pass: move the greedy "list" regexps ('l') to the end
        // (the original comment was a copy-paste of the one above)
        if (typeMap[a][0] === 'l' && typeMap[b][0] !== 'l') {
            return 1;
        } else if (typeMap[b][0] === 'l' && typeMap[a][0] !== 'l') {
            return -1;
        } else {
            return b.toString().length - a.toString().length;
        }
    });

    return {
        list: tokenList,
        map: typeMap
    };
}
/**
 * Tries the greedy "list" regexp matchers ('l' entries) against `string`.
 * Returns { type, value } for the first hit, or null when nothing matches.
 */
function matchExp(string, list, map) {
    var length = list.length;
    for(var index = 0; index < length; index++) {
        var matcher = list[index],
            key = map[matcher.toString()];

        // Only the 'l' entries are handled here
        if (key[0] !== 'l') {
            continue;
        }
        var result = string.match(matcher);
        if (result) {
            return {
                type: key.substr(2),
                value: result[0]
            };
        }
    }
    return null;
}
/**
 * Matches `string` against the simple string tokens and, unless `noexp`
 * is set, the plain regexp tokens ('r'). "List" matchers ('l') are
 * always skipped. Falls back to an 'unknown' token with an empty value.
 */
function matchToken(string, list, map, noexp) {
    var length = list.length;
    for(var index = 0; index < length; index++) {
        var matcher = list[index],
            key = map[matcher.toString()];

        if (key[0] === 'r') {
            // Regexp tokens are only consulted when `noexp` is not set
            if (!noexp) {
                var result = string.match(matcher);
                if (result) {
                    return {
                        type: key.substr(2),
                        value: result[0]
                    };
                }
            }

        } else if (key[0] !== 'l' && string === matcher) {
            return {
                type: key.substr(2),
                value: matcher
            };
        }
    }
    return {
        type: 'unknown',
        value: ''
    };
}
// Tokens ---------------------------------------------------------------------
// ----------------------------------------------------------------------------
// Token definition tree. Nested keys become dotted type names
// (e.g. operator.assign.math.add); leaf values are the literal text or
// regexp that the token matches. Arrays mark greedy "list" regexps.
var TOKEN_TREE = {
    whitespace: /^[\ \t]+$/,
    newline: /^(\n|\r)$/,
    comment: {
        line: /^\/\/.*$/,
        multi: [/^\/\*(.|[\r\n])*?\*\/$/m]
    },
    operator: {
        increment: '++',
        decrement: '--',
        math: {
            add: '+',
            subtract: '-',
            multiply: '*',
            divide: '/'
        },
        assign: {
            assign: '=',
            math: {
                add: '+=',
                // NOTE(review): key misspelled ('substract'); kept because
                // renaming would change the emitted type name
                substract: '-=',
                multiply: '*=',
                divide: '/='
            },
            bitwise: {
                and: '&=',
                or: '|=',
                negate: '~=',
                xor: '^=',
                shiftLeft: '<<=',
                shiftRight: '>>=',
                ushiftRight: '>>>='
            }
        },
        compare: {
            lt: '<',
            gt: '>',
            lte: '<=',
            gte: '>=',
            equal: '==',
            not: '!='
        },
        bitwise: {
            and: '&',
            or: '|',
            negate: '~',
            xor: '^',
            shiftLeft: '<<',
            shiftRight: '>>',
            ushiftRight: '>>>'
        },
        logic: {
            and: '&&',
            or: '||'
        },
        not: '!'
    },
    punct: {
        dot: '.',
        colon: ':',
        // NOTE(review): key misspelled ('tenary' for ternary); kept because
        // renaming would change the emitted type name
        tenary: '?',
        comma: ',',
        semicolon: ';'
    },
    group: {
        brace: {
            open: '[',
            close: ']'
        },
        curly: {
            open: '{',
            close: '}'
        },
        paren: {
            open: '(',
            close: ')'
        }
    },
    number: {
        integer: /^(0|[1-9]([0-9]+)?)$/,
        floating: /^(0|[1-9]([0-9]+)?)\.[0-9]*$/,
        hex: /^0x[0-9a-fA-F]+$/
    },
    string: {
        single: /^'([^'\\]|\\.)*'$/,
        'double': /^"([^"\\]|\\.)*"$/
    },
    identifier: /^[a-zA-Z\$_]([a-zA-Z0-9\$_]+)?$/,
    //member: /^@[a-zA-Z\$_]([a-zA-Z0-9\$_]+)?$/,
    keyword: {
        'var': 'var',
        'this': 'this',
        'void': 'void',
        'function': 'function',
        'boolean': {
            'true': 'true',
            'false': 'false'
        },
        // Reserved words that are recognized but not otherwise used
        taken: {
            'boolean': 'boolean',
            'public': 'public',
            'private': 'private',
            'protected': 'protected',
            'const': 'const',
            'static': 'static',
            'abstract': 'abstract',
            'interface': 'interface',
            'class': 'class',
            'extends': 'extends',
            'implements': 'implements'
        },
        statement: {
            conditional: {
                'if': 'if',
                'else': 'else',
                'switch': 'switch'
            },
            'switch': {
                // FIX: matched text was 'switch' (a copy-paste duplicate of
                // conditional.switch), so the 'case' keyword never matched
                'case': 'case',
                'default': 'default'
            },
            loop: {
                'while': 'while',
                'for': 'for',
                'do': 'do'
            },
            error: {
                'try': 'try',
                'catch': 'catch',
                'finally': 'finally',
                'raise': 'raise'
            },
            'continue': 'continue',
            'break': 'break',
            'return': 'return'
        },
        type: {
            number: 'Number',
            string: 'String',
            array: 'Array',
            object: 'Object',
            error: 'Error',
            date: 'Date',
            regexp: 'RegExp'
        }
    }
};
// Debug helper: walks a (nested) token stream and prints an indented
// outline of the block structure to the console
function ast(tokens, level) {
    level = level || 0;
    var pos = 0;

    // Peek `i` tokens ahead (default 1) without consuming
    function after(i) {
        if (i === undefined) {
            i = 1;
        }
        return tokens[pos + i];
    }

    // Consume and return the next token
    function next() {
        return tokens[pos++];
    }

    // Close the current level in the output
    function end() {
        level--;
        log('--');
        log();
    }

    // console.log with the current level's indentation prepended
    function log() {
        var args = Array.prototype.slice.call(arguments);
        args.unshift(new Array(level + 1).join(' '));
        console.log.apply(console, args);
    }

    var token = next();
    if (!token) {
        end();
        return;
    }

    do {
        if (token.type === 'block.paren') {
            log('() block', token.lvl);
            ast(token.values, token.lvl);

        } else if (token.type === 'block.brace') {
            log('[] block', token.lvl);
            ast(token.values, token.lvl);

        } else if (token.type === 'block.curly') {
            log('{} block', token.lvl);
            ast(token.values, token.lvl);

        } else if (token.type !== 'keyword') {
            // keywords are intentionally not printed
            log(token.type, level);
        }
        token = next();
    } while(token);
    end();
}
// test driver: tokenize ./test.js, dump the tree outline to the console
// and write the raw token stream to tokens.json
var fs = require('fs'),
    source = fs.readFileSync('test.js').toString().trimRight();

var tokens = exports.getTokens(source);
ast(tokens);

// FIX: dropped the unused require('util'); it was never referenced
var data = JSON.stringify(tokens);
fs.writeFileSync('tokens.json', data);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment