Created
February 9, 2012 19:51
-
-
Save BonsaiDen/1782410 to your computer and use it in GitHub Desktop.
Tokenizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Copyright (c) 2012 Ivo Wetzel. | |
* | |
* Permission is hereby granted, free of charge, to any person obtaining a copy | |
* of this software and associated documentation files (the "Software"), to deal | |
* in the Software without restriction, including without limitation the rights | |
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
* copies of the Software, and to permit persons to whom the Software is | |
* furnished to do so, subject to the following conditions: | |
* | |
* The above copyright notice and this permission notice shall be included in | |
* all copies or substantial portions of the Software. | |
* | |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
* THE SOFTWARE. | |
*/ | |
/** | |
* Tokenizer | |
* - Parses stuff into tokens | |
* - Creates basic tree layout with {([ / ])} tokens | |
*/ | |
/**
 * Tokenize the input string into a stream of tokens and group the
 * bracket tokens {} [] () into nested blocks.
 *
 * input      - source string to tokenize
 * tokenData  - { list, map } as produced by generateTokenTree()
 * whitespace - unused; kept for interface compatibility
 *
 * Returns the token stream (actually a tree, once grouping has run).
 */
function tokenize(input, tokenData, whitespace) {
    var parsePos = 0,        // offset where the token in progress started
        inputPos = 0,        // offset of the character being consumed
        searchPos = 0,       // offset used by the greedy regexp matcher
        inputLength = input.length,
        inputLineStart = 0,  // offset of the start of the current line
        value = '',          // characters accumulated for the token in progress
        token = null,        // best (longest) match found so far
        tokenStream = [],
        inputLine = 1;
    while(parsePos < inputLength) {
        var current = input[inputPos],
            endOfInput = inputPos === inputLength;
        value += current;
        // Match the next token
        // first we try the regexpressions since those are greedy
        var mToken = matchExp(input.substring(searchPos), tokenData.list, tokenData.map);
        if (mToken) {
            searchPos += mToken.value.length;
        // Then the simple tokens
        } else {
            searchPos++;
            mToken = matchToken(value, tokenData.list, tokenData.map);
        }
        if (token === null) {
            token = mToken;
        }
        // Once the new match gets shorter than the previous one (or the
        // input ends) the previous match was the longest; emit it.
        if (mToken.value.length < token.value.length || endOfInput) {
            // Re-check against the plain-string tokens only, so e.g. an
            // identifier-shaped keyword gets its keyword type.
            var other = matchToken(token.value, tokenData.list, tokenData.map, true);
            if (other && other.type !== 'unknown' && other.type !== token.type) {
                token.type = other.type;
            }
            var inputColumn = parsePos - inputLineStart + 1;
            // Add unknown token and end parsing
            if (token.type === 'unknown') {
                var lastToken = tokenStream[tokenStream.length - 1];
                tokenStream.push({
                    type: 'unknown', // fixed: was misspelled 'unkown'
                    line: inputLine,
                    // Guard: an unknown token at the very start of the input
                    // has no previous token to anchor against (previously
                    // crashed dereferencing lastToken.end).
                    start: lastToken ? lastToken.end : null,
                    end: -1
                });
                break;
            }
            // Add token to stream
            token.start = {
                line: inputLine,
                column: inputColumn
            };
            token.end = {
                line: inputLine,
                column: token.start.column + token.value.length
            };
            if (token.type !== 'whitespace' && token.type !== 'newline') {
                tokenStream.push(token);
            }
            // Advance parser
            parsePos += token.value.length;
            inputPos = parsePos;
            // Handle newlines embedded in multi line comments
            if (token.type === 'comment.multi') {
                var lines = token.value.split('\n');
                inputLine += lines.length - 1;
                token.end.line = inputLine;
                token.end.column = lines[lines.length - 1].length;
                inputLineStart = parsePos - token.end.column;
            }
            if (token.type === 'newline') {
                inputLine++;
                inputLineStart = parsePos;
            }
            value = '';
            token = null;
            continue;
        } else {
            token = mToken;
        }
        inputPos++;
    }
    // Assign nesting levels (token.lvl) to matching open/close pairs and
    // report unbalanced tokens.
    function level(tokens, open, close, opens, closes) {
        var lvl = 0, openTokens = [];
        for(var i = 0, l = tokens.length; i < l; i++) {
            var token = tokens[i];
            if (opens.indexOf(token.type) !== -1) {
                lvl++;
            } else if (closes.indexOf(token.type) !== -1) {
                token.lvl = lvl;
                lvl--;
            }
            if (token.type === open) {
                openTokens.push(token);
                token.lvl = lvl;
            } else if (token.type === close) {
                openTokens.pop();
                if (lvl === -1) {
                    parseError('Unbalanced', token, 'missing opening token');
                }
            }
        }
        if (lvl > 0) {
            parseError('Unbalanced ', openTokens[0], 'missing closing token');
        }
    }
    // Fold each open...close span into a single token of the given type
    // whose .values holds the enclosed tokens (recursively).
    function group(tokens, open, close, type) {
        var count = tokens.length;
        for(var i = 0, l = count; i < l; i++) {
            var token = tokens[i];
            if (!token) {
                continue;
            }
            // Enter existing sub groups
            if (token.values) {
                group(token.values, open, close, type);
            }
            // Convert the token
            if (token.type === open) {
                token.type = type;
                token.values = [];
                for(var e = i + 1; e < count; e++) {
                    var otherToken = tokens[e];
                    // Remove token from stream
                    tokens.splice(e, 1);
                    e--;
                    i--;
                    count--;
                    // Check for end of group: the close token at the SAME
                    // nesting level. Fixed: compared the never-assigned
                    // '.level' property (always undefined === undefined),
                    // which closed every group at the first close token.
                    if (otherToken.type === close
                        && otherToken.lvl === token.lvl) {
                        // Recurse through sub groups
                        group(token.values, open, close, type);
                        break;
                    }
                    token.values.push(otherToken);
                }
            }
        }
    }
    var opens = ['group.curly.open', 'group.brace.open', 'group.paren.open'],
        closes = ['group.curly.close', 'group.brace.close', 'group.paren.close'];
    // Leveling
    level(tokenStream, 'group.curly.open', 'group.curly.close', opens, closes);
    level(tokenStream, 'group.brace.open', 'group.brace.close', opens, closes);
    level(tokenStream, 'group.paren.open', 'group.paren.close', opens, closes);
    // Grouping
    group(tokenStream, 'group.curly.open', 'group.curly.close', 'block.curly');
    group(tokenStream, 'group.brace.open', 'group.brace.close', 'block.brace');
    group(tokenStream, 'group.paren.open', 'group.paren.close', 'block.paren');
    // Create lists: split a block's values on commas (currently unused,
    // see the commented-out call below).
    function list(tokens, type) {
        var count = tokens.length;
        for(var i = 0, l = count; i < l; i++) {
            var token = tokens[i];
            if (!token) {
                continue;
            }
            // Enter existing sub groups
            if (token.values) {
                list(token.values, type);
            }
            if (token.type === type) {
                var subList = [], tokenList = [];
                for(var e = 0, el = token.values.length; e < el; e++) {
                    var subToken = token.values[e];
                    if (subToken.type === 'punct.comma') {
                        tokenList.push(subList);
                        subList = [];
                    } else {
                        subList.push(subToken);
                    }
                }
                if (subList.length > 0) {
                    tokenList.push(subList);
                }
                if (tokenList.length > 0) {
                    //console.log(tokenList);
                }
            }
        }
    }
    //list(tokenStream, 'block.paren');
    // Returns a tree, actually
    return tokenStream;
}
exports.getTokens = function(source) { | |
var tokenData = generateTokenTree(TOKEN_TREE); | |
return tokenize(source, tokenData); | |
}; | |
// Helpers -------------------------------------------------------------------- | |
// ---------------------------------------------------------------------------- | |
// Report a fatal parse error and terminate the process.
// message  - short description of the problem
// token    - offending token (reads .value and .start line/column)
// expected - hint about what was expected instead
function parseError(message, token, expected) {
    // Errors belong on stderr, not stdout.
    console.error(message + ': "' + token.value + '" at line ' + token.start.line + ' column ' + token.start.column + '; ' + expected);
    // Exit non-zero so shells/callers can detect the failure
    // (previously exited with 0, signalling success on error).
    process.exit(1);
}
// Flatten a nested token definition tree into a single map. Keys encode
// the token kind plus its dotted type path:
//   's<path>' -> plain string token (exact match)
//   'r<path>' -> regular expression token
//   'l<path>' -> regexp wrapped in an array (matched greedily up front)
function parseTokenTree(tokens, base, map) {
    for(var key in tokens) {
        var entry = tokens[key],
            path = base + '.' + key;
        if (typeof entry === 'string') {
            map['s' + path] = entry;
        } else if (entry instanceof RegExp) {
            map['r' + path] = entry;
        } else if (entry instanceof Array) {
            map['l' + path] = entry[0];
        } else {
            // Plain object: descend into the sub tree
            parseTokenTree(entry, path, map);
        }
    }
}
// Build the matcher data from a nested token tree.
// Returns { list, map } where list holds every token value ordered for
// greedy matching and map translates a token value back to its type key.
function generateTokenTree(tree) {
    var flat = {},
        values = [],
        valueToType = {};
    parseTokenTree(tree, '', flat);
    for(var typeKey in flat) {
        values.push(flat[typeKey]);
        valueToType[flat[typeKey]] = typeKey;
    }
    // Longest tokens first so greedy matching prefers them; plain
    // regexp ('r') tokens are pushed towards the end of the list.
    values.sort(function(a, b) {
        var aIsExp = valueToType[a][0] === 'r',
            bIsExp = valueToType[b][0] === 'r';
        if (aIsExp && !bIsExp) {
            return 1;
        }
        if (bIsExp && !aIsExp) {
            return -1;
        }
        return b.toString().length - a.toString().length;
    });
    // Second pass: move the greedy list ('l') tokens to the very end.
    values.sort(function(a, b) {
        var aIsList = valueToType[a][0] === 'l',
            bIsList = valueToType[b][0] === 'l';
        if (aIsList && !bIsList) {
            return 1;
        }
        if (bIsList && !aIsList) {
            return -1;
        }
        return b.toString().length - a.toString().length;
    });
    return {
        list: values,
        map: valueToType
    };
}
// Try each greedy ('l') regexp token against the start of the string.
// Returns { type, value } for the first match, or null when none match.
function matchExp(string, list, map) {
    var count = list.length;
    for(var index = 0; index < count; index++) {
        var candidate = list[index],
            typeKey = map[candidate.toString()];
        if (typeKey[0] !== 'l') {
            continue;
        }
        var result = string.match(candidate);
        if (result) {
            return {
                type: typeKey.substr(2),
                value: result[0]
            };
        }
    }
    return null;
}
// Match the accumulated value against the token definitions.
// Regexp ('r') tokens are tried with string.match() unless noexp is set;
// plain string tokens require an exact match; greedy list ('l') tokens
// are handled by matchExp() and never match here.
// Falls back to an "unknown" token when nothing matches.
function matchToken(string, list, map, noexp) {
    var count = list.length;
    for(var index = 0; index < count; index++) {
        var candidate = list[index],
            typeKey = map[candidate.toString()];
        if (typeKey[0] === 'r' && !noexp) {
            var result = string.match(candidate);
            if (result) {
                return {
                    type: typeKey.substr(2),
                    value: result[0]
                };
            }
        } else if (typeKey[0] !== 'l' && string === candidate) {
            return {
                type: typeKey.substr(2),
                value: candidate
            };
        }
    }
    return {
        type: 'unknown',
        value: ''
    };
}
// Tokens --------------------------------------------------------------------- | |
// ---------------------------------------------------------------------------- | |
// Token definitions. parseTokenTree() flattens this nested structure into
// dotted type paths ('operator.math.add', ...). Strings match exactly,
// regexps match the accumulated value, and [regexp] entries are matched
// greedily up front by matchExp().
var TOKEN_TREE = {
    whitespace: /^[\ \t]+$/,
    newline: /^(\n|\r)$/,
    comment: {
        line: /^\/\/.*$/,
        // Wrapped in an array so multi line comments are matched greedily
        multi: [/^\/\*(.|[\r\n])*?\*\/$/m]
    },
    operator: {
        increment: '++',
        decrement: '--',
        math: {
            add: '+',
            subtract: '-',
            multiply: '*',
            divide: '/'
        },
        assign: {
            assign: '=',
            math: {
                add: '+=',
                // NOTE(review): key misspelled ('substract'); kept as-is
                // because changing it would rename the emitted token type.
                substract: '-=',
                multiply: '*=',
                divide: '/='
            },
            bitwise: {
                and: '&=',
                or: '|=',
                negate: '~=',
                xor: '^=',
                shiftLeft: '<<=',
                shiftRight: '>>=',
                ushiftRight: '>>>='
            }
        },
        compare: {
            lt: '<',
            gt: '>',
            lte: '<=',
            gte: '>=',
            equal: '==',
            not: '!='
        },
        bitwise: {
            and: '&',
            or: '|',
            negate: '~',
            xor: '^',
            shiftLeft: '<<',
            shiftRight: '>>',
            ushiftRight: '>>>'
        },
        logic: {
            and: '&&',
            or: '||'
        },
        not: '!'
    },
    punct: {
        dot: '.',
        colon: ':',
        // NOTE(review): key misspelled ('tenary' for ternary); kept as-is
        // to avoid renaming the emitted token type.
        tenary: '?',
        comma: ',',
        semicolon: ';'
    },
    group: {
        brace: {
            open: '[',
            close: ']'
        },
        curly: {
            open: '{',
            close: '}'
        },
        paren: {
            open: '(',
            close: ')'
        }
    },
    number: {
        integer: /^(0|[1-9]([0-9]+)?)$/,
        floating: /^(0|[1-9]([0-9]+)?)\.[0-9]*$/,
        hex: /^0x[0-9a-fA-F]+$/
    },
    string: {
        single: /^'([^'\\]|\\.)*'$/,
        'double': /^"([^"\\]|\\.)*"$/
    },
    identifier: /^[a-zA-Z\$_]([a-zA-Z0-9\$_]+)?$/,
    //member: /^@[a-zA-Z\$_]([a-zA-Z0-9\$_]+)?$/,
    keyword: {
        'var': 'var',
        'this': 'this',
        'void': 'void',
        'function': 'function',
        'boolean': {
            'true': 'true',
            'false': 'false'
        },
        taken: {
            'boolean': 'boolean',
            'public': 'public',
            'private': 'private',
            'protected': 'protected',
            'const': 'const',
            'static': 'static',
            'abstract': 'abstract',
            'interface': 'interface',
            'class': 'class',
            'extends': 'extends',
            'implements': 'implements'
        },
        statement: {
            conditional: {
                'if': 'if',
                'else': 'else',
                'switch': 'switch'
            },
            'switch': {
                // Fixed: was 'case': 'switch', which duplicated the
                // 'switch' token value — the value->type map is keyed by
                // token value, so 'case' could never be tokenized and
                // 'switch' got the wrong type.
                'case': 'case',
                'default': 'default'
            },
            loop: {
                'while': 'while',
                'for': 'for',
                'do': 'do'
            },
            error: {
                'try': 'try',
                'catch': 'catch',
                'finally': 'finally',
                'raise': 'raise'
            },
            'continue': 'continue',
            'break': 'break',
            'return': 'return'
        },
        type: {
            number: 'Number',
            string: 'String',
            array: 'Array',
            object: 'Object',
            error: 'Error',
            date: 'Date',
            regexp: 'RegExp'
        }
    }
};
// Debug helper: walks a token stream/tree and prints an indented outline
// of the block structure to the console.
// tokens - token array as produced by tokenize()
// level  - current indentation depth (defaults to 0)
// Removed: unused dead helper after() and its lookahead logic.
function ast(tokens, level) {
    level = level || 0;
    var pos = 0;
    // Advance to the next token in the current list.
    function next() {
        return tokens[pos++];
    }
    // Close the current nesting level in the output.
    function end() {
        level--;
        log('--');
        log();
    }
    // console.log with indentation matching the current level.
    function log() {
        var data = Array.prototype.slice.call(arguments);
        data.unshift(new Array(level + 1).join(' '));
        console.log.apply(console, data);
    }
    var token = next();
    if (!token) {
        end();
        return;
    }
    do {
        switch(token.type) {
            case 'keyword':
                break; // keywords are not printed
            case 'block.paren':
                log('() block', token.lvl);
                ast(token.values, token.lvl);
                break;
            case 'block.brace':
                log('[] block', token.lvl);
                ast(token.values, token.lvl);
                break;
            case 'block.curly':
                log('{} block', token.lvl);
                ast(token.values, token.lvl);
                break;
            default:
                log(token.type, level);
                break;
        }
        token = next();
    } while(token);
    end();
}
// test
// Ad-hoc smoke test: tokenize 'test.js' from the working directory,
// print its block structure, and dump the raw token stream to 'tokens.json'.
var fs = require('fs'),
    source = fs.readFileSync('test.js').toString().trimRight();
var tokens = exports.getTokens(source);
ast(tokens);
// NOTE(review): 'util' is required but never used in the visible code —
// presumably a debugging leftover; confirm before removing.
var util = require('util'),
    data = JSON.stringify(tokens);
fs.writeFileSync('tokens.json', data);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment