Created
April 29, 2013 19:43
-
-
Save BonsaiDen/5484165 to your computer and use it in GitHub Desktop.
Emblem Lexer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var rules = require('./rules'); | |
// Emblem Lexer --------------------------------------------------------------- | |
// ---------------------------------------------------------------------------- | |
// Emblem source tokenizer. Positional state is 0-based internally; reported
// token locations are 1-based (see getTokenLocation).
var Lexer = function() {
    this.line = 0;             // current 0-based line
    this.col = 0;              // current 0-based column
    this.offset = 0;           // absolute character offset into this.source
    this.source = null;        // set by parse()
    this.tokens = null;        // NOTE(review): never read in this file — possibly vestigial
    this.bufferedToken = null; // one-token lookahead used by peek()/next()
                               // (declared here so all instance state lives
                               // in the constructor; parse() resets it)
    this.rules = this.compileRules(rules.tokens, rules.macros);
};
exports.lex = function(source) { | |
return new Lexer().parse(source); | |
}; | |
// Lexer Token ---------------------------------------------------------------- | |
// A single lexical token.
//
//   id    - token type (e.g. 'STRING', 'KEYWORD'), or the matched text
//           itself for rules declared without a name
//   value - the raw matched source text
//   loc   - { start: {line, col}, end: {line, col} }, 1-based positions
Lexer.Token = function(id, value, loc) {
    this.id = id;
    this.value = value;
    this.start = loc.start;
    this.end = loc.end;
};
Lexer.Token.prototype = {

    // Does this token have the given type id?
    isType: function(id) {
        return id === this.id;
    },

    // Does this token carry exactly the given value?
    isValue: function(value) {
        return value === this.value;
    },

    // Human-readable "ID @ line:col" form, used in lexer error messages.
    toString: function() {
        var position = this.start.line + ':' + this.start.col;
        return this.id + ' @ ' + position;
    }

};
// Lexer Methods -------------------------------------------------------------- | |
// ---------------------------------------------------------------------------- | |
Lexer.prototype = {

    // Public Interface -------------------------------------------------------

    // Resets all positional state and prepares `source` for tokenization.
    // Returns `this` so callers can chain, e.g. lex(source).next().
    parse: function(source) {
        this.line = 0;
        this.col = 0;
        this.offset = 0;
        // parseSource() may bump this.line when stripping a shebang, so it
        // must run after the counters are reset.
        this.source = this.parseSource(source);
        this.bufferedToken = null;
        return this;
    },

    // Returns the upcoming token without consuming it.
    // (The `id` parameter is unused; kept for interface compatibility.)
    peek: function(id) {
        if (!this.bufferedToken) {
            this.bufferedToken = this.next();
        }
        return this.bufferedToken;
    },

    // Consumes and returns the next significant token, or null at EOF.
    next: function() {
        var token = null;
        if (this.bufferedToken) {
            token = this.bufferedToken;
            this.bufferedToken = null;

        } else {
            token = this.getToken();
        }
        return token;
    },

    // Consumes the next token, asserting that its id matches `id`;
    // throws an Error describing the mismatch otherwise.
    advance: function(id) {
        var token = this.next();
        if (token.id !== id) {
            throw new Error('Lexer: Expected ' + id + ' but got: ' + token);

        } else {
            return token;
        }
    },

    // Tokenization -----------------------------------------------------------

    // Skips WHITESPACE tokens and returns the first significant one
    // (or null once the source is exhausted).
    getToken: function() {
        var token = null;
        while((token = this.matchToken())) {
            if (token.id !== 'WHITESPACE') {
                break;
            }
        }
        return token;
    },

    // Matches a single token at the current offset. Returns null at EOF, or
    // an UNKNOWN token when no rule matches the remaining text.
    matchToken: function() {

        var token = null,
            text = '',
            id = 'UNKNOWN',
            subtext = this.source.substring(this.offset),
            match = this.matchFirstRule(subtext),
            loc = this.getTokenLocation(text);

        if (match) {

            var tokenName = match.rule;
            text = match[0];
            loc = this.getTokenLocation(text);

            // Create token from available match. A string rule name becomes
            // the token id; unnamed rules use the matched text itself.
            id = tokenName && typeof tokenName === 'string' ? tokenName : text;
            token = new Lexer.Token(id, text, loc);

            // Function rules post-process the freshly created token
            // (e.g. stripping quotes or comment markers).
            if (typeof tokenName === 'function') {
                tokenName(token);
            }

            // Update location. All compiled rules are anchored with '^', so
            // match.index is always 0; it is kept for safety.
            this.offset = this.offset + match.index + text.length;
            this.line = loc.end.line - 1;
            this.col = loc.end.col - 1;

        } else if (subtext.length) {
            // No rule matched: emit an UNKNOWN token carrying a short
            // excerpt of the offending text.
            text = subtext.substring(0, 10) + '...';

            // Fixed: recompute the location for the excerpt, and call the
            // Token constructor with its actual (id, value, loc) signature —
            // the previous stray `this.offset` argument shifted `loc` out of
            // position, leaving the token without start/end.
            loc = this.getTokenLocation(text);
            token = new Lexer.Token(id, text, loc);

            // NOTE(review): the offset is not advanced here, so repeated
            // next() calls keep returning this UNKNOWN token — presumably
            // callers treat UNKNOWN as fatal; confirm before relying on it.
        }

        return token;
    },

    // Computes the 1-based start/end location of a token with the given
    // text, relative to the lexer's current 0-based line/col counters.
    getTokenLocation: function(text) {

        var textLines = text.split(/\r\n|\r|\n/),
            lineCount = textLines.length,
            currentLine = textLines[textLines.length - 1],

            // Fixed off-by-one: split() yields (newlines + 1) segments, so a
            // token containing N newlines ends N lines below its start — a
            // single-line token must end on the line it started on.
            endLine = this.line + lineCount - 1,

            // A multi-line token ends at the column of its last line;
            // otherwise the column simply advances by the text length.
            endCol = lineCount > 1 ? currentLine.length
                                   : this.col + currentLine.length;

        return {
            start: {
                line: this.line + 1,
                col: this.col + 1
            },
            end: {
                line: endLine + 1,
                col: endCol + 1
            }
        };

    },

    matchFirstRule: function(text) {
        // Go through all the regular expressions and find the first one that
        // matches from the start of the text. Rule order therefore matters:
        // longer operators must be listed before their prefixes.
        var match = null;
        for(var i = 0, l = this.rules.length; i < l; i++) {

            var rule = this.rules[i];
            match = text.match(rule[0]);

            if (match) {
                match.rule = rule[1];
                break;
            }

        }
        return match;
    },

    // Initialization ---------------------------------------------------------

    // Strips a leading "#!..." shebang line, if present, and accounts for it
    // in the line counter so reported locations match the original file.
    parseSource: function(source) {
        if (source.substring(0, 2) === '#!') {
            this.line = 1;
            return source.substring(source.indexOf('\n') + 1);

        } else {
            return source;
        }
    },

    // Expands {macro} placeholders in every token pattern and compiles it
    // into a start-anchored RegExp.
    compileRules: function(tokens, macros) {
        return tokens.map(function(token) {

            // Fixed: expand into a local copy instead of writing back into
            // token[0], which mutated the shared `rules` module data for
            // every subsequent Lexer instance.
            var pattern = token[0];
            for(var i in macros) {
                if (macros.hasOwnProperty(i)) {
                    var exp = new RegExp('{' + i + '}', 'g');
                    pattern = pattern.replace(exp, macros[i]);
                }
            }

            return [new RegExp('^' + pattern), token[1]];

        });
    }

};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*jshint evil: true */

// Named sub-patterns that token rules reference as `{name}`; the lexer's
// compileRules() expands them textually before compiling each pattern.
exports.macros = {
    digit: '[0-9]',                     // single decimal digit
    ident: '[a-zA-Z_]([a-zA-Z_0-9]+)?', // identifier: letter/underscore start
    esc: '\\\\',                        // a literal backslash (escape prefix)
    'int': '-?(?:[0-9]|[1-9][0-9]+)',   // decimal integer, no leading zeros
    hex: '-?(?:0x[0-9a-fA-F]+)',        // 0x-prefixed hexadecimal integer
    exp: '(?:[eE][-+]?[0-9]+)',         // scientific-notation exponent part
    frac: '(?:\\.[0-9]+)'               // fractional part of a float
};
exports.tokens = [ | |
// Whitespace | |
['[\\ \\t\\v\\n\\r]+', 'WHITESPACE'], | |
// Literals | |
['{int}{frac}{exp}?\\b', 'FLOAT'], | |
['{int}{exp}?\\b', 'INTEGER'], | |
['{hex}\\b', 'HEX'], | |
['true\\b', 'BOOL'], | |
['false\\b', 'BOOL'], | |
// Strings | |
["'(?:{esc}['bfnrt/{esc}]|{esc}u[a-fA-F0-9]{4}|[^'{esc}])*'", function(token) { | |
token.id = 'STRING'; | |
token.text = eval(token.text); | |
}], | |
// Raw Strings | |
["`(?:{esc}[`bfnrt/{esc}]|{esc}u[a-fA-F0-9]{4}|[^`{esc}])*`", function(token) { | |
token.id = 'RAW_STRING'; | |
token.text = token.text.substring(1, token.text.length - 1); | |
}], | |
// Doc Comments | |
['\\-\\-\\-[^]*?\\-\\-\\-', function(token) { | |
token.id = 'DOC_COMMENT'; | |
token.text = token.text.substring(3, token.text.length - 3); | |
}], | |
// Line Comments | |
['\\-\\-[^\\-].*', function(token) { | |
token.id = 'LINE_COMMENT'; | |
token.text = token.text.substring(2); | |
}], | |
// Types | |
['int\\b', 'TYPE'], | |
['float\\b', 'TYPE'], | |
['string\\b', 'TYPE'], | |
['bool\\b', 'TYPE'], | |
['list\\b', 'TYPE'], | |
['map\\b', 'TYPE'], | |
['struct\\b', 'TYPE'], | |
['void\\b', 'TYPE'], | |
// Type Modifiers | |
['mutable\\b', 'MODIFIER'], | |
['public\\b', 'MODIFIER'], | |
['abstract\\b', 'MODIFIER'], | |
['protected\\b', 'MODIFIER'], | |
['private\\b', 'MODIFIER'], | |
// Block Statements | |
['scope\\b', 'KEYWORD'], | |
['if\\b', 'KEYWORD'], | |
['elif\\b', 'KEYWORD'], | |
['else\\b', 'KEYWORD'], | |
['match\\b', 'KEYWORD'], | |
['case\\b', 'KEYWORD'], | |
// Loop Statements | |
['loop\\b', 'KEYWORD'], | |
['leave\\b', 'KEYWORD'], | |
['each\\b', 'KEYWORD'], | |
['in\\b', 'KEYWORD'], | |
// Class | |
['class\\b', 'KEYWORD'], | |
['extends\\b', 'KEYWORD'], | |
['interface\\b', 'KEYWORD'], | |
['implements\\b', 'KEYWORD'], | |
// Import / Export | |
['import\\b', 'KEYWORD'], | |
['from\\b', 'KEYWORD'], | |
['as\\b', 'KEYWORD'], | |
['export\\b', 'KEYWORD'], | |
// Identifiers | |
['[a-zA-Z_]([a-zA-Z_0-9]+)?\\b', 'IDENTIFIER'], | |
// Compound assignments | |
['\\&\\=', 'COMPOUND'], | |
['\\~\\=', 'COMPOUND'], | |
['\\^\\=', 'COMPOUND'], | |
['\\+\\=', 'COMPOUND'], | |
['\\-\\=', 'COMPOUND'], | |
['\\%\\=', 'COMPOUND'], | |
['\\/\\/\\=', 'COMPOUND'], | |
['\\/\\=', 'COMPOUND'], | |
['\\*\\*\\=', 'COMPOUND'], | |
['\\*\\=', 'COMPOUND'], | |
['\\|\\|\\=', 'COMPOUND'], | |
['\\&\\&\\=', 'COMPOUND'], | |
['\\<\\<\\=', 'COMPOUND'], | |
['\\>\\>\\=', 'COMPOUND'], | |
// Shift Operators | |
['\\>\\>', 'SHIFT'], | |
['\\<\\<', 'SHIFT'], | |
['\\>\\>\\>', 'SHIFT'], | |
// Relational Operators | |
['\\>', 'RELATION'], | |
['\\>\\=', 'RELATION'], | |
['\\<', 'RELATION'], | |
['\\<\\=', 'RELATION'], | |
// Compare Operators | |
['\\!\\=', 'RELATION'], | |
['\\=\\=', 'RELATION'], | |
// Logical Operators | |
['\\&\\&', 'LOGIC'], | |
['\\|\\|', 'LOGIC'], | |
// Infix Operators | |
['\\/\\/', 'INFIX'], | |
['\\/', 'INFIX'], | |
['\\*\\*', 'INFIX'], | |
['\\*', 'INFIX'], | |
['\\|', 'INFIX'], | |
['\\%', 'INFIX'], | |
['\\^', 'INFIX'], | |
['\\&', 'INFIX'], | |
// Unary Operators | |
['\\+'], | |
['\\-'], | |
['\\#', 'UNARY'], | |
['\\!', 'UNARY'], | |
['\\~', 'UNARY'], | |
// Basic punctiation | |
['\\.\\.\\.'], | |
['\\.\\.'], | |
['\\.'], | |
['\\,'], | |
['\\:'], | |
['\\;'], | |
['\\@'], | |
['\\?'], | |
['\\='], | |
// Parenthesis | |
['\\('], | |
['\\)'], | |
['\\{'], | |
['\\}'], | |
['\\['], | |
['\\]'] | |
]; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment