Skip to content

Instantly share code, notes, and snippets.

@BonsaiDen
Created April 29, 2013 19:43
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save BonsaiDen/5484165 to your computer and use it in GitHub Desktop.
Emblem Lexer
var rules = require('./rules');
// Emblem Lexer ---------------------------------------------------------------
// ----------------------------------------------------------------------------
// Lexer state container. Tokenization rules are compiled once per
// instance from the shared rule/macro tables in './rules'.
var Lexer = function() {
    this.line = 0;   // current line, 0-based
    this.col = 0;    // current column, 0-based
    this.offset = 0; // absolute character offset into this.source
    this.source = null;
    this.tokens = null;
    // One-token lookahead used by peek()/next(). Previously only
    // created inside parse(), so calling peek()/next() on a fresh
    // instance read an undeclared property.
    this.bufferedToken = null;
    this.rules = this.compileRules(rules.tokens, rules.macros);
};
exports.lex = function(source) {
return new Lexer().parse(source);
};
// Lexer Token ----------------------------------------------------------------
// A single lexical token. `id` names the token class (e.g. 'KEYWORD',
// 'STRING'); `value` holds the matched source text (rule handlers may
// rewrite it); `loc` is a {start, end} record of 1-based positions as
// produced by Lexer#getTokenLocation.
Lexer.Token = function(id, value, loc) {
this.id = id;
this.value = value;
this.start = loc.start;
this.end = loc.end;
};
// Shared helpers for Lexer.Token instances.
Lexer.Token.prototype = {

    // True when this token's class matches `id`.
    isType: function(id) {
        return id === this.id;
    },

    // True when this token's text matches `value`.
    isValue: function(value) {
        return value === this.value;
    },

    // Human-readable form used in error messages: "ID @ line:col".
    toString: function() {
        return [this.id, ' @ ', this.start.line, ':', this.start.col].join('');
    }

};
// Lexer Methods --------------------------------------------------------------
// ----------------------------------------------------------------------------
Lexer.prototype = {
// Public Interface -------------------------------------------------------
parse: function(source) {
this.line = 0;
this.col = 0;
this.offset = 0;
this.source = this.parseSource(source);
this.bufferedToken = null;
return this;
},
peek: function(id) {
if (!this.bufferedToken) {
this.bufferedToken = this.next();
}
return this.bufferedToken;
},
next: function() {
var token = null;
if (this.bufferedToken) {
token = this.bufferedToken;
this.bufferedToken = null;
} else {
token = this.getToken();
}
return token;
},
advance: function(id) {
var token = this.next();
if (token.id !== id) {
throw new Error('Lexer: Expected ' + id + ' but got: ' + token);
} else {
return token;
}
},
// Tokenization -----------------------------------------------------------
getToken: function() {
var token = null;
while((token = this.matchToken())) {
if (token.id !== 'WHITESPACE') {
break;
}
}
return token;
},
matchToken: function() {
var token = null,
text = '',
id = 'UNKNOWN',
subtext = this.source.substring(this.offset),
match = this.matchFirstRule(subtext),
loc = this.getTokenLocation(text);
if (match) {
var tokenName = match.rule;
text = match[0];
loc = this.getTokenLocation(text);
// Create token from available match
id = tokenName && typeof tokenName === 'string' ? tokenName : text;
token = new Lexer.Token(id, text, loc);
if (typeof tokenName === 'function') {
tokenName(token);
}
// Update location
this.offset = this.offset + match.index + text.length;
this.line = loc.end.line - 1;
this.col = loc.end.col - 1;
} else if (subtext.length) {
text = subtext.substring(0, 10) + '...';
token = new Lexer.Token(id, text, this.offset, loc);
}
return token;
},
getTokenLocation: function(text) {
var textLines = text.split(/\r\n|\r|\n/),
lineCount = textLines.length,
currentLine = textLines[textLines.length - 1],
endLine = this.line + lineCount,
endCol = lineCount > 1 ? currentLine.length
: this.col + currentLine.length;
return {
start: {
line: this.line + 1,
col: this.col + 1
},
end: {
line: endLine + 1,
col: endCol + 1
}
};
},
matchFirstRule: function(text) {
// Go through all the regular expressions and find the first one that
// matches from the start of the text
var match = null;
for(var i = 0, l = this.rules.length; i < l; i++) {
var rule = this.rules[i];
match = text.match(rule[0]);
if (match) {
match.rule = rule[1];
break;
}
}
return match;
},
// Initialization ---------------------------------------------------------
parseSource: function(source) {
// Remove shebang
if (source.substring(0, 2) === '#!') {
this.line = 1;
return source.substring(source.indexOf('\n') + 1);
} else {
return source;
}
},
compileRules: function(tokens, macros) {
return tokens.map(function(token) {
for(var i in macros) {
if (macros.hasOwnProperty(i)) {
var exp = new RegExp('{' + i + '}', 'g');
token[0] = token[0].replace(exp, macros[i]);
}
}
return [new RegExp('^' + token[0]), token[1]];
});
}
};
/*jshint evil: true */
// Reusable sub-patterns; '{name}' placeholders in exports.tokens are
// expanded with these before compilation (see Lexer#compileRules).
exports.macros = {
// single decimal digit
digit: '[0-9]',
// identifier: letter/underscore, then letters/digits/underscores
ident: '[a-zA-Z_]([a-zA-Z_0-9]+)?',
// a literal backslash (escaped once for JS, once for the regex)
esc: '\\\\',
// decimal integer, no leading zeros, optional minus
'int': '-?(?:[0-9]|[1-9][0-9]+)',
// hexadecimal literal, optional minus
hex: '-?(?:0x[0-9a-fA-F]+)',
// exponent part of a float, e.g. 'e+10'
exp: '(?:[eE][-+]?[0-9]+)',
// fractional part of a float, e.g. '.25'
frac: '(?:\\.[0-9]+)'
};
exports.tokens = [
// Whitespace
['[\\ \\t\\v\\n\\r]+', 'WHITESPACE'],
// Literals
['{int}{frac}{exp}?\\b', 'FLOAT'],
['{int}{exp}?\\b', 'INTEGER'],
['{hex}\\b', 'HEX'],
['true\\b', 'BOOL'],
['false\\b', 'BOOL'],
// Strings
["'(?:{esc}['bfnrt/{esc}]|{esc}u[a-fA-F0-9]{4}|[^'{esc}])*'", function(token) {
token.id = 'STRING';
token.text = eval(token.text);
}],
// Raw Strings
["`(?:{esc}[`bfnrt/{esc}]|{esc}u[a-fA-F0-9]{4}|[^`{esc}])*`", function(token) {
token.id = 'RAW_STRING';
token.text = token.text.substring(1, token.text.length - 1);
}],
// Doc Comments
['\\-\\-\\-[^]*?\\-\\-\\-', function(token) {
token.id = 'DOC_COMMENT';
token.text = token.text.substring(3, token.text.length - 3);
}],
// Line Comments
['\\-\\-[^\\-].*', function(token) {
token.id = 'LINE_COMMENT';
token.text = token.text.substring(2);
}],
// Types
['int\\b', 'TYPE'],
['float\\b', 'TYPE'],
['string\\b', 'TYPE'],
['bool\\b', 'TYPE'],
['list\\b', 'TYPE'],
['map\\b', 'TYPE'],
['struct\\b', 'TYPE'],
['void\\b', 'TYPE'],
// Type Modifiers
['mutable\\b', 'MODIFIER'],
['public\\b', 'MODIFIER'],
['abstract\\b', 'MODIFIER'],
['protected\\b', 'MODIFIER'],
['private\\b', 'MODIFIER'],
// Block Statements
['scope\\b', 'KEYWORD'],
['if\\b', 'KEYWORD'],
['elif\\b', 'KEYWORD'],
['else\\b', 'KEYWORD'],
['match\\b', 'KEYWORD'],
['case\\b', 'KEYWORD'],
// Loop Statements
['loop\\b', 'KEYWORD'],
['leave\\b', 'KEYWORD'],
['each\\b', 'KEYWORD'],
['in\\b', 'KEYWORD'],
// Class
['class\\b', 'KEYWORD'],
['extends\\b', 'KEYWORD'],
['interface\\b', 'KEYWORD'],
['implements\\b', 'KEYWORD'],
// Import / Export
['import\\b', 'KEYWORD'],
['from\\b', 'KEYWORD'],
['as\\b', 'KEYWORD'],
['export\\b', 'KEYWORD'],
// Identifiers
['[a-zA-Z_]([a-zA-Z_0-9]+)?\\b', 'IDENTIFIER'],
// Compound assignments
['\\&\\=', 'COMPOUND'],
['\\~\\=', 'COMPOUND'],
['\\^\\=', 'COMPOUND'],
['\\+\\=', 'COMPOUND'],
['\\-\\=', 'COMPOUND'],
['\\%\\=', 'COMPOUND'],
['\\/\\/\\=', 'COMPOUND'],
['\\/\\=', 'COMPOUND'],
['\\*\\*\\=', 'COMPOUND'],
['\\*\\=', 'COMPOUND'],
['\\|\\|\\=', 'COMPOUND'],
['\\&\\&\\=', 'COMPOUND'],
['\\<\\<\\=', 'COMPOUND'],
['\\>\\>\\=', 'COMPOUND'],
// Shift Operators
['\\>\\>', 'SHIFT'],
['\\<\\<', 'SHIFT'],
['\\>\\>\\>', 'SHIFT'],
// Relational Operators
['\\>', 'RELATION'],
['\\>\\=', 'RELATION'],
['\\<', 'RELATION'],
['\\<\\=', 'RELATION'],
// Compare Operators
['\\!\\=', 'RELATION'],
['\\=\\=', 'RELATION'],
// Logical Operators
['\\&\\&', 'LOGIC'],
['\\|\\|', 'LOGIC'],
// Infix Operators
['\\/\\/', 'INFIX'],
['\\/', 'INFIX'],
['\\*\\*', 'INFIX'],
['\\*', 'INFIX'],
['\\|', 'INFIX'],
['\\%', 'INFIX'],
['\\^', 'INFIX'],
['\\&', 'INFIX'],
// Unary Operators
['\\+'],
['\\-'],
['\\#', 'UNARY'],
['\\!', 'UNARY'],
['\\~', 'UNARY'],
// Basic punctiation
['\\.\\.\\.'],
['\\.\\.'],
['\\.'],
['\\,'],
['\\:'],
['\\;'],
['\\@'],
['\\?'],
['\\='],
// Parenthesis
['\\('],
['\\)'],
['\\{'],
['\\}'],
['\\['],
['\\]']
];
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment