Skip to content

Instantly share code, notes, and snippets.

@BonsaiDen
Created April 29, 2013 19:43
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save BonsaiDen/5484165 to your computer and use it in GitHub Desktop.
Emblem Lexer
var rules = require('./rules');
// Emblem Lexer ---------------------------------------------------------------
// ----------------------------------------------------------------------------
// Lexer state container. Tokenization rules are compiled once per
// instance from the shared rule/macro tables in './rules'.
var Lexer = function() {
    this.line = 0;   // current line, 0-based
    this.col = 0;    // current column, 0-based
    this.offset = 0; // absolute character offset into this.source
    this.source = null;
    this.tokens = null;
    // One-token lookahead used by peek()/next(). Previously only
    // created inside parse(), so calling peek()/next() on a fresh
    // instance read an undeclared property.
    this.bufferedToken = null;
    this.rules = this.compileRules(rules.tokens, rules.macros);
};
exports.lex = function(source) {
return new Lexer().parse(source);
};
// Lexer Token ----------------------------------------------------------------
// A single lexical token. `id` names the token class (e.g. 'KEYWORD',
// 'STRING'); `value` holds the matched source text (rule handlers may
// rewrite it); `loc` is a {start, end} record of 1-based positions as
// produced by Lexer#getTokenLocation.
Lexer.Token = function(id, value, loc) {
this.id = id;
this.value = value;
this.start = loc.start;
this.end = loc.end;
};
// Shared helpers for Lexer.Token instances.
Lexer.Token.prototype = {

    // True when this token's class matches `id`.
    isType: function(id) {
        return id === this.id;
    },

    // True when this token's text matches `value`.
    isValue: function(value) {
        return value === this.value;
    },

    // Human-readable form used in error messages: "ID @ line:col".
    toString: function() {
        return [this.id, ' @ ', this.start.line, ':', this.start.col].join('');
    }

};
// Lexer Methods --------------------------------------------------------------
// ----------------------------------------------------------------------------
Lexer.prototype = {
// Public Interface -------------------------------------------------------
parse: function(source) {
this.line = 0;
this.col = 0;
this.offset = 0;
this.source = this.parseSource(source);
this.bufferedToken = null;
return this;
},
peek: function(id) {
if (!this.bufferedToken) {
this.bufferedToken = this.next();
}
return this.bufferedToken;
},
next: function() {
var token = null;
if (this.bufferedToken) {
token = this.bufferedToken;
this.bufferedToken = null;
} else {
token = this.getToken();
}
return token;
},
advance: function(id) {
var token = this.next();
if (token.id !== id) {
throw new Error('Lexer: Expected ' + id + ' but got: ' + token);
} else {
return token;
}
},
// Tokenization -----------------------------------------------------------
getToken: function() {
var token = null;
while((token = this.matchToken())) {
if (token.id !== 'WHITESPACE') {
break;
}
}
return token;
},
matchToken: function() {
var token = null,
text = '',
id = 'UNKNOWN',
subtext = this.source.substring(this.offset),
match = this.matchFirstRule(subtext),
loc = this.getTokenLocation(text);
if (match) {
var tokenName = match.rule;
text = match[0];
loc = this.getTokenLocation(text);
// Create token from available match
id = tokenName && typeof tokenName === 'string' ? tokenName : text;
token = new Lexer.Token(id, text, loc);
if (typeof tokenName === 'function') {
tokenName(token);
}
// Update location
this.offset = this.offset + match.index + text.length;
this.line = loc.end.line - 1;
this.col = loc.end.col - 1;
} else if (subtext.length) {
text = subtext.substring(0, 10) + '...';
token = new Lexer.Token(id, text, this.offset, loc);
}
return token;
},
getTokenLocation: function(text) {
var textLines = text.split(/\r\n|\r|\n/),
lineCount = textLines.length,
currentLine = textLines[textLines.length - 1],
endLine = this.line + lineCount,
endCol = lineCount > 1 ? currentLine.length
: this.col + currentLine.length;
return {
start: {
line: this.line + 1,
col: this.col + 1
},
end: {
line: endLine + 1,
col: endCol + 1
}
};
},
matchFirstRule: function(text) {
// Go through all the regular expressions and find the first one that
// matches from the start of the text
var match = null;
for(var i = 0, l = this.rules.length; i < l; i++) {
var rule = this.rules[i];
match = text.match(rule[0]);
if (match) {
match.rule = rule[1];
break;
}
}
return match;
},
// Initialization ---------------------------------------------------------
parseSource: function(source) {
// Remove shebang
if (source.substring(0, 2) === '#!') {
this.line = 1;
return source.substring(source.indexOf('\n') + 1);
} else {
return source;
}
},
compileRules: function(tokens, macros) {
return tokens.map(function(token) {
for(var i in macros) {
if (macros.hasOwnProperty(i)) {
var exp = new RegExp('{' + i + '}', 'g');
token[0] = token[0].replace(exp, macros[i]);
}
}
return [new RegExp('^' + token[0]), token[1]];
});
}
};
/*jshint evil: true */
// Reusable sub-patterns; '{name}' placeholders in exports.tokens are
// expanded with these before compilation (see Lexer#compileRules).
exports.macros = {
// single decimal digit
digit: '[0-9]',
// identifier: letter/underscore, then letters/digits/underscores
ident: '[a-zA-Z_]([a-zA-Z_0-9]+)?',
// a literal backslash (escaped once for JS, once for the regex)
esc: '\\\\',
// decimal integer, no leading zeros, optional minus
'int': '-?(?:[0-9]|[1-9][0-9]+)',
// hexadecimal literal, optional minus
hex: '-?(?:0x[0-9a-fA-F]+)',
// exponent part of a float, e.g. 'e+10'
exp: '(?:[eE][-+]?[0-9]+)',
// fractional part of a float, e.g. '.25'
frac: '(?:\\.[0-9]+)'
};
exports.tokens = [
// Whitespace
['[\\ \\t\\v\\n\\r]+', 'WHITESPACE'],
// Literals
['{int}{frac}{exp}?\\b', 'FLOAT'],
['{int}{exp}?\\b', 'INTEGER'],
['{hex}\\b', 'HEX'],
['true\\b', 'BOOL'],
['false\\b', 'BOOL'],
// Strings
["'(?:{esc}['bfnrt/{esc}]|{esc}u[a-fA-F0-9]{4}|[^'{esc}])*'", function(token) {
token.id = 'STRING';
token.text = eval(token.text);
}],
// Raw Strings
["`(?:{esc}[`bfnrt/{esc}]|{esc}u[a-fA-F0-9]{4}|[^`{esc}])*`", function(token) {
token.id = 'RAW_STRING';
token.text = token.text.substring(1, token.text.length - 1);
}],
// Doc Comments
['\\-\\-\\-[^]*?\\-\\-\\-', function(token) {
token.id = 'DOC_COMMENT';
token.text = token.text.substring(3, token.text.length - 3);
}],
// Line Comments
['\\-\\-[^\\-].*', function(token) {
token.id = 'LINE_COMMENT';
token.text = token.text.substring(2);
}],
// Types
['int\\b', 'TYPE'],
['float\\b', 'TYPE'],
['string\\b', 'TYPE'],
['bool\\b', 'TYPE'],
['list\\b', 'TYPE'],
['map\\b', 'TYPE'],
['struct\\b', 'TYPE'],
['void\\b', 'TYPE'],
// Type Modifiers
['mutable\\b', 'MODIFIER'],
['public\\b', 'MODIFIER'],
['abstract\\b', 'MODIFIER'],
['protected\\b', 'MODIFIER'],
['private\\b', 'MODIFIER'],
// Block Statements
['scope\\b', 'KEYWORD'],
['if\\b', 'KEYWORD'],
['elif\\b', 'KEYWORD'],
['else\\b', 'KEYWORD'],
['match\\b', 'KEYWORD'],
['case\\b', 'KEYWORD'],
// Loop Statements
['loop\\b', 'KEYWORD'],
['leave\\b', 'KEYWORD'],
['each\\b', 'KEYWORD'],
['in\\b', 'KEYWORD'],
// Class
['class\\b', 'KEYWORD'],
['extends\\b', 'KEYWORD'],
['interface\\b', 'KEYWORD'],
['implements\\b', 'KEYWORD'],
// Import / Export
['import\\b', 'KEYWORD'],
['from\\b', 'KEYWORD'],
['as\\b', 'KEYWORD'],
['export\\b', 'KEYWORD'],
// Identifiers
['[a-zA-Z_]([a-zA-Z_0-9]+)?\\b', 'IDENTIFIER'],
// Compound assignments
['\\&\\=', 'COMPOUND'],
['\\~\\=', 'COMPOUND'],
['\\^\\=', 'COMPOUND'],
['\\+\\=', 'COMPOUND'],
['\\-\\=', 'COMPOUND'],
['\\%\\=', 'COMPOUND'],
['\\/\\/\\=', 'COMPOUND'],
['\\/\\=', 'COMPOUND'],
['\\*\\*\\=', 'COMPOUND'],
['\\*\\=', 'COMPOUND'],
['\\|\\|\\=', 'COMPOUND'],
['\\&\\&\\=', 'COMPOUND'],
['\\<\\<\\=', 'COMPOUND'],
['\\>\\>\\=', 'COMPOUND'],
// Shift Operators
['\\>\\>', 'SHIFT'],
['\\<\\<', 'SHIFT'],
['\\>\\>\\>', 'SHIFT'],
// Relational Operators
['\\>', 'RELATION'],
['\\>\\=', 'RELATION'],
['\\<', 'RELATION'],
['\\<\\=', 'RELATION'],
// Compare Operators
['\\!\\=', 'RELATION'],
['\\=\\=', 'RELATION'],
// Logical Operators
['\\&\\&', 'LOGIC'],
['\\|\\|', 'LOGIC'],
// Infix Operators
['\\/\\/', 'INFIX'],
['\\/', 'INFIX'],
['\\*\\*', 'INFIX'],
['\\*', 'INFIX'],
['\\|', 'INFIX'],
['\\%', 'INFIX'],
['\\^', 'INFIX'],
['\\&', 'INFIX'],
// Unary Operators
['\\+'],
['\\-'],
['\\#', 'UNARY'],
['\\!', 'UNARY'],
['\\~', 'UNARY'],
// Basic punctiation
['\\.\\.\\.'],
['\\.\\.'],
['\\.'],
['\\,'],
['\\:'],
['\\;'],
['\\@'],
['\\?'],
['\\='],
// Parenthesis
['\\('],
['\\)'],
['\\{'],
['\\}'],
['\\['],
['\\]']
];
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment