@taufik-nurrohman, forked from pepasflo/lexer.js (created October 11, 2020)
A regex-based JavaScript lexer / scanner / tokenizer
#!/usr/bin/env node
var assert = require('assert');
// Each lexed token is an array of three integers:
// 1. the "token type": an index into the list of token patterns.
// 2. the index into the input string marking the start of this token.
// 3. the length of the token.
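// e.g. [1, 0, 3] is patterns[1] ("identifier"), starting at input[0], 3 characters long
// ("int" in the sample run at the bottom of this gist).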
// The list of "token types" which our lexer understands:
var patterns = [
    ["wspace", /^\s+/],
    ["identifier", /^[a-zA-Z_]\w*/],
    ["oparen", /^\(/],
    ["cparen", /^\)/],
    ["float", /^-?\d+\.\d+/],
    ["int", /^-?\d+/],
    ["obrace", /^\{/],
    ["cbrace", /^\}/],
    ["obracket", /^\[/],
    ["cbracket", /^\]/],
    // match two quotes containing any number of escaped-backslash, escaped quote, or any non-quote characters.
    ["string", /^"(\\\\|\\"|[^"])*"/],
    ["comment", /^\/\/.*/],
    ["other", /^[\s\S]/],
];
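// Note: order matters here. "float" must come before "int" (otherwise /^-?\d+/ would
// claim the "3" of "3.14", leaving ".14" to lex as "other" + "int"), and "other" must
// stay last, since /^[\s\S]/ matches any single character.
// A hypothetical sanity check (not part of the original script):
// assert.deepStrictEqual(tokenize("3.14", patterns), [[4, 0, 4]]); // one "float" token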
// finds the next token in the input string, starting at input[i], using the given token patterns.
// returns a token and an updated starting index, or causes an assertion failure if it was unable to lex a token.
function read_token(input, i, patterns) {
    // assert.ok(input.length > 0, "zero-length input");
    // assert.ok(i < input.length, "starting index past end of input");
    for (var j = 0; j < patterns.length; j++) {
        var regex = patterns[j][1];
        var result = input.slice(i).match(regex);
        if (result !== null) {
            var text = result[0];
            var token = [j, i, text.length];
            return [token, i + text.length];
        }
    }
    assert.fail("exhausted input while lexing token");
}
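// e.g. read_token("foo bar", 0, patterns) returns [[1, 0, 3], 3]:
// an "identifier" token covering "foo", plus the index at which to resume lexing.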
// takes an input string and a list of token patterns and tokenizes the string.
// returns a list of tokens.
function tokenize(input, patterns) {
    var tokens = [];
    for (var i = 0; i < input.length;) {
        var result = read_token(input, i, patterns);
        var token = result[0];
        i = result[1];
        tokens.push(token);
        // console.log("Debug: found token: ", token);
    }
    return tokens;
}
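// e.g. tokenize("f(1)", patterns) returns
// [[1, 0, 1], [2, 1, 1], [5, 2, 1], [3, 3, 1]]
// i.e. identifier "f", oparen "(", int "1", cparen ")".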
// Read from stdin
var fs = require('fs');
var input = fs.readFileSync(0).toString();
// Tokenize the input and track some benchmark stats
var then = Date.now();
var tokens = tokenize(input, patterns);
var now = Date.now();
var elapsed = now - then;
console.log("Lexed", tokens.length, "tokens in", elapsed, "ms (", Math.round(tokens.length / (elapsed / 1000)), "tokens/sec)");
// Print out the lexed tokens (if fewer than 100 tokens)
if (tokens.length < 100) {
    console.log("\nLexed tokens:");
    for (var i = 0; i < tokens.length; i++) {
        var token = tokens[i];
        var token_type = patterns[token[0]][0];
        var token_text = input.substr(token[1], token[2]);
        console.log(token, token_type, token_text);
    }
}
// Print out some token frequency stats (use this to optimize the order of the list of regex patterns)
// var stats = {};
// for (var i = 0; i < patterns.length; i++) {
//     stats[i] = 0;
// }
// for (var i = 0; i < tokens.length; i++) {
//     var token = tokens[i];
//     stats[token[0]] += 1;
// }
// var stats2 = {};
// for (var i = 0; i < patterns.length; i++) {
//     var label = patterns[i][0];
//     stats2[label] = stats[i];
// }
// console.log(stats2);
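Performance note (an addition, not from the original gist): read_token() calls input.slice(i) once per candidate pattern, copying the remaining input on every attempt. On large inputs, a variant built on sticky ("y" flag) regexes matches in place instead. A minimal sketch, assuming the patterns array defined above; read_token_sticky is a hypothetical name:

// derive sticky versions of the same patterns; the "y" flag anchors each match
// at lastIndex, so the leading "^" (which only matches at index 0) is dropped.
var sticky_patterns = patterns.map(function (p) {
    return [p[0], new RegExp(p[1].source.slice(1), 'y')];
});

function read_token_sticky(input, i, sticky_patterns) {
    for (var j = 0; j < sticky_patterns.length; j++) {
        var regex = sticky_patterns[j][1];
        regex.lastIndex = i;            // start matching at position i
        var result = regex.exec(input); // matches in place, no substring copy
        if (result !== null) {
            var text = result[0];
            return [[j, i, text.length], i + text.length];
        }
    }
    assert.fail("exhausted patterns while lexing token");
}

tokenize() works the same way with this variant, substituting read_token_sticky and sticky_patterns for read_token and patterns.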
# combine all C files in the linux kernel source as test input:
$ curl https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-4.19.tar.xz | unxz | tar x
$ cd linux-4.19
$ cat **/*.c > ../linux-4.19.c
# the result is a ~500k line C file:
$ wc -l linux-4.19.c
493065 linux-4.19.c
# benchmark the lexer:
$ cat linux-4.19.c | ./lexer.js
Lexed 3950697 tokens in 1785 ms ( 2213276 tokens/sec)
# sample input (input.txt):
int main(void) { exit(0); }
(list 1 2 "A () 123 \\\"")
// this is a comment
some plain text
$ cat input.txt | ./lexer.js
Lexed 35 tokens in 0 ms ( Infinity tokens/sec)
Lexed tokens:
[ 1, 0, 3 ] 'identifier' 'int'
[ 0, 3, 1 ] 'wspace' ' '
[ 1, 4, 4 ] 'identifier' 'main'
[ 2, 8, 1 ] 'oparen' '('
[ 1, 9, 4 ] 'identifier' 'void'
[ 3, 13, 1 ] 'cparen' ')'
[ 0, 14, 1 ] 'wspace' ' '
[ 6, 15, 1 ] 'obrace' '{'
[ 0, 16, 1 ] 'wspace' ' '
[ 1, 17, 4 ] 'identifier' 'exit'
[ 2, 21, 1 ] 'oparen' '('
[ 5, 22, 1 ] 'int' '0'
[ 3, 23, 1 ] 'cparen' ')'
[ 12, 24, 1 ] 'other' ';'
[ 0, 25, 1 ] 'wspace' ' '
[ 7, 26, 1 ] 'cbrace' '}'
[ 0, 27, 1 ] 'wspace' '\n'
[ 2, 28, 1 ] 'oparen' '('
[ 1, 29, 4 ] 'identifier' 'list'
[ 0, 33, 1 ] 'wspace' ' '
[ 5, 34, 1 ] 'int' '1'
[ 0, 35, 1 ] 'wspace' ' '
[ 5, 36, 1 ] 'int' '2'
[ 0, 37, 1 ] 'wspace' ' '
[ 10, 38, 15 ] 'string' '"A () 123 \\\\\\""'
[ 3, 53, 1 ] 'cparen' ')'
[ 0, 54, 1 ] 'wspace' '\n'
[ 11, 55, 20 ] 'comment' '// this is a comment'
[ 0, 75, 1 ] 'wspace' '\n'
[ 1, 76, 4 ] 'identifier' 'some'
[ 0, 80, 1 ] 'wspace' ' '
[ 1, 81, 5 ] 'identifier' 'plain'
[ 0, 86, 1 ] 'wspace' ' '
[ 1, 87, 4 ] 'identifier' 'text'
[ 0, 91, 1 ] 'wspace' '\n'