#!/usr/bin/env node

// Usage: cat input.txt | ./lex.js grammar.txt

// A "token" is an array of three integers:
// 1. The "token type": an index into the list of token patterns.
// 2. The index into the input string marking the start of this token.
// 3. The length of the token.
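// For example, [2, 7, 3] is a token of type 2 (the third pattern in the
// grammar) that covers the 3 input characters starting at index 7.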

'use strict';
var assert = require('assert');
var fs = require('fs');
|
// Load the lexer grammar from the given filename.
// A grammar file should be pairs of lines: token name followed by regex pattern.
function load_lex_grammar(filename) {
    var patterns = [];
    var contents = fs.readFileSync(filename, 'utf-8');
    var lines = contents.split(/\r?\n/);
    // A trailing newline in the grammar file leaves an empty final entry;
    // drop it so it is not mistaken for a token name.
    if (lines[lines.length - 1] === '') {
        lines.pop();
    }
    for (var i = 0; i + 1 < lines.length; i += 2) {
        var token_name = lines[i];
        // Anchor the pattern so it can only match at the start of the string
        // it is applied to; the 'g' flag would add nothing to an anchored match.
        var regex = new RegExp('^' + lines[i + 1]);
        patterns.push([token_name, regex]);
    }
    return patterns;
}
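
// As an illustration, a hypothetical grammar file for numbers, words, and
// whitespace (not shipped with this script) would read:
//
//     number
//     [0-9]+
//     word
//     [a-zA-Z]+
//     whitespace
//     [ \t\r\n]+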

// Finds the next token in the input string, starting at input[i], using the given token patterns.
// Patterns are tried in grammar order, so earlier patterns take priority.
// Returns a token, or throws an assertion failure if no pattern matches.
function read_token(patterns, input, i) {
    for (var j = 0; j < patterns.length; j++) {
        var regex = patterns[j][1];
        var result = input.slice(i).match(regex);
        if (result !== null) {
            var text = result[0];
            // A zero-length match would keep the tokenizer from ever advancing,
            // so treat it as an error in the grammar.
            assert(text.length > 0, 'pattern "' + patterns[j][0] + '" matched the empty string');
            return [j, i, text.length];
        }
    }
    assert.fail('no token pattern matched input at index ' + i);
}
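
// For example, with the illustrative grammar above, read_token(patterns, "foo 42", 4)
// tries "number" first, matches "42", and returns [0, 4, 2].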

// Takes an input string and a list of token patterns and tokenizes the string.
// Returns a list of tokens.
function tokenize(patterns, input) {
    var tokens = [];
    for (var i = 0; i < input.length;) {
        var token = read_token(patterns, input, i);
        i += token[2];
        tokens.push(token);
        // console.log("Debug: found token: ", token);
    }
    return tokens;
}
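
// For example, with the illustrative grammar, tokenize(patterns, "foo 42") yields
// [[1, 0, 3], [2, 3, 1], [0, 4, 2]]: "foo" (word), " " (whitespace), "42" (number).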

// Takes an input string and a list of token patterns and tokenizes the string.
// Streams the tokens to stdout as they are lexed, one "type,start,length" line each.
function stream_tokens(patterns, input) {
    for (var i = 0; i < input.length;) {
        var token = read_token(patterns, input, i);
        i += token[2];
        process.stdout.write(token[0] + ',' + token[1] + ',' + token[2] + '\n');
    }
}
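
// With the illustrative grammar and input "foo 42", stream_tokens prints:
//
//     1,0,3
//     2,3,1
//     0,4,2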

// Lexes the input, reports throughput, and prints per-token-type counts.
function benchmark(input, patterns) {
    var then = Date.now();
    var tokens = tokenize(patterns, input);
    var elapsed = Date.now() - then;
    console.log("Lexed", tokens.length, "tokens in", elapsed, "ms (", Math.round(tokens.length / (elapsed / 1000)), "tokens/sec)");

    // For small inputs, dump each token alongside its type name and text.
    if (tokens.length < 100) {
        console.log("\nLexed tokens:");
        for (var i = 0; i < tokens.length; i++) {
            var token = tokens[i];
            var token_type = patterns[token[0]][0];
            var token_text = input.slice(token[1], token[1] + token[2]);
            console.log(token, token_type, token_text);
        }
    }

    // Count tokens per type, then re-key the counts by token name for printing.
    var counts = {};
    for (var i = 0; i < patterns.length; i++) {
        counts[i] = 0;
    }
    for (var i = 0; i < tokens.length; i++) {
        counts[tokens[i][0]] += 1;
    }
    var counts_by_name = {};
    for (var i = 0; i < patterns.length; i++) {
        counts_by_name[patterns[i][0]] = counts[i];
    }
    console.log(counts_by_name);
}
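
// Note that benchmark() is not wired into main(); to time a grammar against an
// input, one could swap the stream_tokens call in main() for benchmark(input, patterns).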

function main() {
    assert(process.argv.length >= 3, 'usage: lex.js grammar.txt [input.txt]');
    var patterns = load_lex_grammar(process.argv[2]);
    // Read the named input file if one was given, otherwise read stdin (fd 0).
    var infile = process.argv.length < 4 ? 0 : process.argv[3];
    var input = fs.readFileSync(infile, 'utf-8');
    stream_tokens(patterns, input);
}
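
// Invocation, per the usage line at the top: pipe input on stdin,
//     cat input.txt | ./lex.js grammar.txt
// or name the input file directly:
//     ./lex.js grammar.txt input.txt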

main();