@pepasflo
Created November 7, 2018 00:30
Updated lexers
wspace
\s+
identifier
[a-zA-Z_]\w*
float
-?\d+\.\d+
int
-?\d+
string
"(\\\\|\\"|[^"])*"
comment
\/\/.*
other
[\s\S]
#!/usr/bin/env node
// Usage: cat input.txt | ./lex.js grammar.txt
// A "token" is an array of three integers:
// 1. The "token type": an index into the list of token patterns.
// 2. The index into the input string marking the start of this token.
// 3. The length of the token.
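//
// For example, against the grammar above (pattern indices: 0 wspace, 1 identifier,
// 2 float, 3 int, 4 string, 5 comment, 6 other), the input "foo 42" lexes to:
//   [1, 0, 3]   identifier "foo"
//   [0, 3, 1]   wspace " "
//   [3, 4, 2]   int "42"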
'use strict';
var assert = require('assert');
var fs = require('fs');
// Load the lexer grammar from the given filename.
// A grammar file should be pairs of lines: token name followed by regex pattern.
function load_lex_grammar(filename) {
    var patterns = [];
    var contents = fs.readFileSync(filename, 'utf-8').toString();
    var lines = contents.split(/\r?\n/);
    for (var i = 0; i+1 < lines.length; i += 2) {
        var token_name = lines[i];
        var pattern = "^" + lines[i+1];
        var regex = new RegExp(pattern, 'g');
        patterns.push([token_name, regex]);
    }
    return patterns;
}
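// For the grammar above, the loaded patterns look roughly like:
//   [ ["wspace", /^\s+/g], ["identifier", /^[a-zA-Z_]\w*/g], ..., ["other", /^[\s\S]/g] ]
// Each pattern is anchored with "^" so it can only match at the very start of the
// remaining (sliced) input.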
// Finds the next token in the input string, starting at input[i], using the given token patterns.
// Returns a token, or causes assertion failure if it was unable to parse a token.
function read_token(patterns, input, i) {
    for (var j = 0; j < patterns.length; j++) {
        var regex = patterns[j][1];
        var result = input.slice(i).match(regex);
        if (result !== null) {
            var text = result[0];
            var token = [j, i, text.length];
            return token;
        }
    }
    assert.fail("no token pattern matched the input at index " + i);
}
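// e.g. read_token(patterns, "foo 42", 4) returns [3, 4, 2]: token type 3 (int),
// starting at offset 4, 2 characters long.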
// Takes an input string and a list of token patterns and tokenizes the string.
// Returns a list of tokens.
function tokenize(patterns, input) {
    var tokens = [];
    for (var i = 0; i < input.length;) {
        var token = read_token(patterns, input, i);
        i += token[2];
        tokens.push(token);
        // console.log("Debug: found token: ", token);
    }
    return tokens;
}
// Takes an input string and a list of token patterns and tokenizes the string.
// Streams the tokens to stdout as they are lexed.
function stream_tokens(patterns, input) {
    for (var i = 0; i < input.length;) {
        var token = read_token(patterns, input, i);
        i += token[2];
        process.stdout.write(token[0] + ',' + token[1] + ',' + token[2] + '\n');
    }
}
function benchmark(input, patterns) {
    var then = Date.now();
    var tokens = tokenize(patterns, input);
    var now = Date.now();
    var elapsed = now - then;
    console.log("Lexed", tokens.length, "tokens in", elapsed, "ms (", Math.round(tokens.length/(elapsed/1000)), "tokens/sec)");
    // For small inputs, also print each token alongside its type name and text.
    if (tokens.length < 100) {
        console.log("\nLexed tokens:");
        for (var i = 0; i < tokens.length; i++) {
            var token = tokens[i];
            var token_type = patterns[token[0]][0];
            var token_text = input.substr(token[1], token[2]);
            console.log(token, token_type, token_text);
        }
    }
    // Count how many tokens of each type were seen, keyed by pattern index...
    var stats = {};
    for (var i = 0; i < patterns.length; i++) {
        stats[i] = 0;
    }
    for (var i = 0; i < tokens.length; i++) {
        var token = tokens[i];
        stats[token[0]] += 1;
    }
    // ...then re-key the counts by token name for readable output.
    var stats2 = {};
    for (var i = 0; i < patterns.length; i++) {
        var label = patterns[i][0];
        stats2[label] = stats[i];
    }
    console.log(stats2);
}
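// Note: benchmark() is not wired into main() below; to time the lexer, swap the
// stream_tokens() call in main() for benchmark(input, patterns).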
function main() {
    var patterns = load_lex_grammar(process.argv[2]);
    // Read the input from the file named by the second argument, or from stdin (fd 0) if absent.
    var stdin = 0;
    var infile;
    if (process.argv.length < 4) {
        infile = stdin;
    } else {
        infile = process.argv[3];
    }
    var input = fs.readFileSync(infile, 'utf-8').toString();
    stream_tokens(patterns, input);
main();
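
read_token() above re-slices the input for every candidate pattern, which copies the remainder of the string on each attempt. A possible alternative (a sketch, not part of the gist) is to compile each pattern with the ES2015 sticky 'y' flag and point its lastIndex at the current offset, so matching happens in place without the "^" anchor or the slice:

// Sketch of a slice-free read_token variant. Assumes the grammar was loaded with
// sticky regexes, i.e. new RegExp(lines[i+1], 'y') and no "^" prefix, since a sticky
// regex only ever matches exactly at lastIndex.
function read_token_sticky(patterns, input, i) {
    for (var j = 0; j < patterns.length; j++) {
        var regex = patterns[j][1];
        regex.lastIndex = i;            // start matching at offset i, in place
        var result = regex.exec(input);
        if (result !== null) {
            return [j, i, result[0].length];
        }
    }
    throw new Error("no token pattern matched the input at index " + i);
}
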
#!/usr/bin/env python
# Usage: cat input.txt | ./lex.py grammar.txt
# A "token" is a tuple of three integers:
# 1. The "token type": an index into the list of token patterns.
# 2. The index into the input string marking the start of this token.
# 3. The length of the token.
from __future__ import print_function

import re
import sys
import time
def load_lex_grammar(input_stream):
    """Load a grammar from 'input_stream', where a grammar is pairs of lines,
    with the token type on the first line and the regex pattern on the next, e.g.:

        integer
        \\d+
        newline
        \\n
    """
    lines = [line.rstrip() for line in input_stream.readlines()]
    # Pair up consecutive lines as (token_type, pattern); thanks to
    # https://stackoverflow.com/questions/5764782/iterate-through-pairs-of-items-in-a-python-list#comment46500664_5764948
    pairs = zip(lines[::2], lines[1::2])
    patterns = []
    for (token_type, regex) in pairs:
        patterns.append((token_type, re.compile(regex, re.MULTILINE)))
    return patterns
def read_token(patterns, input, i):
    """Finds the next token in the input string, starting at input[i], using the given token patterns.
    Returns a token, or raises an exception if it was unable to parse a token."""
    for (j, (_, regex)) in enumerate(patterns):
        # match() with a 'pos' argument scans from offset i without copying the input.
        m = regex.match(input, i)
        if m:
            text = m.group()
            token = (j, i, len(text))
            return token
    # None of the patterns matched at this position.
    raise ValueError("Bad input, starting at: '%s'" % input[i:i+16])
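# e.g. with the grammar above, read_token(patterns, "foo 42", 0) returns (1, 0, 3):
# token type 1 (identifier), starting at offset 0, 3 characters long.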
def tokenize(patterns, input):
    """Takes an input string and a list of token patterns and tokenizes the string.
    Returns a list of tokens."""
    tokens = []
    i = 0
    while i < len(input):
        token = read_token(patterns, input, i)
        tokens.append(token)
        i += token[2]
    return tokens
def stream_tokens(patterns, input):
    """Stream the tokens to stdout as they are lexed."""
    i = 0
    while i < len(input):
        token = read_token(patterns, input, i)
        i += token[2]
        sys.stdout.write("%i,%i,%i\n" % token)
def benchmark(patterns, input):
    then = time.time()
    tokens = tokenize(patterns, input)
    now = time.time()
    elapsed = now - then
    print("Lexed", len(tokens), "tokens in", elapsed*1000, "ms (", int(len(tokens)/elapsed), "tokens/sec)")
    # For small inputs, also print each token alongside its type name and text.
    if len(tokens) < 100:
        for t in tokens:
            print("%i,%i,%i %s: '%s'" % (t[0], t[1], t[2], patterns[t[0]][0], input[t[1]:t[1]+t[2]]))
if __name__ == "__main__":
    patterns = load_lex_grammar(open(sys.argv[1]))
    # Read the input from the file named by the second argument, or from stdin if absent.
    input_stream = open(sys.argv[2]) if len(sys.argv) > 2 else sys.stdin
    input = input_stream.read()
    stream_tokens(patterns, input)
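
# Example run (assuming this script is saved as lex.py and the grammar above as grammar.txt):
#   $ echo -n 'foo 42' | ./lex.py grammar.txt
#   1,0,3
#   0,3,1
#   3,4,2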