@pepasflo, last active October 30, 2018
Basic lexers in JavaScript and Python

#!/bin/bash
# Use this to create a ~500,000-line C file for performance testing.
set -e -o pipefail
if [ ! -e linux-4.19.c ]
then
    curl https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-4.19.tar.xz | unxz | tar x
    cd linux-4.19
    # Note: bash expands '**' like '*' unless 'shopt -s globstar' is set, so this
    # concatenates the .c files one directory deep into the parent directory.
    cat **/*.c > ../linux-4.19.c
fi
# You can lex just a fraction of it by piping through 'head':
#   cat linux-4.19.c | head -n10000 | ./lex.py grammar.txt
#   cat linux-4.19.c | head -n10000 | ./lex.js

wspace
\s+
identifier
[a-zA-Z_]\w*
float
-?\d+\.\d+
int
-?\d+
string
"(\\\\|\\"|[^"])*"
comment
\/\/.*
other
[\s\S]
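
The fourteen lines above are the grammar file (grammar.txt in the usage comments): alternating pairs of token-type name and regex, in priority order, which lex.py's read_grammar pairs up and compiles. A minimal sketch of that pairing step, using an inlined two-pair grammar string purely for illustration:

import io, re

grammar_text = u"wspace\n\\s+\nidentifier\n[a-zA-Z_]\\w*\n"      # first two pairs only
lines = [line.rstrip() for line in io.StringIO(grammar_text).readlines()]
pairs = zip(lines[::2], lines[1::2])                              # (token type, regex) pairs
patterns = [(name, re.compile(rx)) for (name, rx) in pairs]
print([name for (name, _) in patterns])                           # ['wspace', 'identifier']
print(bool(patterns[1][1].match("foo_bar")))                      # True: 'identifier' matches
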
#!/usr/bin/env node
// Usage: cat input.txt | ./lex.js
// A "token" is an array of three integers:
// 1. The "token type": an index into the list of token patterns.
// 2. The index into the input string marking the start of this token.
// 3. The length of the token.
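//
// For example (illustrative, not output of this script), lexing the input "x = 42"
// against the patterns defined below would produce:
//   [1, 0, 1]  identifier  "x"
//   [0, 1, 1]  wspace      " "
//   [6, 2, 1]  other       "="
//   [0, 3, 1]  wspace      " "
//   [3, 4, 2]  int         "42"
// where 1, 0, 6, and 3 are indices into the patterns array.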
var assert = require('assert');
// Finds the next token in the input string, starting at input[i], using the given token patterns.
// Returns a token, or causes an assertion failure if no pattern matches.
function read_token(input, i, patterns) {
    for (var j = 0; j < patterns.length; j++) {
        var regex = patterns[j][1];
        // Each pattern is anchored with '^', so matching against input.slice(i)
        // can only match a token that starts exactly at position i.
        var result = input.slice(i).match(regex);
        if (result !== null) {
            var text = result[0];
            var token = [j, i, text.length];
            return token;
        }
    }
    assert.fail("no token pattern matched the input at index " + i);
}
// Takes an input string and a list of token patterns and tokenizes the string.
// Returns a list of tokens.
function tokenize(input, patterns) {
    var tokens = [];
    for (var i = 0; i < input.length;) {
        var token = read_token(input, i, patterns);
        i += token[2];
        tokens.push(token);
        // console.log("Debug: found token: ", token);
    }
    return tokens;
}
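// Token patterns, tried in order: the first one that matches wins. More specific
// patterns therefore come first, e.g. "float" before "int" (otherwise "3.14" would
// lex as the int "3" followed by "." and "14"), and the catch-all "other" comes
// last so that lexing never fails on unexpected characters.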
var patterns = [
    ["wspace", /^\s+/],
    ["identifier", /^[a-zA-Z_]\w*/],
    // ["oparen", /^\(/],
    // ["cparen", /^\)/],
    ["float", /^-?\d+\.\d+/],
    ["int", /^-?\d+/],
    // ["obrace", /^\{/],
    // ["cbrace", /^\}/],
    // ["obracket", /^\[/],
    // ["cbracket", /^\]/],
    // Match two quotes containing any number of escaped-backslash, escaped-quote, or non-quote characters.
    ["string", /^"(\\\\|\\"|[^"])*"/],
    ["comment", /^\/\/.*/],
    // ["mcomment", /^\/\*[\s\S]*\*\//],
    ["other", /^[\s\S]/],
];
var fs = require('fs');
var input = fs.readFileSync(0).toString(); // 0 is the file descriptor of stdin
var then = Date.now();
var tokens = tokenize(input, patterns);
var now = Date.now();
var elapsed = now - then;
console.log("Lexed", tokens.length, "tokens in", elapsed, "ms (", Math.round(tokens.length/(elapsed/1000)), "tokens/sec)");
if (tokens.length < 100) {
    console.log("\nLexed tokens:");
    for (var i = 0; i < tokens.length; i++) {
        var token = tokens[i];
        var token_type = patterns[token[0]][0];
        var token_text = input.substr(token[1], token[2]);
        console.log(token, token_type, token_text);
    }
}
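// Tally how many tokens of each type were found: build a histogram keyed by
// pattern index, then relabel the keys with the token-type names for printing.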
var stats = {};
for (var i = 0; i < patterns.length; i++) {
    stats[i] = 0;
}
for (var i = 0; i < tokens.length; i++) {
    var token = tokens[i];
    stats[token[0]] += 1;
}
var stats2 = {};
for (var i = 0; i < patterns.length; i++) {
    var label = patterns[i][0];
    stats2[label] = stats[i];
}
console.log(stats2);

#!/usr/bin/env python
# Usage: cat input.txt | ./lex.py grammar.txt
# A "token" is a tuple of three integers:
# 1. The "token type": an index into the list of token patterns.
# 2. The index into the input string marking the start of this token.
# 3. The length of the token.
from __future__ import print_function
def read_grammar(input_stream):
    """Load a grammar from 'input_stream', where a grammar is pairs of lines,
    with the token type on the first line and the regex pattern on the next, e.g.:
        integer
        \\d+
        newline
        \\n
    """
    import re
    lines = [line.rstrip() for line in input_stream.readlines()]
    # thanks to https://stackoverflow.com/questions/5764782/iterate-through-pairs-of-items-in-a-python-list#comment46500664_5764948
    pairs = zip(lines[::2], lines[1::2])
    patterns = []
    for (token_type, regex) in pairs:
        patterns.append((token_type, re.compile(regex, re.MULTILINE)))
    return patterns
def read_token(input, i, patterns):
    """Finds the next token in the input string, starting at input[i], using the given token patterns.
    Returns a token, or raises an exception if it was unable to parse a token."""
    for (j, (_, regex)) in enumerate(patterns):
        # match() only attempts a match starting exactly at position i.
        m = regex.match(input, i)
        if m:
            text = m.group()
            token = (j, i, len(text))
            return token
    else:
        # for-else: only reached if no pattern matched at position i.
        raise ValueError("Bad input, starting at: '%s'" % input[i:i+16])
def tokenize(input, patterns):
    """Takes an input string and a list of token patterns and tokenizes the string.
    Returns a list of tokens."""
    tokens = []
    i = 0
    while i < len(input):
        token = read_token(input, i, patterns)
        tokens.append(token)
        i += token[2]
    return tokens
if __name__ == "__main__":
    import sys
    patterns = read_grammar(open(sys.argv[1]))
    input_stream = open(sys.argv[2]) if len(sys.argv) > 2 else sys.stdin
    input = input_stream.read()

    import time
    then = time.time()
    tokens = tokenize(input, patterns)
    now = time.time()
    elapsed = now - then
    print("Lexed", len(tokens), "tokens in", elapsed*1000, "ms (", int(len(tokens)/elapsed), "tokens/sec)")

    if len(tokens) < 100:
        for t in tokens:
            print("%i,%i,%i %s: '%s'" % (t[0], t[1], t[2], patterns[t[0]][0], input[t[1]:t[1]+t[2]]))
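
For reference, a small sketch of driving the lexer programmatically instead of via stdin. It assumes the script above is saved as lex.py alongside grammar.txt, and the sample source string is made up:

# Illustrative only: assumes lex.py and grammar.txt are in the current directory.
from lex import read_grammar, tokenize

patterns = read_grammar(open("grammar.txt"))
source = 'int answer = 42;  // made-up sample input'
for (type_index, start, length) in tokenize(source, patterns):
    print("%-12s %r" % (patterns[type_index][0], source[start:start+length]))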