Basic lexers in JavaScript and Python
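Below: a bash script that builds a large C file for performance testing, a grammar file of token-type/regex pairs (grammar.txt), a JavaScript lexer (lex.js) with its patterns hard-coded, and a Python lexer (lex.py) that loads its patterns from the grammar file.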
#!/bin/bash
# use this to create a ~500,000 line C file for performance testing
set -e -o pipefail

if [ ! -e linux-4.19.c ]
then
    curl https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-4.19.tar.xz | unxz | tar x
    cd linux-4.19
    cat **/*.c > ../linux-4.19.c
fi

# You can use a fraction of it by piping into 'head':
# cat linux-4.19.c | head -n10000 | ./lex.py grammar.txt
# cat linux-4.19.c | head -n10000 | ./lex.js
wspace
\s+
identifier
[a-zA-Z_]\w*
float
-?\d+\.\d+
int
-?\d+
string
"(\\\\|\\"|[^"])*"
comment
\/\/.*
other
[\s\S]
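The patterns are tried top to bottom, so order matters: "float" has to be listed before "int" (otherwise "3.14" would lex as an int, an "other" dot, and another int), and "other" acts as a single-character catch-all. A minimal JavaScript sketch of the ordering issue (illustrative only, not part of the gist):

// "3.14" against the float pattern vs. the int pattern:
console.log("3.14".match(/^-?\d+\.\d+/)[0]);  // "3.14"
console.log("3.14".match(/^-?\d+/)[0]);       // "3"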
#!/usr/bin/env node

// Usage: cat input.txt | ./lex.js

// A "token" is an array of three integers:
// 1. The "token type": an index into the list of token patterns.
// 2. The index into the input string marking the start of this token.
// 3. The length of the token.
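// For example, with the patterns defined below, the token [1, 0, 3] would mean:
// pattern index 1 ("identifier"), starting at input index 0, 3 characters long.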

var assert = require('assert');

// Finds the next token in the input string, starting at input[i], using the given token patterns.
// Returns a token, or causes an assertion failure if it was unable to parse a token.
function read_token(input, i, patterns) {
    for (var j = 0; j < patterns.length; j++) {
        var regex = patterns[j][1];
        var result = input.slice(i).match(regex);
        if (result !== null) {
            var text = result[0];
            var token = [j, i, text.length];
            return token;
        }
    }
    assert.fail("exhausted input while lexing token");
}
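// For example, read_token("x = 1", 4, patterns) returns [3, 4, 1]: the int "1".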

// Takes an input string and a list of token patterns and tokenizes the string.
// Returns a list of tokens.
function tokenize(input, patterns) {
    var tokens = [];
    for (var i = 0; i < input.length;) {
        var token = read_token(input, i, patterns);
        i += token[2];
        tokens.push(token);
        // console.log("Debug: found token: ", token);
    }
    return tokens;
}
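// For example, tokenize("x = 1", patterns) returns:
// [ [1, 0, 1], [0, 1, 1], [6, 2, 1], [0, 3, 1], [3, 4, 1] ]
// i.e. identifier "x", wspace " ", other "=", wspace " ", int "1".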

var patterns = [
    ["wspace", /^\s+/],
    ["identifier", /^[a-zA-Z_]\w*/],
    // ["oparen", /^\(/],
    // ["cparen", /^\)/],
    ["float", /^-?\d+\.\d+/],
    ["int", /^-?\d+/],
    // ["obrace", /^\{/],
    // ["cbrace", /^\}/],
    // ["obracket", /^\[/],
    // ["cbracket", /^\]/],
    // match two quotes containing any number of escaped-backslash, escaped quote, or any non-quote characters.
    ["string", /^"(\\\\|\\"|[^"])*"/],
    ["comment", /^\/\/.*/],
    // ["mcomment", /^\/\*[\s\S]*\*\//],
    ["other", /^[\s\S]/],
];

var fs = require('fs');
var input = fs.readFileSync(0).toString();  // 0 is the file descriptor of stdin

var then = Date.now();
var tokens = tokenize(input, patterns);
var now = Date.now();
var elapsed = now - then;
console.log("Lexed", tokens.length, "tokens in", elapsed, "ms (", Math.round(tokens.length / (elapsed / 1000)), "tokens/sec)");

// For small inputs, dump each token alongside its type name and text.
if (tokens.length < 100) {
    console.log("\nLexed tokens:");
    for (var i = 0; i < tokens.length; i++) {
        var token = tokens[i];
        var token_type = patterns[token[0]][0];
        var token_text = input.substr(token[1], token[2]);
        console.log(token, token_type, token_text);
    }
}

// Count how many tokens of each type were found, then report the counts by type name.
var stats = {};
for (var i = 0; i < patterns.length; i++) {
    stats[i] = 0;
}
for (var i = 0; i < tokens.length; i++) {
    var token = tokens[i];
    stats[token[0]] += 1;
}
var stats2 = {};
for (var i = 0; i < patterns.length; i++) {
    var label = patterns[i][0];
    stats2[label] = stats[i];
}
console.log(stats2);
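Because the catch-all "other" pattern matches any single character, every byte of the input lands in exactly one token. A quick sanity check one could append to lex.js (a sketch, not part of the original script; it reuses the script's tokens, input, and assert variables):

// Sketch: re-assemble the input from the lexed tokens and verify nothing was lost.
var roundtrip = tokens.map(function (t) { return input.substr(t[1], t[2]); }).join("");
assert.strictEqual(roundtrip, input);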
#!/usr/bin/env python

# Usage: cat input.txt | ./lex.py grammar.txt

# A "token" is a tuple of three integers:
# 1. The "token type": an index into the list of token patterns.
# 2. The index into the input string marking the start of this token.
# 3. The length of the token.

from __future__ import print_function


def read_grammar(input_stream):
    """Load a grammar from 'input_stream', where a grammar is pairs of lines,
    with the token type on the first line and the regex pattern on the next, e.g.:

    integer
    \\d+
    newline
    \\n
    """
    import re
    lines = [line.rstrip() for line in input_stream.readlines()]
    # thanks to https://stackoverflow.com/questions/5764782/iterate-through-pairs-of-items-in-a-python-list#comment46500664_5764948
    pairs = zip(lines[::2], lines[1::2])
    patterns = []
    for (token_type, regex) in pairs:
        patterns.append((token_type, re.compile(regex, re.MULTILINE)))
    return patterns
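
# For example, read_grammar() on the two-pair grammar shown in its docstring
# returns a list of (name, compiled-regex) pairs, roughly:
# [("integer", <compiled '\d+'>), ("newline", <compiled '\n'>)]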

def read_token(input, i, patterns):
    """Finds the next token in the input string, starting at input[i], using the given token patterns.
    Returns a token, or raises an exception if it was unable to parse a token."""
    for (j, (_, regex)) in enumerate(patterns):
        # buffer() gives a view of the remaining input without copying it (Python 2).
        m = regex.match(buffer(input, i))
        if m:
            text = m.group()
            token = (j, i, len(text))
            return token
    else:
        # for/else: only reached if the loop finished without returning, i.e. no pattern matched.
        raise ValueError("Bad input, starting at: '%s'" % buffer(input, i, 16))


def tokenize(input, patterns):
    """Takes an input string and a list of token patterns and tokenizes the string.
    Returns a list of tokens."""
    tokens = []
    i = 0
    while i < len(input):
        token = read_token(input, i, patterns)
        tokens.append(token)
        i += token[2]
    return tokens


if __name__ == "__main__":
    import sys
    patterns = read_grammar(open(sys.argv[1]))
    input_stream = open(sys.argv[2]) if len(sys.argv) > 2 else sys.stdin
    input = input_stream.read()

    import time
    then = time.time()
    tokens = tokenize(input, patterns)
    now = time.time()
    elapsed = now - then
    print("Lexed", len(tokens), "tokens in", elapsed*1000, "ms (", int(len(tokens)/elapsed), "tokens/sec)")

    if len(tokens) < 100:
        for t in tokens:
            print("%i,%i,%i %s: '%s'" % (t[0], t[1], t[2], patterns[t[0]][0], input[t[1]:t[1]+t[2]]))