Skip to content

Instantly share code, notes, and snippets.

@cellularmitosis
Last active March 29, 2023 01:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cellularmitosis/226a18fb44f8b5287765ad3978e9074d to your computer and use it in GitHub Desktop.
Save cellularmitosis/226a18fb44f8b5287765ad3978e9074d to your computer and use it in GitHub Desktop.
mklexer.py part 5: the `offside-rule` pragma

Blog 2020/6/17

<- previous | index | next ->

mklexer.py part 5: the offside-rule pragma

See also part 1, 2, 3, 4

Languages like Python have indentation-based scoping. This is known as off-side rule.

This involves automatic insertion of INDENT tokens when the level of indentation increases, and DEDENT tokens when the level of indentation decreases.

For example, the input:

foo
    bar

or more specifically, the character sequence:

f   o   o   \n           b   a   r   \n

might become the token stream:

SYMBOL('foo') NEWLINE INDENT SYMBOL('bar') NEWLINE DEDENT.

Implementing off-side rule in the lexer makes for an easier job when it comes time to write the parser.

This was a technique I used in An alternative syntax for C.

Example: a trivial Python-like function definition

input.txt:

def five():
    return 5

tokendefs.txt:

#pragma offside-rule 4 spaces
SYMBOL
[a-z]+
NUMBER
[0-9]+
OPAREN
\(
CPAREN
\)
COLON
:
NEWLINE
\n
WSPACE
\s+

#pragma keywords
DEF
def
RETURN
return
$ ./mklexer.py tokendefs.txt > lexer.py
$ chmod +x lexer.py 
$ ./lexer.py input.txt | jq .
[
  {
    "type": "format",
    "format": "tokens"
  },
  [
    {
      "token_type": "DEF",
      "text": "def",
      "type": "token"
    },
    {
      "token_type": "WSPACE",
      "text": " ",
      "type": "token"
    },
    {
      "token_type": "SYMBOL",
      "text": "five",
      "type": "token"
    },
    {
      "token_type": "OPAREN",
      "text": "(",
      "type": "token"
    },
    {
      "token_type": "CPAREN",
      "text": ")",
      "type": "token"
    },
    {
      "token_type": "COLON",
      "text": ":",
      "type": "token"
    },
    {
      "token_type": "NEWLINE",
      "text": "\n",
      "type": "token"
    },
    {
      "token_type": "INDENT",
      "text": "    ",
      "type": "token"
    },
    {
      "token_type": "RETURN",
      "text": "return",
      "type": "token"
    },
    {
      "token_type": "WSPACE",
      "text": " ",
      "type": "token"
    },
    {
      "token_type": "NUMBER",
      "text": "5",
      "type": "token"
    },
    {
      "token_type": "NEWLINE",
      "text": "\n",
      "type": "token"
    },
    {
      "token_type": "DEDENT",
      "text": "",
      "type": "token"
    }
  ]
]
#!/usr/bin/env python
import base64
import sys
import os
infile = sys.argv[1]
outfile = os.path.basename(infile).rstrip('.base64')
s64 = open(infile,'rb').read()
s = base64.b64decode(s64)
open(outfile,'wb').write(s)
#!/usr/bin/env python
import base64
import sys
import os
infile = sys.argv[1]
outfile = os.path.basename(infile) + ".base64"
s = open(infile,'rb').read()
s64 = base64.b64encode(s)
open(outfile,'wb').write(s64)
lexer.py: mklexer.py tokendefs.txt
./mklexer.py tokendefs.txt > lexer.py
chmod +x lexer.py
test:
./test.sh
tests.tar.base64:
tar c tests > tests.tar
python b64enc.py tests.tar
rm -f tests.tar
clean:
rm -f lexer.py tests.tar
.PHONY: clean test tests.tar.base64
#!/usr/bin/env python
# mklexer.py: generate a Python lexer based on a token definitions file.
# See https://gist.github.com/cellularmitosis/1da62db09d41703c5a505d0bac9d9056
# Copyright (c) 2020 Jason Pepas
# Released under the terms of the MIT license.
# See https://opensource.org/licenses/MIT
import sys
import os
import pprint
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
def usage(fd):
"""Prints the usage help to the given file descriptor."""
exe = os.path.basename(sys.argv[0])
w = fd.write
if fd is sys.stderr:
w("Error: bad usage.\n")
w("\n")
else:
w("%s: generate a Python lexer based on a token definitions file.\n" % exe)
w("\n")
w("Display help:\n")
w(" %s -h\n" % exe)
w(" %s --help\n" % exe)
w("\n")
w("Generate a lexer using tokendefs.txt:\n")
w(" %s tokendefs.txt > lexer.py\n" % exe)
w(" chmod +x lexer.py\n")
w("""
tokendefs.txt consists of pairs of TOKENTYPE and <regex> lines.
Example tokendefs.txt:
NUMBER
-?\\d+(\\.\\d+)?
SYMBOL
[a-zA-Z_][a-zA-Z0-9_-]*
Use the lexer on input.txt, producing the standard JSON token format:
./lexer.py input.txt | jq .
Two example tokens in standard JSON format:
{'type': 'TOKEN', 'token_type': 'NUMBER', 'text': '3.14159'}
{'type': 'TOKEN', 'token_type': 'SYMBOL', 'text': 'fibonacci'}
Use the lexer on input.txt, producing "fast" array-based JSON tokens:
./lexer.py --fast input.txt | jq .
"fast" tokens are [<token type index>, <matched text>] pairs.
The same example tokens, but in 'fast' JSON format:
[0, '3.14159']
[1, 'fibonacci']
tokendefs.txt may also contain #pragma's:
line-oriented, discard, eof, keywords, offside-rule.
""")
#
# Parsing.
#
def parse_tokendefs(lines):
"""Parses the token definitions file, stopping at the first 'refine' pragma."""
tokendefs = []
pragmas = {}
i = 0
while i < len(lines):
line1 = lines[i]
if len(line1) == 0:
# skip blank lines.
i += 1
continue
words = line1.split()
if words[0] == '#pragma':
if len(words) == 1:
raise Exception("Can't parse pragma: %s" % line1)
pragma_name = words[1]
if pragma_name in ['line-oriented', 'eof']:
pragmas[pragma_name] = True
i += 1
continue
elif pragma_name == 'discard':
discardable_token_types = words[2:]
if len(discardable_token_types) == 0:
raise Exception("Discard pragma with no token types listed")
pragmas[pragma_name] = discardable_token_types
i += 1
continue
elif pragma_name == 'offside-rule':
quantity = int(words[2])
if words[3] == 'spaces':
indent_unit = ' ' * quantity
elif words[3] == 'tabs':
indent_unit = '\t' * quantity
else:
assert False
pragmas[pragma_name] = True
pragmas['indent_unit'] = indent_unit
i += 1
continue
elif pragma_name == 'keywords':
# this marks the start of the keywords section.
break
else:
raise Exception("Unknown pragma '%s'" % pragma_name)
elif line1[0] == '#':
# skip comments.
i += 1
continue
i += 1
if i >= len(lines):
raise Exception(
"Line %d: Token type '%s' has no corresponding regex." \
% (i, line1)
)
line2 = lines[i]
if len(line2) == 0:
raise Exception("Line %d: Zero-length regex." % (i+1))
tokentype = line1
regex = line2
pair = (tokentype, regex)
tokendefs.append(pair)
i += 1
continue
remaining_lines = lines[i:]
(keyword_maps, toktypes) = parse_keywords(remaining_lines, tokendefs)
if 'eof' in pragmas.keys():
toktypes.append('EOF')
if 'offside-rule' in pragmas.keys():
toktypes.append('INDENT')
toktypes.append('DEDENT')
return (pragmas, toktypes, tokendefs, keyword_maps)
def parse_keywords(lines, tokendefs):
"""Parses the section starting at the keywords pragma."""
keywords_map = {}
keywords_map_fast = {}
toktypes = [pair[0] for pair in tokendefs]
if len(lines) == 0:
keyword_maps = (keywords_map, keywords_map_fast)
return (keyword_maps, toktypes)
assert lines[0].split()[0] == '#pragma' and lines[0].split()[1] == 'keywords'
if len(lines) == 1:
raise Exception("Keywords pragma found, but no keywords defined?")
i = 1
while i < len(lines):
line1 = lines[i]
if len(line1) == 0:
# skip blank lines
i += 1
continue
elif line1.split()[0] == '#pragma':
raise Exception("Pragmas not allowed within keywords section.")
elif line1.startswith('#'):
# skip comments.
i += 1
continue
i += 1
if i >= len(lines):
raise Exception(
"Line %d: Token type '%s' has no corresponding keyword." \
% (i, line1)
)
line2 = lines[i]
if len(line2) == 0:
raise Exception("Line %d: Zero-length keyword." % i+1)
tokentype = line1
keyword = line2
keywords_map[keyword] = tokentype
toktypes.append(tokentype)
i += 1
continue
for keyword, toktype in keywords_map.items():
keywords_map_fast[keyword] = toktypes.index(toktype)
keyword_maps = (keywords_map, keywords_map_fast)
return (keyword_maps, toktypes)
#
# Code generation.
#
def codegen_pragmas(pragmas):
"""Generates the Python code for pragmas."""
fd = StringIO()
w = fd.write
is_line_oriented = 'line-oriented' in pragmas.keys()
w("pragma_line_oriented = %s\n" % is_line_oriented)
has_eof_pragma = 'eof' in pragmas.keys()
w("pragma_eof = %s\n" % has_eof_pragma)
if 'discard' in pragmas.keys():
toktypes_string = '[%s]' % ','.join(
[("'%s'" % token_type) for token_type in pragmas['discard']]
)
w("pragma_discard = %s\n" % toktypes_string)
else:
w("pragma_discard = []\n")
use_offside_rule = 'offside-rule' in pragmas.keys()
w("pragma_offside_rule = %s\n" % use_offside_rule)
if 'indent_unit' in pragmas.keys():
indent_unit = pragmas['indent_unit']
w("indent_unit = %s\n" % indent_unit.__repr__())
else:
w("indent_unit = ''\n")
w('\n')
code = fd.getvalue()
fd.close()
return code
def codegen_toktypes(toktypes):
"""Generates the Python code of the toktypes array."""
assert len(toktypes) > 0, toktypes
reprs = [toktype.__repr__() for toktype in toktypes]
if len(reprs) == 1:
return "toktypes = [%s]\n\n" % reprs[0]
lines = []
linebuf = "toktypes = [%s" % reprs[0]
for r in reprs[1:]:
s = ', ' + r
if len(linebuf) + len(s) < 80:
linebuf += s
continue
else:
linebuf += ','
lines.append(linebuf)
linebuf = ' ' + r
if len(lines) == 0:
linebuf += ']\n\n'
lines.append(linebuf)
else:
lines.append(linebuf)
lines.append(']\n\n')
return '\n'.join(lines)
def codegen_tokendefs(tokendefs):
"""Generates the Python code of the tokendefs table."""
def codegen_regex(regex_text):
"""Generates the Python code of a regex."""
# do everything we can to avoid the backslash plague.
if "'" not in regex_text:
return "r'%s'" % regex_text
elif '"' not in regex_text:
return 'r"%s"' % regex_text
elif "'''" not in regex_text and not regex_text.startswith("'") and not regex_text.endswith("'"):
return "r'''%s'''" % regex_text
elif '"""' not in regex_text and not regex_text.startswith('"') and not regex_text.endswith('"'):
return 'r"""%s"""' % regex_text
else:
# oh well, at least we tried :shrug:
return regex_text.__repr__()
fd = StringIO()
w = fd.write
w("tokendefs = [\n")
for token_type, regex in tokendefs:
w(" ['%s', %s],\n" % (token_type, codegen_regex(regex)))
w("]\n\n")
code = fd.getvalue()
fd.close()
return code
def codegen_keyword_maps(keyword_maps):
"""Generates the Python code of the keyword maps."""
(keywords_map, keywords_map_fast) = keyword_maps
s1 = 'keywords_map = '
s1 += pprint.pformat(keywords_map, width=80-len(s1)) + '\n\n'
s2 = 'keywords_map_fast = '
s2 += pprint.pformat(keywords_map_fast, width=80-len(s2)) + '\n\n'
return s1 + s2
def codegen(pragmas, toktypes, tokendefs, keyword_maps):
"""Generates the Python code of the lexer."""
fd = StringIO()
w = fd.write
w("""#!/usr/bin/env python
# DO NOT EDIT: this lexer was generated by mklexer.py.
import sys
import re
import json
""")
pragmas_code = codegen_pragmas(pragmas)
w(pragmas_code)
toktypes_code = codegen_toktypes(toktypes)
w(toktypes_code)
tokendefs_code = codegen_tokendefs(tokendefs)
w(tokendefs_code)
keyword_maps_code = codegen_keyword_maps(keyword_maps)
w(keyword_maps_code)
w("""
def compile_regexes():
\"\"\"Compile the regexes.\"\"\"
for pair in tokendefs:
try:
pair[1] = re.compile(pair[1])
except Exception as e:
sys.stderr.write("Error: can't compile regex for '%s'.\\n" % pair[0])
raise e
compile_regexes()
def token_text(token):
\"\"\"Returns the text of the token.\"\"\"
if use_fast_format:
return token[1]
else:
return token['text']
def get_linenum_charnum(text, offset):
\"\"\"Returns the line number and character number of the offset.\"\"\"
linenum = 1
charnum = 1
i = 0
while i < offset:
if text[i] == '\\n':
linenum += 1
charnum = 1
i += 1
continue
else:
charnum += 1
i += 1
continue
return (linenum, charnum)
def consume_next_token(text, offset, use_fast_format):
\"\"\"Consumes next token from the given text input.
Returns a (token, offset) pair.
Throws if no tokens match.\"\"\"
for i, pair in enumerate(tokendefs):
(token_type, regex) = pair
m = regex.match(text, offset)
if m is None:
continue
matched_text = m.group()
if use_fast_format:
toktype_index = i
if matched_text in keywords_map_fast:
toktype_index = keywords_map_fast[matched_text]
token = [toktype_index, matched_text]
else:
if matched_text in keywords_map:
token_type = keywords_map[matched_text]
token = {
'type': 'token',
'token_type': token_type,
'text': matched_text,
}
new_offset = offset + len(matched_text)
return (token, new_offset)
# none of the token types matched
(linenum, charnum) = get_linenum_charnum(text, offset)
raise Exception(
"Can't lex starting at line %d, character %d, context: '%s'" \\
% (linenum, charnum, text[offset:offset+32])
)
def discard_tokens(tokens, use_fast_format):
\"\"\"Discards any tokens specified by the 'discard' pragma.\"\"\"
def make_discard_set():
if use_fast_format:
discard_set = set()
for i, pair in enumerate(tokendefs):
(token_type, _) = pair
if token_type in pragma_discard:
discard_set.add(i)
continue
else:
discard_set = set(pragma_discard)
return discard_set
discard_set = make_discard_set()
kept_tokens = []
for token in tokens:
if use_fast_format:
toktype = token[0]
else:
toktype = token['token_type']
if toktype not in discard_set:
kept_tokens.append(token)
continue
return kept_tokens
def make_lines(tokens, use_fast_format):
\"\"\"Return a line-oriented array-of-arrays from the given tokens.\"\"\"
lines = []
line = []
for token in tokens:
text = token_text(token)
if text == '\\n':
lines.append(line)
line = []
continue
else:
line.append(token)
continue
if len(line) > 0:
lines.append(line)
return lines
def offside_rule(token, previous_token, indent_level):
\"\"\"Implement the "offside rule" indentation-based scoping mechanism.
See https://en.wikipedia.org/wiki/Off-side_rule
Returns a list of replacement tokens and the new indentation level.
If the previous token was a NL and the indentation level has changed,
inject INDENT or DEDENT tokens as needed.
Otherwise, return the token unmodified.
Example: the token stream
COLON NL WSPACE RETURN NL
should become
COLON NL INDENT RETURN NL DEDENT
\"\"\"
if previous_token is None:
return ([token], indent_level)
previous_text = token_text(previous_token)
if previous_text not in ['\\n', '\\r\\n', '\\r']:
return ([token], indent_level)
new_tokens = []
new_indent_level = 0
indent_unit_len = len(indent_unit)
text = token_text(token)
while text.startswith(indent_unit):
text = text[indent_unit_len:]
new_indent_level += 1
continue
# if the indentation level increased, insert INDENT tokens.
for _ in range(new_indent_level - indent_level):
if use_fast_format:
new_tokens.append([toktypes.index('INDENT'), indent_unit])
else:
new_tokens.append({
'type': 'token',
'token_type': 'INDENT',
'text': indent_unit
})
# if the indentation level decreased, insert DEDENT tokens.
for _ in range(indent_level - new_indent_level):
if use_fast_format:
new_tokens.append([toktypes.index('DEDENT'), ''])
else:
new_tokens.append({
'type': 'token',
'token_type': 'DEDENT',
'text': ''
})
if len(text) > 0:
new_tokens.append(token)
return (new_tokens, new_indent_level)
def lex(text, use_fast_format):
\"\"\"Returns a list of tokens for the given text input.\"\"\"
tokens = []
offset = 0
indent_level = 0
previous_token = None
# consume the text.
while offset < len(text):
(token, offset) = consume_next_token(text, offset, use_fast_format)
if pragma_offside_rule:
# the offside-rule special-case:
(tokens2, indent_level) = offside_rule(token, previous_token, indent_level)
tokens += tokens2
previous_token = token
else:
tokens.append(token)
continue
# if there is a left-over indentation level, insert trailing DEDENT tokens.
while indent_level > 0:
if use_fast_format:
tokens.append([toktypes.index('DEDENT'), ''])
else:
tokens.append({
'type': 'token',
'token_type': 'DEDENT',
'text': ''
})
indent_level -= 1
continue
# discard any unwanted tokens.
tokens = discard_tokens(tokens, use_fast_format)
# chop tokens array into lines if desired.
if pragma_line_oriented:
tokens = make_lines(tokens, use_fast_format)
# append an EOF token if desired.
if pragma_eof:
if use_fast_format:
eof_token = [len(toktypes)-1, ""]
else:
eof_token = {
"type": "token",
"token_type": "EOF",
"text": ""
}
if pragma_line_oriented:
tokens.append([eof_token])
else:
tokens.append(eof_token)
# create the format dictionary.
format_dict = {'type': 'format'}
if use_fast_format:
if pragma_line_oriented:
format_dict['format'] = 'fast-lines'
else:
format_dict['format'] = 'fast'
format_dict['token_types'] = toktypes
else:
if pragma_line_oriented:
format_dict['format'] = 'tokens-lines'
else:
format_dict['format'] = 'tokens'
json_obj = [format_dict, tokens]
return json_obj
if __name__ == '__main__':
infile = [arg for arg in sys.argv[1:] if not arg.startswith('-')][-1]
use_fast_format = False
if '--fast' in sys.argv[1:]:
use_fast_format = True
fd = open(infile, 'r')
text = fd.read()
fd.close()
json_obj = lex(text, use_fast_format)
output = json.dumps(json_obj)
if not output.endswith('\\n'):
output += '\\n'
sys.stdout.write(output)
""")
code = fd.getvalue()
fd.close()
return code
if __name__ == "__main__":
if len(sys.argv) < 2:
usage(sys.stderr)
sys.exit(1)
if '-h' in sys.argv or '--help' in sys.argv:
usage(sys.stdout)
sys.exit(0)
# the last non-option arg is the tokendefs file.
tokendefs_fpath = None
non_option_args = [arg for arg in sys.argv[1:] if not arg.startswith('-')]
if len(non_option_args) != 1:
usage(sys.stderr)
sys.exit(1)
tokendefs_fpath = non_option_args[0]
fd = open(tokendefs_fpath, 'r')
tokendefs_lines = fd.read().splitlines()
fd.close()
(pragmas, toktypes, tokendefs, keyword_maps) = parse_tokendefs(tokendefs_lines)
code = codegen(pragmas, toktypes, tokendefs, keyword_maps)
sys.stdout.write(code)
#!/bin/bash
set -e
set -o pipefail
shopt -s expand_aliases
alias canonicaljson="python -c 'import json, sys; sys.stdout.write(json.dumps(json.load(sys.stdin), sort_keys=True))'"
if ! test -d tests
then
rm -f tests.tar
python b64dec.py tests.tar.base64
cat tests.tar | tar x
rm -f tests.tar
fi
for d in $( ls tests )
do
echo "Test $d"
cd tests/$d
../../mklexer.py tokendefs.txt > /tmp/lexer.py
echo " Python 2"
cat answer.txt | canonicaljson > /tmp/1
python2 /tmp/lexer.py input.txt | canonicaljson > /tmp/2
diff -q /tmp/1 /tmp/2
echo " Python 3"
python3 /tmp/lexer.py input.txt | canonicaljson > /tmp/2
diff -q /tmp/1 /tmp/2
echo " Python 2, --fast format"
cat answer-fast.txt | canonicaljson > /tmp/1
python2 /tmp/lexer.py --fast input.txt | canonicaljson > /tmp/2
diff -q /tmp/1 /tmp/2
echo " Python 3, --fast format"
python3 /tmp/lexer.py --fast input.txt | canonicaljson > /tmp/2
diff -q /tmp/1 /tmp/2
rm -f /tmp/lexer.py /tmp/1 /tmp/2
cd ../..
done
tests/                                                                                              000755  000765  000024  00000000000 13672235046 012312  5                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         tests/007/                                                                                          000755  000765  000024  00000000000 13672474225 012624  5                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         tests/001/                                                                                          000755  000765  000024  00000000000 13672231132 012602  5                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         tests/006/                                                                                          000755  000765  000024  00000000000 13672232034 012611  5                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         tests/003/                                                                                          000755  000765  000024  00000000000 13672231132 012604  5                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         tests/004/                                                                                          000755  000765  000024  00000000000 13672231132 012605  5                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         tests/005/                                                                                          000755  000765  000024  00000000000 13672231132 012606  5                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         tests/002/                                                                                          000755  000765  000024  00000000000 13672231132 012603  5                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         tests/002/answer-fast.txt                                                                           000644  000765  000024  00000000320 13671454235 015603  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "fast-lines", "token_types": ["SYMBOL", "WSPACE"]},
 [
  [
   [0, "foo"],
   [1, " "],
   [0, "bar"]
  ],
  [
   [0, "bin"],
   [1, " "],
   [0, "baz"]
  ]
 ]
]
                                                                                                                                                                                                                                                                                                                tests/002/input.txt                                                                                 000644  000765  000024  00000000020 13671454235 014505  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         foo bar
bin baz
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                tests/002/answer.txt                                                                                000644  000765  000024  00000000701 13671454235 014653  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "tokens-lines"},
 [
  [
   {"type": "token", "token_type": "SYMBOL", "text": "foo"},
   {"type": "token", "token_type": "WSPACE", "text": " "},
   {"type": "token", "token_type": "SYMBOL", "text": "bar"}
  ],
  [
   {"type": "token", "token_type": "SYMBOL", "text": "bin"},
   {"type": "token", "token_type": "WSPACE", "text": " "},
   {"type": "token", "token_type": "SYMBOL", "text": "baz"}
  ]
 ]
]
                                                               tests/002/tokendefs.txt                                                                             000644  000765  000024  00000000057 13672231132 015330  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         #pragma line-oriented
SYMBOL
[a-z]+
WSPACE
\s+
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 tests/005/answer-fast.txt                                                                           000644  000765  000024  00000000320 13671454235 015606  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "fast-lines", "token_types": ["SYMBOL", "WSPACE", "EOF"]},
 [
  [
   [0, "foo"],
   [1, " "],
   [0, "bar"]
  ],
  [
   [0, "baz"]
  ],
  [
   [2, ""]
  ]
 ]
]
                                                                                                                                                                                                                                                                                                                tests/005/input.txt                                                                                 000644  000765  000024  00000000014 13671454235 014513  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         foo bar
baz
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    tests/005/answer.txt                                                                                000644  000765  000024  00000000611 13671454235 014656  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "tokens-lines"},
 [
  [
   {"type": "token", "token_type": "SYMBOL", "text": "foo"},
   {"type": "token", "token_type": "WSPACE", "text": " "},
   {"type": "token", "token_type": "SYMBOL", "text": "bar"}
  ],
  [
   {"type": "token", "token_type": "SYMBOL", "text": "baz"}
  ],
  [
   {"type": "token", "token_type": "EOF", "text": ""}
  ]
 ]
]
                                                                                                                       tests/005/tokendefs.txt                                                                             000644  000765  000024  00000000073 13672231132 015331  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         #pragma line-oriented
#pragma eof
SYMBOL
[a-z]+
WSPACE
\s+
                                                                                                                                                                                                                                                                                                                                                                                                                                                                     tests/004/answer-fast.txt                                                                           000644  000765  000024  00000000304 13671454235 015607  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "fast", "token_types": ["SYMBOL", "WSPACE", "EOF"]},
 [
  [0, "foo"],
  [1, " "],
  [0, "bar"],
  [1, "   "],
  [0, "baz"],
  [1, "\n"],
  [2, ""]
 ]
]
                                                                                                                                                                                                                                                                                                                            tests/004/input.txt                                                                                 000644  000765  000024  00000000016 13671454235 014514  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         foo bar   baz
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  tests/004/answer.txt                                                                                000644  000765  000024  00000000731 13671454235 014660  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "tokens"},
 [
  {"type": "token", "token_type": "SYMBOL", "text": "foo"},
  {"type": "token", "token_type": "WSPACE", "text": " "},
  {"type": "token", "token_type": "SYMBOL", "text": "bar"},
  {"type": "token", "token_type": "WSPACE", "text": "   "},
  {"type": "token", "token_type": "SYMBOL", "text": "baz"},
  {"type": "token", "token_type": "WSPACE", "text": "\n"},
  {"type": "token", "token_type": "EOF", "text": ""}
 ]
]
                                       tests/004/tokendefs.txt                                                                             000644  000765  000024  00000000045 13672231132 015327  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         #pragma eof
SYMBOL
[a-z]+
WSPACE
\s+
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           tests/003/answer-fast.txt                                                                           000644  000765  000024  00000000202 13671454235 015603  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "fast", "token_types": ["SYMBOL", "NUMBER", "WSPACE"]},
 [
  [0, "foo"],
  [0, "bar"]
 ]
]
                                                                                                                                                                                                                                                                                                                                                                                              tests/003/input.txt                                                                                 000644  000765  000024  00000000014 13671454235 014511  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         foo 123 bar
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    tests/003/answer.txt                                                                                000644  000765  000024  00000000261 13671454235 014655  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "tokens"},
 [
  {"type": "token", "token_type": "SYMBOL", "text": "foo"},
  {"type": "token", "token_type": "SYMBOL", "text": "bar"}
 ]
]
                                                                                                                                                                                                                                                                                                                                               tests/003/tokendefs.txt                                                                             000644  000765  000024  00000000105 13672231132 015323  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         #pragma discard WSPACE NUMBER
SYMBOL
[a-z]+
NUMBER
[0-9]+
WSPACE
\s+
                                                                                                                                                                                                                                                                                                                                                                                                                                                           tests/006/answer-fast.txt                                                                           000644  000765  000024  00000000255 13672231763 015616  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "fast", "token_types": [
     "SYMBOL", "WSPACE", "WHILE"
 ]},
 [
  [0, "whilezzz"],
  [1, " "],
  [2, "while"],
  [1, "\n"]
 ]
]
                                                                                                                                                                                                                                                                                                                                                   tests/006/input.txt                                                                                 000644  000765  000024  00000000017 13672231563 014515  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         whilezzz while
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 tests/006/answer.txt                                                                                000644  000765  000024  00000000456 13672231744 014665  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "tokens"},
 [
  {"type": "token", "token_type": "SYMBOL", "text": "whilezzz"},
  {"type": "token", "token_type": "WSPACE", "text": " "},
  {"type": "token", "token_type": "WHILE", "text": "while"},
  {"type": "token", "token_type": "WSPACE", "text": "\n"}
 ]
]
                                                                                                                                                                                                                  tests/006/tokendefs.txt                                                                             000644  000765  000024  00000000067 13672232034 015337  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         SYMBOL
[a-z]+
WSPACE
\s+

#pragma keywords
WHILE
while
                                                                                                                                                                                                                                                                                                                                                                                                                                                                         tests/001/answer-fast.txt                                                                           000644  000765  000024  00000000261 13671454235 015606  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "fast", "token_types": ["SYMBOL", "WSPACE"]},
 [
  [0, "foo"],
  [1, " "],
  [0, "bar"],
  [1, "   "],
  [0, "baz"],
  [1, "\n"]
 ]
]
                                                                                                                                                                                                                                                                                                                                               tests/001/input.txt                                                                                 000644  000765  000024  00000000016 13671454235 014511  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         foo bar   baz
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  tests/001/answer.txt                                                                                000644  000765  000024  00000000642 13671454235 014656  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "tokens"},
 [
  {"type": "token", "token_type": "SYMBOL", "text": "foo"},
  {"type": "token", "token_type": "WSPACE", "text": " "},
  {"type": "token", "token_type": "SYMBOL", "text": "bar"},
  {"type": "token", "token_type": "WSPACE", "text": "   "},
  {"type": "token", "token_type": "SYMBOL", "text": "baz"},
  {"type": "token", "token_type": "WSPACE", "text": "\n"}
 ]
]
                                                                                              tests/001/tokendefs.txt                                                                             000644  000765  000024  00000000031 13672231132 015317  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         SYMBOL
[a-z]+
WSPACE
\s+
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       tests/007/answer-fast.txt                                                                           000644  000765  000024  00000000541 13672475070 015616  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "fast", "token_types": ["SYMBOL", "NUMBER",
    "OPAREN", "CPAREN", "COLON", "NEWLINE", "WSPACE", "DEF", "RETURN",
    "INDENT", "DEDENT"]
 },
 [
  [7, "def"],
  [6, " "],
  [0, "five"],
  [2, "("],
  [3, ")"],
  [4, ":"],
  [5, "\n"],
  [9, "    "],
  [8, "return"],
  [6, " "],
  [1, "5"],
  [5, "\n"],
  [10, ""]
 ]
]
                                                                                                                                                               tests/007/input.txt                                                                                 000644  000765  000024  00000000031 13672236133 014510  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         def five():
    return 5
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       tests/007/answer.txt                                                                                000644  000765  000024  00000001460 13672473704 014666  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "tokens"},
 [
  {"type": "token", "token_type": "DEF", "text": "def"},
  {"type": "token", "token_type": "WSPACE", "text": " "},
  {"type": "token", "token_type": "SYMBOL", "text": "five"},
  {"type": "token", "token_type": "OPAREN", "text": "("},
  {"type": "token", "token_type": "CPAREN", "text": ")"},
  {"type": "token", "token_type": "COLON", "text": ":"},
  {"type": "token", "token_type": "NEWLINE", "text": "\n"},
  {"type": "token", "token_type": "INDENT", "text": "    "},
  {"type": "token", "token_type": "RETURN", "text": "return"},
  {"type": "token", "token_type": "WSPACE", "text": " "},
  {"type": "token", "token_type": "NUMBER", "text": "5"},
  {"type": "token", "token_type": "NEWLINE", "text": "\n"},
  {"type": "token", "token_type": "DEDENT", "text": ""}
 ]
]
                                                                                                                                                                                                                tests/007/tokendefs.txt                                                                             000644  000765  000024  00000000224 13672470576 015352  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         #pragma offside-rule 4 spaces
SYMBOL
[a-z]+
NUMBER
[0-9]+
OPAREN
\(
CPAREN
\)
COLON
:
NEWLINE
\n
WSPACE
\s+

#pragma keywords
DEF
def
RETURN
return
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment