Skip to content

Instantly share code, notes, and snippets.

@cellularmitosis
Last active March 29, 2023 01:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cellularmitosis/117f809868535de8f8ab4d69477e29d2 to your computer and use it in GitHub Desktop.
Save cellularmitosis/117f809868535de8f8ab4d69477e29d2 to your computer and use it in GitHub Desktop.
mklexer.py part 4: ditch refine in favor of keywords pragma

Blog 2020/6/16

<- previous | index | next ->

mklexer.py part 4: ditch refine in favor of keywords pragma

See also part 1, 2, 3, 5

In part 3, I introduced the refine pragma, which is a powerful, general solution. However, after thinking about it a bit more, I realized I likely won't need the power of arbitrary regexes in the refine step. Most likely, I will always use refine to match against static strings:

#pragma keywords
WHILE
while
BREAK
break
CONTINUE
continue

If this is the case, we can avoid looping over the list of refine regexes for each token, and instead just perform a dictionary lookup. This is faster and simplifies the implementation. If we don't need the extra power of regexes, better to give them up!

Note that we can also match multiple keywords to a single token type, e.g.:

#pragma keywords
BOOL_LITERAL
true
BOOL_LITERAL
false

This was a technique I used in An alternative syntax for C.

Example: keywords pragma

tokendefs.txt:

SYMBOL
[a-z]+
WSPACE
\s+

#pragma keywords
WHILE
while
BREAK
break
CONTINUE
continue

input.txt:

whilezzz while

Result:

$ ./mklexer.py tokendefs.txt > lexer.py
$ chmod +x lexer.py
$ lexer.py input.txt | jq .
[
  {
    "type": "format",
    "format": "tokens"
  },
  [
    {
      "token_type": "SYMBOL",
      "text": "whilezzz",
      "type": "token"
    },
    {
      "token_type": "WSPACE",
      "text": " ",
      "type": "token"
    },
    {
      "token_type": "WHILE",
      "text": "while",
      "type": "token"
    },
    {
      "token_type": "WSPACE",
      "text": "\n",
      "type": "token"
    }
  ]
]
#!/usr/bin/env python
import base64
import sys
import os
infile = sys.argv[1]
outfile = os.path.basename(infile).rstrip('.base64')
s64 = open(infile,'rb').read()
s = base64.b64decode(s64)
open(outfile,'wb').write(s)
#!/usr/bin/env python
import base64
import sys
import os
infile = sys.argv[1]
outfile = os.path.basename(infile) + ".base64"
s = open(infile,'rb').read()
s64 = base64.b64encode(s)
open(outfile,'wb').write(s64)
#!/usr/bin/env python
# mklexer.py: generate a Python lexer based on a token definitions file.
# See https://gist.github.com/cellularmitosis/1da62db09d41703c5a505d0bac9d9056
# Copyright (c) 2020 Jason Pepas
# Released under the terms of the MIT license.
# See https://opensource.org/licenses/MIT
import sys
import os
import pprint
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
def usage(fd):
"""Prints the usage help to the given file descriptor."""
exe = os.path.basename(sys.argv[0])
w = fd.write
if fd is sys.stderr:
w("Error: bad usage.\n")
w("\n")
else:
w("%s: generate a Python lexer based on a token definitions file.\n" % exe)
w("\n")
w("Display help:\n")
w(" %s -h\n" % exe)
w(" %s --help\n" % exe)
w("\n")
w("Generate a lexer using tokendefs.txt:\n")
w(" %s tokendefs.txt > lexer.py\n" % exe)
w(" chmod +x lexer.py\n")
w("""
tokendefs.txt consists of pairs of TOKENTYPE and <regex> lines.
Example tokendefs.txt:
NUMBER
-?\\d+(\\.\\d+)?
SYMBOL
[a-zA-Z_][a-zA-Z0-9_-]*
Use the lexer on input.txt, producing the standard JSON token format:
./lexer.py input.txt | jq .
Two example tokens in standard JSON format:
{'type': 'TOKEN', 'token_type': 'NUMBER', 'text': '3.14159'}
{'type': 'TOKEN', 'token_type': 'SYMBOL', 'text': 'fibonacci'}
Use the lexer on input.txt, producing "fast" array-based JSON tokens:
./lexer.py --fast input.txt | jq .
"fast" tokens are [<token type index>, <matched text>] pairs.
The same example tokens, but in 'fast' JSON format:
[0, '3.14159']
[1, 'fibonacci']
tokendefs.txt may also contain #pragma's: line-oriented, discard, eof, refine.
""")
#
# Parsing.
#
def parse_tokendefs(lines):
"""Parses the token definitions file, stopping at the first 'refine' pragma."""
tokendefs = []
pragmas = {}
i = 0
while i < len(lines):
line1 = lines[i]
if len(line1) == 0:
# skip blank lines.
i += 1
continue
words = line1.split()
if words[0] == '#pragma':
if len(words) == 1:
raise Exception("Can't parse pragma: %s" % line1)
pragma_name = words[1]
if pragma_name in ['line-oriented', 'eof']:
pragmas[pragma_name] = True
i += 1
continue
elif pragma_name == 'discard':
discardable_token_types = words[2:]
if len(discardable_token_types) == 0:
raise Exception("Discard pragma with no token types listed")
pragmas[pragma_name] = discardable_token_types
i += 1
continue
elif pragma_name == 'keywords':
# this marks the start of the keywords section.
break
else:
raise Exception("Unknown pragma '%s'" % pragma_name)
elif line1[0] == '#':
# skip comments.
i += 1
continue
i += 1
if i >= len(lines):
raise Exception(
"Line %d: Token type '%s' has no corresponding regex." \
% (i, tokentype)
)
line2 = lines[i]
if len(line2) == 0:
raise Exception("Line %d: Zero-length regex." % i+1)
tokentype = line1
regex = line2
pair = (tokentype, regex)
tokendefs.append(pair)
i += 1
continue
remaining_lines = lines[i:]
(keyword_maps, toktypes) = parse_keywords(remaining_lines, tokendefs)
if 'eof' in pragmas.keys():
toktypes.append('EOF')
return (pragmas, toktypes, tokendefs, keyword_maps)
def parse_keywords(lines, tokendefs):
"""Parses the section starting at the keywords pragma."""
keywords_map = {}
keywords_map_fast = {}
toktypes = [pair[0] for pair in tokendefs]
if len(lines) == 0:
keyword_maps = (keywords_map, keywords_map_fast)
return (keyword_maps, toktypes)
assert lines[0].split()[0] == '#pragma' and lines[0].split()[1] == 'keywords'
if len(lines) == 1:
raise Exception("Keywords pragma found, but no keywords defined?")
i = 1
while i < len(lines):
line1 = lines[i]
if len(line1) == 0:
# skip blank lines
i += 1
continue
elif line1.split()[0] == '#pragma':
raise Exception("Pragmas not allowed within keywords section.")
elif line1.startswith('#'):
# skip comments.
i += 1
continue
i += 1
if i >= len(lines):
raise Exception(
"Line %d: Token type '%s' has no corresponding keyword." \
% (i, tokentype)
)
line2 = lines[i]
if len(line2) == 0:
raise Exception("Line %d: Zero-length keyword." % i+1)
tokentype = line1
keyword = line2
keywords_map[keyword] = tokentype
toktypes.append(tokentype)
i += 1
continue
for keyword, toktype in keywords_map.items():
keywords_map_fast[keyword] = toktypes.index(toktype)
keyword_maps = (keywords_map, keywords_map_fast)
return (keyword_maps, toktypes)
#
# Code generation.
#
def codegen_pragmas(pragmas):
"""Generates the Python code for pragmas."""
fd = StringIO()
w = fd.write
is_line_oriented = 'line-oriented' in pragmas.keys()
w("pragma_line_oriented = %s\n" % is_line_oriented)
has_eof_pragma = 'eof' in pragmas.keys()
w("pragma_eof = %s\n" % has_eof_pragma)
if 'discard' in pragmas.keys():
toktypes_string = '[%s]' % ','.join(
[("'%s'" % token_type) for token_type in pragmas['discard']]
)
w("pragma_discard = %s\n" % toktypes_string)
else:
w("pragma_discard = []\n")
w('\n')
code = fd.getvalue()
fd.close()
return code
def codegen_toktypes(toktypes):
"""Generates the Python code of the toktypes array."""
assert len(toktypes) > 0, toktypes
reprs = [toktype.__repr__() for toktype in toktypes]
if len(reprs) == 1:
return "toktypes = [%s]\n\n" % reprs[0]
lines = []
linebuf = "toktypes = [%s" % reprs[0]
for r in reprs[1:]:
s = ', ' + r
if len(linebuf) + len(s) < 80:
linebuf += s
continue
else:
linebuf += ','
lines.append(linebuf)
linebuf = ' ' + r
if len(lines) == 0:
linebuf += ']\n\n'
lines.append(linebuf)
else:
lines.append(linebuf)
lines.append(']\n\n')
return '\n'.join(lines)
def codegen_tokendefs(tokendefs):
"""Generates the Python code of the tokendefs table."""
def codegen_regex(regex_text):
"""Generates the Python code of a regex."""
# do everything we can to avoid the backslash plague.
if "'" not in regex_text:
return "r'%s'" % regex_text
elif '"' not in regex_text:
return 'r"%s"' % regex_text
elif "'''" not in regex_text and not regex_text.startswith("'") and not regex_text.endswith("'"):
return "r'''%s'''" % regex_text
elif '"""' not in regex_text and not regex_text.startswith('"') and not regex_text.endswith('"'):
return 'r"""%s"""' % regex_text
else:
# oh well, at least we tried :shrug:
return regex_text.__repr__()
fd = StringIO()
w = fd.write
w("tokendefs = [\n")
for token_type, regex in tokendefs:
w(" ['%s', %s],\n" % (token_type, codegen_regex(regex)))
w("]\n\n")
code = fd.getvalue()
fd.close()
return code
def codegen_keyword_maps(keyword_maps):
"""Generates the Python code of the keyword maps."""
(keywords_map, keywords_map_fast) = keyword_maps
s1 = 'keywords_map = '
s1 += pprint.pformat(keywords_map, width=80-len(s1)) + '\n\n'
s2 = 'keywords_map_fast = '
s2 += pprint.pformat(keywords_map_fast, width=80-len(s2)) + '\n\n'
return s1 + s2
def codegen(pragmas, toktypes, tokendefs, keyword_maps):
"""Generates the Python code of the lexer."""
fd = StringIO()
w = fd.write
w("""#!/usr/bin/env python
# DO NOT EDIT: this lexer was generated by mklexer.py.
import sys
import re
import json
""")
pragmas_code = codegen_pragmas(pragmas)
w(pragmas_code)
toktypes_code = codegen_toktypes(toktypes)
w(toktypes_code)
tokendefs_code = codegen_tokendefs(tokendefs)
w(tokendefs_code)
keyword_maps_code = codegen_keyword_maps(keyword_maps)
w(keyword_maps_code)
w("""
def compile_regexes():
\"\"\"Compile the regexes.\"\"\"
for pair in tokendefs:
pair[1] = re.compile(pair[1])
compile_regexes()
def get_linenum_charnum(text, offset):
\"\"\"Returns the line number and character number of the offset.\"\"\"
linenum = 1
charnum = 1
i = 0
while i < offset:
if text[i] == '\\n':
linenum += 1
charnum = 1
i += 1
continue
else:
charnum += 1
i += 1
continue
return (linenum, charnum)
def consume_next_token(text, offset, use_fast_format):
\"\"\"Consumes next token from the given text input.
Returns a (token, offset) pair.
Throws if no tokens match.\"\"\"
for i, pair in enumerate(tokendefs):
(token_type, regex) = pair
m = regex.match(text, offset)
if m is None:
continue
matched_text = m.group()
if use_fast_format:
toktype_index = i
if matched_text in keywords_map_fast:
toktype_index = keywords_map_fast[matched_text]
token = [toktype_index, matched_text]
else:
if matched_text in keywords_map:
token_type = keywords_map[matched_text]
token = {
'type': 'token',
'token_type': token_type,
'text': matched_text,
}
new_offset = offset + len(matched_text)
return (token, new_offset)
# none of the token types matched
(linenum, charnum) = get_linenum_charnum(text, offset)
raise Exception(
"Can't lex starting at line %d, character %d, context: '%s'" \\
% (linenum, charnum, text[offset:offset+32])
)
def discard_tokens(tokens, use_fast_format):
\"\"\"Discards any tokens specified by the 'discard' pragma.\"\"\"
def make_discard_set():
if use_fast_format:
discard_set = set()
for i, pair in enumerate(tokendefs):
(token_type, _) = pair
if token_type in pragma_discard:
discard_set.add(i)
continue
else:
discard_set = set(pragma_discard)
return discard_set
discard_set = make_discard_set()
kept_tokens = []
for token in tokens:
if use_fast_format:
toktype = token[0]
else:
toktype = token['token_type']
if toktype not in discard_set:
kept_tokens.append(token)
continue
return kept_tokens
def make_lines(tokens, use_fast_format):
\"\"\"Return a line-oriented array-of-arrays from the given tokens.\"\"\"
lines = []
line = []
for token in tokens:
if use_fast_format:
text = token[1]
else:
text = token['text']
if text == '\\n':
lines.append(line)
line = []
continue
else:
line.append(token)
continue
if len(line) > 0:
lines.append(line)
return lines
def lex(text, use_fast_format):
\"\"\"Returns a list of tokens for the given text input.\"\"\"
tokens = []
offset = 0
while offset < len(text):
(token, offset) = consume_next_token(text, offset, use_fast_format)
tokens.append(token)
continue
tokens = discard_tokens(tokens, use_fast_format)
if pragma_line_oriented:
tokens = make_lines(tokens, use_fast_format)
if pragma_eof:
if use_fast_format:
eof_token = [len(toktypes)-1, ""]
else:
eof_token = {
"type": "token",
"token_type": "EOF",
"text": ""
}
if pragma_line_oriented:
tokens.append([eof_token])
else:
tokens.append(eof_token)
format_dict = {'type': 'format'}
if use_fast_format:
if pragma_line_oriented:
format_dict['format'] = 'fast-lines'
else:
format_dict['format'] = 'fast'
format_dict['token_types'] = toktypes
else:
if pragma_line_oriented:
format_dict['format'] = 'tokens-lines'
else:
format_dict['format'] = 'tokens'
json_obj = [format_dict, tokens]
return json_obj
if __name__ == '__main__':
infile = [arg for arg in sys.argv[1:] if not arg.startswith('-')][-1]
use_fast_format = False
if '--fast' in sys.argv[1:]:
use_fast_format = True
fd = open(infile, 'r')
text = fd.read()
fd.close()
json_obj = lex(text, use_fast_format)
output = json.dumps(json_obj)
if not output.endswith('\\n'):
output += '\\n'
sys.stdout.write(output)
""")
code = fd.getvalue()
fd.close()
return code
if __name__ == "__main__":
if len(sys.argv) < 2:
usage(sys.stderr)
sys.exit(1)
if '-h' in sys.argv or '--help' in sys.argv:
usage(sys.stdout)
sys.exit(0)
# the last non-option arg is the tokendefs file.
tokendefs_fpath = None
non_option_args = [arg for arg in sys.argv[1:] if not arg.startswith('-')]
if len(non_option_args) != 1:
usage(sys.stderr)
sys.exit(1)
tokendefs_fpath = non_option_args[0]
fd = open(tokendefs_fpath, 'r')
tokendefs_lines = fd.read().splitlines()
fd.close()
(pragmas, toktypes, tokendefs, keyword_maps) = parse_tokendefs(tokendefs_lines)
code = codegen(pragmas, toktypes, tokendefs, keyword_maps)
sys.stdout.write(code)
#!/bin/bash
set -e
set -o pipefail
shopt -s expand_aliases
alias canonicaljson="python -c 'import json, sys; sys.stdout.write(json.dumps(json.load(sys.stdin), sort_keys=True))'"
if ! test -d tests
then
rm -f tests.tar
python b64dec.py tests.tar.base64
cat tests.tar | tar x
rm -f tests.tar
fi
for d in $( ls tests )
do
echo "Test $d"
cd tests/$d
../../mklexer.py tokendefs.txt > /tmp/lexer.py
echo " Python 2"
cat answer.txt | canonicaljson > /tmp/1
python2 /tmp/lexer.py input.txt | canonicaljson > /tmp/2
diff -q /tmp/1 /tmp/2
echo " Python 3"
python3 /tmp/lexer.py input.txt | canonicaljson > /tmp/2
diff -q /tmp/1 /tmp/2
echo " Python 2, --fast format"
cat answer-fast.txt | canonicaljson > /tmp/1
python2 /tmp/lexer.py --fast input.txt | canonicaljson > /tmp/2
diff -q /tmp/1 /tmp/2
echo " Python 3, --fast format"
python3 /tmp/lexer.py --fast input.txt | canonicaljson > /tmp/2
diff -q /tmp/1 /tmp/2
rm -f /tmp/lexer.py /tmp/1 /tmp/2
cd ../..
done
tests/                                                                                              000755  000765  000024  00000000000 13672231132 012302  5                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         tests/001/                                                                                          000755  000765  000024  00000000000 13672231132 012602  5                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         tests/006/                                                                                          000755  000765  000024  00000000000 13672232034 012611  5                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         tests/003/                                                                                          000755  000765  000024  00000000000 13672231132 012604  5                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         tests/004/                                                                                          000755  000765  000024  00000000000 13672231132 012605  5                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         tests/005/                                                                                          000755  000765  000024  00000000000 13672231132 012606  5                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         tests/002/                                                                                          000755  000765  000024  00000000000 13672231132 012603  5                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         tests/002/answer-fast.txt                                                                           000644  000765  000024  00000000320 13671454235 015603  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "fast-lines", "token_types": ["SYMBOL", "WSPACE"]},
 [
  [
   [0, "foo"],
   [1, " "],
   [0, "bar"]
  ],
  [
   [0, "bin"],
   [1, " "],
   [0, "baz"]
  ]
 ]
]
                                                                                                                                                                                                                                                                                                                tests/002/input.txt                                                                                 000644  000765  000024  00000000020 13671454235 014505  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         foo bar
bin baz
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                tests/002/answer.txt                                                                                000644  000765  000024  00000000701 13671454235 014653  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "tokens-lines"},
 [
  [
   {"type": "token", "token_type": "SYMBOL", "text": "foo"},
   {"type": "token", "token_type": "WSPACE", "text": " "},
   {"type": "token", "token_type": "SYMBOL", "text": "bar"}
  ],
  [
   {"type": "token", "token_type": "SYMBOL", "text": "bin"},
   {"type": "token", "token_type": "WSPACE", "text": " "},
   {"type": "token", "token_type": "SYMBOL", "text": "baz"}
  ]
 ]
]
                                                               tests/002/tokendefs.txt                                                                             000644  000765  000024  00000000057 13672231132 015330  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         #pragma line-oriented
SYMBOL
[a-z]+
WSPACE
\s+
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 tests/005/answer-fast.txt                                                                           000644  000765  000024  00000000320 13671454235 015606  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "fast-lines", "token_types": ["SYMBOL", "WSPACE", "EOF"]},
 [
  [
   [0, "foo"],
   [1, " "],
   [0, "bar"]
  ],
  [
   [0, "baz"]
  ],
  [
   [2, ""]
  ]
 ]
]
                                                                                                                                                                                                                                                                                                                tests/005/input.txt                                                                                 000644  000765  000024  00000000014 13671454235 014513  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         foo bar
baz
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    tests/005/answer.txt                                                                                000644  000765  000024  00000000611 13671454235 014656  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "tokens-lines"},
 [
  [
   {"type": "token", "token_type": "SYMBOL", "text": "foo"},
   {"type": "token", "token_type": "WSPACE", "text": " "},
   {"type": "token", "token_type": "SYMBOL", "text": "bar"}
  ],
  [
   {"type": "token", "token_type": "SYMBOL", "text": "baz"}
  ],
  [
   {"type": "token", "token_type": "EOF", "text": ""}
  ]
 ]
]
                                                                                                                       tests/005/tokendefs.txt                                                                             000644  000765  000024  00000000073 13672231132 015331  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         #pragma line-oriented
#pragma eof
SYMBOL
[a-z]+
WSPACE
\s+
                                                                                                                                                                                                                                                                                                                                                                                                                                                                     tests/004/answer-fast.txt                                                                           000644  000765  000024  00000000304 13671454235 015607  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "fast", "token_types": ["SYMBOL", "WSPACE", "EOF"]},
 [
  [0, "foo"],
  [1, " "],
  [0, "bar"],
  [1, "   "],
  [0, "baz"],
  [1, "\n"],
  [2, ""]
 ]
]
                                                                                                                                                                                                                                                                                                                            tests/004/input.txt                                                                                 000644  000765  000024  00000000016 13671454235 014514  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         foo bar   baz
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  tests/004/answer.txt                                                                                000644  000765  000024  00000000731 13671454235 014660  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "tokens"},
 [
  {"type": "token", "token_type": "SYMBOL", "text": "foo"},
  {"type": "token", "token_type": "WSPACE", "text": " "},
  {"type": "token", "token_type": "SYMBOL", "text": "bar"},
  {"type": "token", "token_type": "WSPACE", "text": "   "},
  {"type": "token", "token_type": "SYMBOL", "text": "baz"},
  {"type": "token", "token_type": "WSPACE", "text": "\n"},
  {"type": "token", "token_type": "EOF", "text": ""}
 ]
]
                                       tests/004/tokendefs.txt                                                                             000644  000765  000024  00000000045 13672231132 015327  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         #pragma eof
SYMBOL
[a-z]+
WSPACE
\s+
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           tests/003/answer-fast.txt                                                                           000644  000765  000024  00000000202 13671454235 015603  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "fast", "token_types": ["SYMBOL", "NUMBER", "WSPACE"]},
 [
  [0, "foo"],
  [0, "bar"]
 ]
]
                                                                                                                                                                                                                                                                                                                                                                                              tests/003/input.txt                                                                                 000644  000765  000024  00000000014 13671454235 014511  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         foo 123 bar
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    tests/003/answer.txt                                                                                000644  000765  000024  00000000261 13671454235 014655  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "tokens"},
 [
  {"type": "token", "token_type": "SYMBOL", "text": "foo"},
  {"type": "token", "token_type": "SYMBOL", "text": "bar"}
 ]
]
                                                                                                                                                                                                                                                                                                                                               tests/003/tokendefs.txt                                                                             000644  000765  000024  00000000105 13672231132 015323  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         #pragma discard WSPACE NUMBER
SYMBOL
[a-z]+
NUMBER
[0-9]+
WSPACE
\s+
                                                                                                                                                                                                                                                                                                                                                                                                                                                           tests/006/answer-fast.txt                                                                           000644  000765  000024  00000000255 13672231763 015616  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "fast", "token_types": [
     "SYMBOL", "WSPACE", "WHILE"
 ]},
 [
  [0, "whilezzz"],
  [1, " "],
  [2, "while"],
  [1, "\n"]
 ]
]
                                                                                                                                                                                                                                                                                                                                                   tests/006/input.txt                                                                                 000644  000765  000024  00000000017 13672231563 014515  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         whilezzz while
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 tests/006/answer.txt                                                                                000644  000765  000024  00000000456 13672231744 014665  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "tokens"},
 [
  {"type": "token", "token_type": "SYMBOL", "text": "whilezzz"},
  {"type": "token", "token_type": "WSPACE", "text": " "},
  {"type": "token", "token_type": "WHILE", "text": "while"},
  {"type": "token", "token_type": "WSPACE", "text": "\n"}
 ]
]
                                                                                                                                                                                                                  tests/006/tokendefs.txt                                                                             000644  000765  000024  00000000067 13672232034 015337  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         SYMBOL
[a-z]+
WSPACE
\s+

#pragma keywords
WHILE
while
                                                                                                                                                                                                                                                                                                                                                                                                                                                                         tests/001/answer-fast.txt                                                                           000644  000765  000024  00000000261 13671454235 015606  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "fast", "token_types": ["SYMBOL", "WSPACE"]},
 [
  [0, "foo"],
  [1, " "],
  [0, "bar"],
  [1, "   "],
  [0, "baz"],
  [1, "\n"]
 ]
]
                                                                                                                                                                                                                                                                                                                                               tests/001/input.txt                                                                                 000644  000765  000024  00000000016 13671454235 014511  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         foo bar   baz
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  tests/001/answer.txt                                                                                000644  000765  000024  00000000642 13671454235 014656  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         [
 {"type": "format", "format": "tokens"},
 [
  {"type": "token", "token_type": "SYMBOL", "text": "foo"},
  {"type": "token", "token_type": "WSPACE", "text": " "},
  {"type": "token", "token_type": "SYMBOL", "text": "bar"},
  {"type": "token", "token_type": "WSPACE", "text": "   "},
  {"type": "token", "token_type": "SYMBOL", "text": "baz"},
  {"type": "token", "token_type": "WSPACE", "text": "\n"}
 ]
]
                                                                                              tests/001/tokendefs.txt                                                                             000644  000765  000024  00000000031 13672231132 015317  0                                                                                                    ustar 00cell                            staff                           000000  000000                                                                                                                                                                         SYMBOL
[a-z]+
WSPACE
\s+
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment