#!/usr/bin/env python

# mklexer.py: generate a Python lexer based on a token definitions file.
# See https://gist.github.com/cellularmitosis/1da62db09d41703c5a505d0bac9d9056

# Copyright (c) 2020 Jason Pepas
# Released under the terms of the MIT license.
# See https://opensource.org/licenses/MIT

import sys
import os
import pprint

try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO

def usage(fd):
    """Prints the usage help to the given file object."""
    exe = os.path.basename(sys.argv[0])
    w = fd.write
    if fd is sys.stderr:
        w("Error: bad usage.\n")
        w("\n")
    else:
        w("%s: generate a Python lexer based on a token definitions file.\n" % exe)
        w("\n")
    w("Display help:\n")
    w(" %s -h\n" % exe)
    w(" %s --help\n" % exe)
    w("\n")
    w("Generate a lexer using tokendefs.txt:\n")
    w(" %s tokendefs.txt > lexer.py\n" % exe)
    w(" chmod +x lexer.py\n")
    w("""
tokendefs.txt consists of pairs of TOKENTYPE and <regex> lines.

Example tokendefs.txt:
NUMBER
-?\\d+(\\.\\d+)?
SYMBOL
[a-zA-Z_][a-zA-Z0-9_-]*

Use the lexer on input.txt, producing the standard JSON token format:
./lexer.py input.txt | jq .

Two example tokens in standard JSON format:
{"type": "token", "token_type": "NUMBER", "text": "3.14159"}
{"type": "token", "token_type": "SYMBOL", "text": "fibonacci"}

Use the lexer on input.txt, producing "fast" array-based JSON tokens:
./lexer.py --fast input.txt | jq .

"fast" tokens are [<token type index>, <matched text>] pairs.

The same example tokens, but in 'fast' JSON format:
[0, "3.14159"]
[1, "fibonacci"]

tokendefs.txt may also contain #pragmas: line-oriented, discard, eof, keywords.
""")

#
# Parsing.
#

def parse_tokendefs(lines):
    """Parses the token definitions file, stopping at the 'keywords' pragma."""
    tokendefs = []
    pragmas = {}
    i = 0
    while i < len(lines):
        line1 = lines[i]
        if len(line1) == 0:
            # skip blank lines.
            i += 1
            continue

        words = line1.split()
        if words[0] == '#pragma':
            if len(words) == 1:
                raise Exception("Can't parse pragma: %s" % line1)
            pragma_name = words[1]
            if pragma_name in ['line-oriented', 'eof']:
                pragmas[pragma_name] = True
                i += 1
                continue
            elif pragma_name == 'discard':
                discardable_token_types = words[2:]
                if len(discardable_token_types) == 0:
                    raise Exception("Discard pragma with no token types listed")
                pragmas[pragma_name] = discardable_token_types
                i += 1
                continue
            elif pragma_name == 'keywords':
                # this marks the start of the keywords section.
                break
            else:
                raise Exception("Unknown pragma '%s'" % pragma_name)
        elif line1[0] == '#':
            # skip comments.
            i += 1
            continue

        i += 1
        if i >= len(lines):
            raise Exception(
                "Line %d: Token type '%s' has no corresponding regex." \
                    % (i, line1)
            )
        line2 = lines[i]
        if len(line2) == 0:
            raise Exception("Line %d: Zero-length regex." % (i + 1))

        tokentype = line1
        regex = line2
        pair = (tokentype, regex)
        tokendefs.append(pair)
        i += 1
        continue

    remaining_lines = lines[i:]
    (keyword_maps, toktypes) = parse_keywords(remaining_lines, tokendefs)
    if 'eof' in pragmas.keys():
        toktypes.append('EOF')
    return (pragmas, toktypes, tokendefs, keyword_maps)
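
# Illustrative example (token names chosen here, not from the gist): given a
# small definitions list in the format described by usage(), parse_tokendefs()
# returns a (pragmas, toktypes, tokendefs, keyword_maps) tuple:
#
#   lines = [
#       '#pragma discard SPACE',
#       'NUMBER',
#       '[0-9]+',
#       'SPACE',
#       ' +',
#   ]
#   parse_tokendefs(lines)
#   # => ({'discard': ['SPACE']},
#   #     ['NUMBER', 'SPACE'],
#   #     [('NUMBER', '[0-9]+'), ('SPACE', ' +')],
#   #     ({}, {}))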

def parse_keywords(lines, tokendefs):
    """Parses the section starting at the keywords pragma."""
    keywords_map = {}
    keywords_map_fast = {}
    toktypes = [pair[0] for pair in tokendefs]
    if len(lines) == 0:
        keyword_maps = (keywords_map, keywords_map_fast)
        return (keyword_maps, toktypes)
    assert lines[0].split()[0] == '#pragma' and lines[0].split()[1] == 'keywords'
    if len(lines) == 1:
        raise Exception("Keywords pragma found, but no keywords defined?")
    i = 1
    while i < len(lines):
        line1 = lines[i]
        if len(line1) == 0:
            # skip blank lines.
            i += 1
            continue
        elif line1.split()[0] == '#pragma':
            raise Exception("Pragmas not allowed within keywords section.")
        elif line1.startswith('#'):
            # skip comments.
            i += 1
            continue
        i += 1
        if i >= len(lines):
            raise Exception(
                "Line %d: Token type '%s' has no corresponding keyword." \
                    % (i, line1)
            )
        line2 = lines[i]
        if len(line2) == 0:
            raise Exception("Line %d: Zero-length keyword." % (i + 1))
        tokentype = line1
        keyword = line2
        keywords_map[keyword] = tokentype
        toktypes.append(tokentype)
        i += 1
        continue
    for keyword, toktype in keywords_map.items():
        keywords_map_fast[keyword] = toktypes.index(toktype)
    keyword_maps = (keywords_map, keywords_map_fast)
    return (keyword_maps, toktypes)
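
# Illustrative example: keywords piggyback on an existing token type's regex
# (e.g. SYMBOL) and are re-tagged by the generated lexer after matching.
#
#   parse_keywords(['#pragma keywords', 'IF', 'if', 'WHILE', 'while'],
#                  [('SYMBOL', '[a-z]+')])
#   # => ((
#   #       {'if': 'IF', 'while': 'WHILE'},   # keywords_map
#   #       {'if': 1, 'while': 2},            # keywords_map_fast
#   #     ),
#   #     ['SYMBOL', 'IF', 'WHILE'])          # toktypes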

#
# Code generation.
#

def codegen_pragmas(pragmas):
    """Generates the Python code for pragmas."""
    fd = StringIO()
    w = fd.write
    is_line_oriented = 'line-oriented' in pragmas.keys()
    w("pragma_line_oriented = %s\n" % is_line_oriented)
    has_eof_pragma = 'eof' in pragmas.keys()
    w("pragma_eof = %s\n" % has_eof_pragma)
    if 'discard' in pragmas.keys():
        toktypes_string = '[%s]' % ','.join(
            [("'%s'" % token_type) for token_type in pragmas['discard']]
        )
        w("pragma_discard = %s\n" % toktypes_string)
    else:
        w("pragma_discard = []\n")
    w('\n')
    code = fd.getvalue()
    fd.close()
    return code
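
# Illustrative output of codegen_pragmas():
#
#   codegen_pragmas({'line-oriented': True, 'discard': ['SPACE']})
#   # produces:
#   #   pragma_line_oriented = True
#   #   pragma_eof = False
#   #   pragma_discard = ['SPACE']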

def codegen_toktypes(toktypes):
    """Generates the Python code of the toktypes array."""
    assert len(toktypes) > 0, toktypes
    reprs = [repr(toktype) for toktype in toktypes]
    if len(reprs) == 1:
        return "toktypes = [%s]\n\n" % reprs[0]
    lines = []
    linebuf = "toktypes = [%s" % reprs[0]
    for r in reprs[1:]:
        s = ', ' + r
        if len(linebuf) + len(s) < 80:
            linebuf += s
            continue
        else:
            linebuf += ','
            lines.append(linebuf)
            linebuf = ' ' + r
    if len(lines) == 0:
        linebuf += ']\n\n'
        lines.append(linebuf)
    else:
        lines.append(linebuf)
        lines.append(']\n\n')
    return '\n'.join(lines)
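
# Illustrative output of codegen_toktypes(); longer lists are wrapped to keep
# the generated line under 80 columns.
#
#   codegen_toktypes(['NUMBER', 'SPACE'])
#   # => "toktypes = ['NUMBER', 'SPACE']\n\n"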

def codegen_tokendefs(tokendefs):
    """Generates the Python code of the tokendefs table."""
    def codegen_regex(regex_text):
        """Generates the Python code of a regex."""
        # do everything we can to avoid the backslash plague.
        if "'" not in regex_text:
            return "r'%s'" % regex_text
        elif '"' not in regex_text:
            return 'r"%s"' % regex_text
        elif "'''" not in regex_text and not regex_text.startswith("'") and not regex_text.endswith("'"):
            return "r'''%s'''" % regex_text
        elif '"""' not in regex_text and not regex_text.startswith('"') and not regex_text.endswith('"'):
            return 'r"""%s"""' % regex_text
        else:
            # oh well, at least we tried :shrug:
            return repr(regex_text)

    fd = StringIO()
    w = fd.write
    w("tokendefs = [\n")
    for token_type, regex in tokendefs:
        w(" ['%s', %s],\n" % (token_type, codegen_regex(regex)))
    w("]\n\n")
    code = fd.getvalue()
    fd.close()
    return code
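
# Illustrative output of codegen_tokendefs(); codegen_regex() picks whichever
# raw-string quoting style avoids escaping the regex.
#
#   codegen_tokendefs([('NUMBER', '[0-9]+'), ('STRING', "'[^']*'")])
#   # produces:
#   #   tokendefs = [
#   #    ['NUMBER', r'[0-9]+'],
#   #    ['STRING', r"'[^']*'"],
#   #   ]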

def codegen_keyword_maps(keyword_maps):
    """Generates the Python code of the keyword maps."""
    (keywords_map, keywords_map_fast) = keyword_maps
    s1 = 'keywords_map = '
    s1 += pprint.pformat(keywords_map, width=80-len(s1)) + '\n\n'
    s2 = 'keywords_map_fast = '
    s2 += pprint.pformat(keywords_map_fast, width=80-len(s2)) + '\n\n'
    return s1 + s2
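
# Illustrative output of codegen_keyword_maps():
#
#   codegen_keyword_maps(({'if': 'IF', 'while': 'WHILE'}, {'if': 1, 'while': 2}))
#   # produces:
#   #   keywords_map = {'if': 'IF', 'while': 'WHILE'}
#   #
#   #   keywords_map_fast = {'if': 1, 'while': 2}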

def codegen(pragmas, toktypes, tokendefs, keyword_maps):
    """Generates the Python code of the lexer."""
    fd = StringIO()
    w = fd.write
    w("""#!/usr/bin/env python

# DO NOT EDIT: this lexer was generated by mklexer.py.

import sys
import re
import json

""")
    pragmas_code = codegen_pragmas(pragmas)
    w(pragmas_code)
    toktypes_code = codegen_toktypes(toktypes)
    w(toktypes_code)
    tokendefs_code = codegen_tokendefs(tokendefs)
    w(tokendefs_code)
    keyword_maps_code = codegen_keyword_maps(keyword_maps)
    w(keyword_maps_code)
    w("""
def compile_regexes():
    \"\"\"Compile the regexes.\"\"\"
    for pair in tokendefs:
        pair[1] = re.compile(pair[1])

compile_regexes()

def get_linenum_charnum(text, offset):
    \"\"\"Returns the line number and character number of the offset.\"\"\"
    linenum = 1
    charnum = 1
    i = 0
    while i < offset:
        if text[i] == '\\n':
            linenum += 1
            charnum = 1
            i += 1
            continue
        else:
            charnum += 1
            i += 1
            continue
    return (linenum, charnum)
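
# Offsets are 0-based; line and character numbers are 1-based.  For example,
# get_linenum_charnum("ab\\ncd", 4) returns (2, 2): offset 4 is the 'd' on
# line 2.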

def consume_next_token(text, offset, use_fast_format):
    \"\"\"Consumes the next token from the given text input.
    Returns a (token, offset) pair.
    Throws if no tokens match.\"\"\"
    for i, pair in enumerate(tokendefs):
        (token_type, regex) = pair
        m = regex.match(text, offset)
        if m is None:
            continue
        matched_text = m.group()
        if use_fast_format:
            toktype_index = i
            if matched_text in keywords_map_fast:
                toktype_index = keywords_map_fast[matched_text]
            token = [toktype_index, matched_text]
        else:
            if matched_text in keywords_map:
                token_type = keywords_map[matched_text]
            token = {
                'type': 'token',
                'token_type': token_type,
                'text': matched_text,
            }
        new_offset = offset + len(matched_text)
        return (token, new_offset)
    # none of the token types matched.
    (linenum, charnum) = get_linenum_charnum(text, offset)
    raise Exception(
        "Can't lex starting at line %d, character %d, context: '%s'" \\
            % (linenum, charnum, text[offset:offset+32])
    )
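
# Note: token types are tried in the order they appear in tokendefs, and the
# first regex that matches at the current offset wins (there is no
# longest-match rule).  If the matched text appears in keywords_map, the token
# is re-tagged with that keyword's token type.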

def discard_tokens(tokens, use_fast_format):
    \"\"\"Discards any tokens specified by the 'discard' pragma.\"\"\"
    def make_discard_set():
        if use_fast_format:
            discard_set = set()
            for i, pair in enumerate(tokendefs):
                (token_type, _) = pair
                if token_type in pragma_discard:
                    discard_set.add(i)
                continue
        else:
            discard_set = set(pragma_discard)
        return discard_set

    discard_set = make_discard_set()
    kept_tokens = []
    for token in tokens:
        if use_fast_format:
            toktype = token[0]
        else:
            toktype = token['token_type']
        if toktype not in discard_set:
            kept_tokens.append(token)
        continue
    return kept_tokens

def make_lines(tokens, use_fast_format):
    \"\"\"Return a line-oriented array-of-arrays from the given tokens.\"\"\"
    lines = []
    line = []
    for token in tokens:
        if use_fast_format:
            text = token[1]
        else:
            text = token['text']

        if text == '\\n':
            lines.append(line)
            line = []
            continue
        else:
            line.append(token)
            continue
    if len(line) > 0:
        lines.append(line)
    return lines
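
# Illustrative example (fast format): with a token type whose text is "\\n",
# make_lines() groups tokens into one array per input line and drops the
# newline tokens themselves:
#
#   make_lines([[0, 'a'], [1, '\\n'], [0, 'b']], use_fast_format=True)
#   # => [[[0, 'a']], [[0, 'b']]]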

def lex(text, use_fast_format):
    \"\"\"Returns a list of tokens for the given text input.\"\"\"
    tokens = []
    offset = 0
    while offset < len(text):
        (token, offset) = consume_next_token(text, offset, use_fast_format)
        tokens.append(token)
        continue
    tokens = discard_tokens(tokens, use_fast_format)
    if pragma_line_oriented:
        tokens = make_lines(tokens, use_fast_format)

    if pragma_eof:
        if use_fast_format:
            eof_token = [len(toktypes)-1, ""]
        else:
            eof_token = {
                "type": "token",
                "token_type": "EOF",
                "text": ""
            }
        if pragma_line_oriented:
            tokens.append([eof_token])
        else:
            tokens.append(eof_token)

    format_dict = {'type': 'format'}
    if use_fast_format:
        if pragma_line_oriented:
            format_dict['format'] = 'fast-lines'
        else:
            format_dict['format'] = 'fast'
        format_dict['token_types'] = toktypes
    else:
        if pragma_line_oriented:
            format_dict['format'] = 'tokens-lines'
        else:
            format_dict['format'] = 'tokens'

    json_obj = [format_dict, tokens]
    return json_obj
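
# The JSON written to stdout is a two-element array: a format descriptor
# followed by the token list.  Illustrative example (default format, no
# pragmas):
#
#   [{"type": "format", "format": "tokens"},
#    [{"type": "token", "token_type": "NUMBER", "text": "42"}]]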

if __name__ == '__main__':
    infile = [arg for arg in sys.argv[1:] if not arg.startswith('-')][-1]
    use_fast_format = False
    if '--fast' in sys.argv[1:]:
        use_fast_format = True

    fd = open(infile, 'r')
    text = fd.read()
    fd.close()

    json_obj = lex(text, use_fast_format)

    output = json.dumps(json_obj)
    if not output.endswith('\\n'):
        output += '\\n'
    sys.stdout.write(output)
""")
    code = fd.getvalue()
    fd.close()
    return code

if __name__ == "__main__":
    if len(sys.argv) < 2:
        usage(sys.stderr)
        sys.exit(1)

    if '-h' in sys.argv or '--help' in sys.argv:
        usage(sys.stdout)
        sys.exit(0)

    # the sole non-option arg is the tokendefs file.
    tokendefs_fpath = None
    non_option_args = [arg for arg in sys.argv[1:] if not arg.startswith('-')]
    if len(non_option_args) != 1:
        usage(sys.stderr)
        sys.exit(1)
    tokendefs_fpath = non_option_args[0]

    fd = open(tokendefs_fpath, 'r')
    tokendefs_lines = fd.read().splitlines()
    fd.close()

    (pragmas, toktypes, tokendefs, keyword_maps) = parse_tokendefs(tokendefs_lines)
    code = codegen(pragmas, toktypes, tokendefs, keyword_maps)
    sys.stdout.write(code)