Skip to content

Instantly share code, notes, and snippets.

@nikhilr612
Created September 9, 2023 14:29
Show Gist options
  • Save nikhilr612/a25b80195e97edd7cda209f99c7f4c03 to your computer and use it in GitHub Desktop.
A toy S-expressions parser in pure python
#
# A toy S-expressions lexer and parser written in pure python.
# Copyright 2023 Nikhil R.
#
from collections import namedtuple
from io import StringIO

# Token type tags.
T_INT_LITERAL = 1
T_FLOAT_LITERAL = 2
T_STRING_LITERAL = 3
T_OPEN_PAREN = 4
T_CLOSE_PAREN = 6
T_SYMBOL = 7
T_BOOLEAN = 8

# '(' and '[' are interchangeable list openers, as are ')' and ']';
# the lexer does not require the bracket styles to match.
OPEN_PARENTHESIS = '(['
CLOSE_PARENTHESIS = '])'
STRING_QUOTES = '\"'

# A lexical token: `type` is one of the T_* tags, `data` is the payload
# (the bracket character, the decoded string, the parsed number, etc.).
Token = namedtuple('Token', ['type', 'data'])

def lexer(file):
    """
    Generator yielding Token namedtuples read from *file* (any object
    with a ``read(1)`` method, e.g. an open text file or io.StringIO).

    Produces T_OPEN_PAREN/T_CLOSE_PAREN for brackets, T_STRING_LITERAL
    for double-quoted strings (backslash escapes decoded via
    'unicode-escape'), T_INT_LITERAL/T_FLOAT_LITERAL for numbers,
    T_BOOLEAN for the bare words ``True``/``False``, and T_SYMBOL for
    any other bare word.

    Raises Exception on an unexpected character inside a numeric
    literal, on an invalid float literal, and on an unterminated string.
    An invalid int literal is reported on stdout and skipped
    (best-effort), matching the original behaviour.
    """
    def symbol_token(text):
        # The bare words True/False lex as booleans; anything else is a symbol.
        if text == "True":
            return Token(T_BOOLEAN, True)
        if text == "False":
            return Token(T_BOOLEAN, False)
        return Token(T_SYMBOL, text)

    def int_token(text):
        # Best-effort: report (but do not abort on) an invalid int literal.
        try:
            return Token(T_INT_LITERAL, int(text))
        except ValueError:
            print("Invalid int literal: " + text)
            return None

    def float_token(text):
        try:
            return Token(T_FLOAT_LITERAL, float(text))
        except ValueError:
            raise Exception("Invalid float literal " + text)

    # FSM states:
    # 1 - inside a bare symbol, ended by whitespace or a closing bracket
    # 2 - default: dispatch on the next significant character
    # 3 - inside a string literal, ended by an unescaped quote
    # 4 - inside an integer literal (may still turn into a float)
    # 5 - inside a float literal
    state = 2
    char_buf = []

    while (ch := file.read(1)):
        if state == 1:  # bare symbol
            if ch.isspace() or ch in CLOSE_PARENTHESIS:
                yield symbol_token(''.join(char_buf))
                char_buf.clear()
                state = 2
                if ch in CLOSE_PARENTHESIS:
                    yield Token(T_CLOSE_PAREN, ch)
            else:
                char_buf.append(ch)
        elif state == 2:  # default / dispatch
            if ch in CLOSE_PARENTHESIS:
                yield Token(T_CLOSE_PAREN, ch)
            elif ch in OPEN_PARENTHESIS:
                yield Token(T_OPEN_PAREN, ch)
            elif ch in STRING_QUOTES:
                state = 3
            elif ch.isspace():
                pass
            elif ch.isdigit() or ch in '-+':
                state = 4
                char_buf.append(ch)
            else:
                state = 1
                char_buf.append(ch)
        elif state == 3:  # string literal
            if ch == '\\':
                # Keep the escape sequence verbatim; decoded when the string ends.
                char_buf.append(ch)
                char_buf.append(file.read(1))
            elif ch in STRING_QUOTES:
                text = ''.join(char_buf).encode('utf-8').decode('unicode-escape')
                char_buf.clear()
                state = 2
                yield Token(T_STRING_LITERAL, text)
            else:
                char_buf.append(ch)
        elif state == 4:  # integer literal
            if ch == '.' or ch == 'e':
                state = 5
                char_buf.append(ch)
            elif ch.isdigit():
                char_buf.append(ch)
            elif ch.isspace() or ch in CLOSE_PARENTHESIS:
                token = int_token(''.join(char_buf))
                char_buf.clear()
                state = 2  # recover even when the literal was invalid
                if token is not None:
                    yield token
                if ch in CLOSE_PARENTHESIS:
                    yield Token(T_CLOSE_PAREN, ch)
            else:
                raise Exception("Invalid character '" + ch + "' expecting WHITESPACE or DIGIT")
        elif state == 5:  # float literal
            if (ch in '-e') or ch.isdigit():
                char_buf.append(ch)
            elif ch.isspace() or ch in CLOSE_PARENTHESIS:
                yield float_token(''.join(char_buf))
                char_buf.clear()
                state = 2
                if ch in CLOSE_PARENTHESIS:
                    yield Token(T_CLOSE_PAREN, ch)
            else:
                raise Exception("Invalid character '" + ch + "' expecting WHITESPACE or DIGIT")

    # EOF: flush whatever the FSM was still accumulating so a trailing
    # token not followed by whitespace or a bracket is not silently lost.
    if state == 1:
        yield symbol_token(''.join(char_buf))
    elif state == 3:
        raise Exception("Unterminated string literal")
    elif state == 4:
        token = int_token(''.join(char_buf))
        if token is not None:
            yield token
    elif state == 5:
        yield float_token(''.join(char_buf))
def _recursive_build_ast(lex, parent):
    """
    Consume tokens from the *lex* iterator, appending them to the list
    *parent*, until the matching close-parenthesis token is found.  An
    open-parenthesis token starts a nested list which is filled
    recursively.

    Raises Exception("Mismatched parenthesis.") if the token stream is
    exhausted before the list is closed.  (The original pulled the first
    token OUTSIDE the try block, so an immediately-exhausted lexer
    leaked a raw StopIteration to the caller; the whole loop is now
    guarded.)
    """
    try:
        while True:
            token = next(lex)
            if token.type == T_OPEN_PAREN:
                child = []
                parent.append(child)
                _recursive_build_ast(lex, child)
            elif token.type == T_CLOSE_PAREN:
                return
            else:
                parent.append(token)
    except StopIteration:
        # Re-raise as the parser's own error; suppress the noisy chain.
        raise Exception("Mismatched parenthesis.") from None
def build_ast(lex):
    """
    Return the abstract syntax tree formed by processing the tokens yielded by the lexer.

    The tree is a (possibly nested) list: each open/close parenthesis
    pair becomes a sub-list, every other token is kept as-is.  Raises
    Exception on a close-parenthesis with no matching opener.
    """
    root = []
    for token in lex:
        if token.type == T_CLOSE_PAREN:
            # A closer at the top level has no matching opener.
            raise Exception("Mismatched parenthesis.")
        if token.type == T_OPEN_PAREN:
            child = []
            root.append(child)
            _recursive_build_ast(lex, child)
        else:
            root.append(token)
    return root
def _test_lex_string(s):
    """Lex the string *s* and return the resulting tokens as a list."""
    return list(lexer(StringIO(s)))
def _test_parse_string(s):
    """Lex and parse the string *s*, returning the AST."""
    return build_ast(lexer(StringIO(s)))
def _run_tests():
    """Smoke-test the lexer and parser; raises AssertionError on any mismatch."""
    # Bracket styles are interchangeable and need not match.
    assert _test_lex_string("(]") == [Token(T_OPEN_PAREN, '('), Token(T_CLOSE_PAREN, ']')]

    expected_numbers = [
        Token(T_OPEN_PAREN, '('),
        Token(T_INT_LITERAL, 1),
        Token(T_FLOAT_LITERAL, 1000.0),
        Token(T_CLOSE_PAREN, ')'),
    ]
    assert _test_lex_string("( 1 1e3)") == expected_numbers

    expected_define = [
        Token(T_OPEN_PAREN, '('),
        Token(T_SYMBOL, 'define'),
        Token(T_OPEN_PAREN, '('),
        Token(T_SYMBOL, 'sin'),
        Token(T_SYMBOL, 'x'),
        Token(T_CLOSE_PAREN, ')'),
        Token(T_OPEN_PAREN, '('),
        Token(T_STRING_LITERAL, 'not implemented'),
        Token(T_CLOSE_PAREN, ')'),
        Token(T_CLOSE_PAREN, ')'),
    ]
    assert _test_lex_string('(define (sin x) ("not implemented"))') == expected_define

    expected_escape = [
        Token(T_OPEN_PAREN, '('),
        Token(T_SYMBOL, 'string-escape'),
        Token(T_STRING_LITERAL, '\t'),
        Token(T_CLOSE_PAREN, ')'),
    ]
    assert _test_lex_string('(string-escape "\\t")') == expected_escape

    # Full parse: nested lists, booleans, negative int and float literals.
    expected_ast = [
        [
            Token(T_SYMBOL, 'display'),
            Token(T_BOOLEAN, True),
            [
                Token(T_SYMBOL, 'cond'),
                [
                    [Token(T_SYMBOL, '>'), Token(T_SYMBOL, 'a'), Token(T_INT_LITERAL, 5)],
                    Token(T_SYMBOL, "'A"),
                ],
                [
                    [Token(T_SYMBOL, '!='), Token(T_SYMBOL, 'a'), Token(T_INT_LITERAL, 0)],
                    Token(T_SYMBOL, "'B"),
                ],
            ],
            [Token(T_SYMBOL, 'random'), Token(T_SYMBOL, 'symbol')],
        ],
        [
            Token(T_SYMBOL, 'another'),
            Token(T_INT_LITERAL, -1),
            Token(T_FLOAT_LITERAL, -0.023),
        ],
    ]
    parsed = _test_parse_string("(display True (cond [(> a 5) 'A] [(!= a 0) 'B]) (random symbol))(another -1 -2.3e-2)")
    assert parsed == expected_ast
# Run the self-tests when executed as a script.
if __name__ == "__main__":
    _run_tests()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment