Skip to content

Instantly share code, notes, and snippets.

@bensimner
Last active April 1, 2022 14:37
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
Star You must be signed in to star a gist
Embed
What would you like to do?
A Bytecode compiler + interpreter for a toy postfix arithmetic language
""" A simple bytecode compiler and interpreter
for a toy language
has all the same stages that Python itself has:
tokenize:
turns the source code into a list of tokens
parse:
turns the list of tokens into an AST
compile:
compiles the AST into a bytecode program
interpret:
runs the bytecode program to get the result
"""
import re
from enum import IntEnum
####################################
##### TOKENIZER ######
####################################
class TokenType(IntEnum):
NUMBER_LITERAL = 0x00
LPAREN = 0x01
RPAREN = 0x02
PLUS = 0x03
STAR = 0x04
class TokenInfo:
def __init__(self, type, string):
self.type = type
self.string = string
def __repr__(self):
return f"TokenInfo({self.type.name},{self.string!r})"
def tokenize(code):
"""given a source program
tokenize it by producing a stream of tokens
each token is a TokenType or a tuple (TokenType,Value)
e.g. "(1+2)*3" should give [LPAREN, (NUMBER_LITERAL,1), PLUS, (NUMBER_LITERAL,2), R_PAREN, STAR, (NUMBER_LITERAL,3)]
"""
tokens = []
# split code up at symbols, numbers, and whitespace
for s in re.split(r"\s+|(\d+)|([\(\)+*])", code):
if s is None:
continue
if s.isnumeric():
tokens.append(TokenInfo(TokenType.NUMBER_LITERAL, s))
elif s == "(":
tokens.append(TokenInfo(TokenType.LPAREN, s))
elif s == ")":
tokens.append(TokenInfo(TokenType.RPAREN, s))
elif s == "+":
tokens.append(TokenInfo(TokenType.PLUS, s))
elif s == "*":
tokens.append(TokenInfo(TokenType.STAR, s))
elif s == "":
# empty strings will fail to parse later on
pass
else:
raise SyntaxError(f"unknown token '{s}'")
return tokens
####################################
##### PARSER ######
####################################
class ASTNode(IntEnum):
NUMBER = 0x00
OPERATION = 0x01
class ASTOperation(IntEnum):
ADD = 0x00
MUL = 0x01
def parse(tokens):
"""parses the program into an ast
using the grammar:
expr -> aexpr | aexpr "+" expr
aexpr -> bexpr | bexpr "*" aexpr
bexpr -> INT | "(" expr ")"
"""
tokens = list(tokens)
def get_next_token():
try:
return tokens.pop(0)
except IndexError:
raise SyntaxError("unexpected EOF")
def parse_expr():
lhs = parse_aexpr()
if tokens and tokens[0].type == TokenType.PLUS:
t = get_next_token()
assert t.type == TokenType.PLUS
rhs = parse_expr()
return (ASTNode.OPERATION, ASTOperation.ADD, lhs, rhs)
else:
return lhs
def parse_aexpr():
lhs = parse_bexpr()
if tokens and tokens[0].type == TokenType.STAR:
t = get_next_token()
assert t.type == TokenType.STAR
rhs = parse_expr()
return (ASTNode.OPERATION, ASTOperation.MUL, lhs, rhs)
else:
return lhs
def parse_bexpr():
t = get_next_token()
if t.type == TokenType.NUMBER_LITERAL:
return (ASTNode.NUMBER, int(t.string))
elif t.type == TokenType.LPAREN:
expr = parse_expr()
t = get_next_token()
if t.type == TokenType.RPAREN:
return expr
else:
raise SyntaxError(f"invalid syntax on {t.string}")
else:
raise SyntaxError(f"invalid syntax on {t.string}")
ast = parse_expr()
if tokens:
raise SyntaxError("expected EOF")
return ast
####################################
##### Bytecode Compiler ######
####################################
class Bytecode(IntEnum):
"""our bytecode optypes
a program is a sequence of bytes
each pair of bytes is one bytecode "instruction"
the first byte is the opcode (one of the below)
the second is the 1-byte argument.
"""
NUM = 0x00
ADD = 0x01
MUL = 0x02
def compile(ast):
""" compiles our parsed AST into bytecode
"""
program = bytearray()
def compile_astnode(node):
op = node[0]
if op == ASTNode.NUMBER:
b = node[1]
program.append(Bytecode.NUM)
if 0 <= b <= 255:
program.append(b)
else:
raise SyntaxError("literals must be integers in the range 0-255")
elif op == ASTNode.OPERATION:
operator = node[1]
lhs = node[2]
rhs = node[3]
compile_astnode(lhs)
compile_astnode(rhs)
# operations bytecode have empty arg
if operator == ASTOperation.ADD:
program.append(Bytecode.ADD)
program.append(0)
elif operator == ASTOperation.MUL:
program.append(Bytecode.MUL)
program.append(0)
compile_astnode(ast)
return bytes(program)
def instructions(bytecode):
"""helper function to get the list of the instructions given a bytecode program
as (opcode byte,arg byte) pairs
"""
instrs = []
for i in range(0, len(bytecode), 2):
opcode = bytecode[i]
arg = bytecode[i+1]
instrs.append((opcode,arg))
return instrs
def dis(bytecode):
"""given a bytecode program, "disassemble" it to some human-readable text
"""
lines = []
for (op, arg) in instructions(bytecode):
name = Bytecode(op).name
if op == Bytecode.NUM:
lines.append(f"{name} {arg}")
else:
lines.append(f"{name}")
return ";".join(lines)
####################################
##### Interpreter ######
####################################
def interpret(bytecode):
"""given a bytecode program, run it and return the result
"""
stack = []
for (op, arg) in instructions(bytecode):
if op == Bytecode.NUM:
stack.append(arg)
elif op == Bytecode.ADD:
rhs = stack.pop()
lhs = stack.pop()
stack.append(lhs+rhs)
elif op == Bytecode.MUL:
rhs = stack.pop()
lhs = stack.pop()
stack.append(lhs*rhs)
# we say the result of the program is the value from the stack
if len(stack) == 1:
value = stack.pop()
return value
else:
# if there is more than 1 thing on the stack at the end
# then we say it is not a valid program
# (as an example)
raise ValueError(f"invalid stack at end of program: {stack}")
####################################
##### REPL ######
####################################
def main():
"""main repl
read-eval-print-loop
"""
while True:
line = input("> ")
try:
tokens = tokenize(line)
print(f"[{tokens=}]")
ast = parse(tokens)
print(f"[{ast=}]")
except SyntaxError as e:
print(f"! syntax error: {e}")
continue
try:
bytecode = compile(ast)
print(f"[{bytecode=}]")
print(f"[{dis(bytecode)=}]")
except Exception as e:
print(f"! compilation error: {e}")
continue
try:
print(interpret(bytecode))
except Exception as e:
print(f"! runtime error: {e}")
continue
if __name__ == "__main__":
main()
EXAMPLE_OUTPUT = """\
> 1+2
[tokens=[<__main__.TokenInfo object at 0x7feb44f5a070>, <__main__.TokenInfo object at 0x7feb44f5a310>, <__main__.TokenInfo object at 0x7feb44f5a490>]]
[ast=(<ASTNode.OPERATION: 1>, <ASTOperation.ADD: 0>, (<ASTNode.NUMBER: 0>, 1), (<ASTNode.NUMBER: 0>, 2))]
[bytecode=b'\x00\x01\x00\x02\x01\x00']
[dis(bytecode)='NUM 1;NUM 2;ADD']
3
> 1+2*3
[tokens=[<__main__.TokenInfo object at 0x7feb44f972b0>, <__main__.TokenInfo object at 0x7feb44f97310>, <__main__.TokenInfo object at 0x7feb44f03970>, <__main__.TokenInfo object at 0x7feb44f03940>, <__main__.TokenInfo object at 0x7feb44f03a60>]]
[ast=(<ASTNode.OPERATION: 1>, <ASTOperation.ADD: 0>, (<ASTNode.NUMBER: 0>, 1), (<ASTNode.OPERATION: 1>, <ASTOperation.MUL: 1>, (<ASTNode.NUMBER: 0>, 2), (<ASTNode.NUMBER: 0>, 3)))]
[bytecode=b'\x00\x01\x00\x02\x00\x03\x02\x00\x01\x00']
[dis(bytecode)='NUM 1;NUM 2;NUM 3;MUL;ADD']
7
> (1+2)*3
[tokens=[<__main__.TokenInfo object at 0x7feb44f03820>, <__main__.TokenInfo object at 0x7feb44f036a0>, <__main__.TokenInfo object at 0x7feb44f03a00>, <__main__.TokenInfo object at 0x7feb44f03a90>, <__main__.TokenInfo object at 0x7feb44f03b20>, <__main__.TokenInfo object at 0x7feb44f03bb0>, <__main__.TokenInfo object at 0x7feb44f03c40>]]
[ast=(<ASTNode.OPERATION: 1>, <ASTOperation.MUL: 1>, (<ASTNode.OPERATION: 1>, <ASTOperation.ADD: 0>, (<ASTNode.NUMBER: 0>, 1), (<ASTNode.NUMBER: 0>, 2)), (<ASTNode.NUMBER: 0>, 3))]
[bytecode=b'\x00\x01\x00\x02\x01\x00\x00\x03\x02\x00']
[dis(bytecode)='NUM 1;NUM 2;ADD;NUM 3;MUL']
9
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment