A Bytecode compiler + interpreter for a toy postfix arithmetic language
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ A simple bytecode compiler and interpreter | |
| for a toy language | |
| has all the same stages that Python itself has: | |
| tokenize: | |
| turns the source code into a list of tokens | |
| parse: | |
| turns the list of tokens into an AST | |
| compile: | |
| compiles the AST into a bytecode program | |
| interpret: | |
| runs the bytecode program to get the result | |
| """ | |
| import re | |
| from enum import IntEnum | |
| #################################### | |
| ##### TOKENIZER ###### | |
| #################################### | |
| class TokenType(IntEnum): | |
| NUMBER_LITERAL = 0x00 | |
| LPAREN = 0x01 | |
| RPAREN = 0x02 | |
| PLUS = 0x03 | |
| STAR = 0x04 | |
| class TokenInfo: | |
| def __init__(self, type, string): | |
| self.type = type | |
| self.string = string | |
| def __repr__(self): | |
| return f"TokenInfo({self.type.name},{self.string!r})" | |
| def tokenize(code): | |
| """given a source program | |
| tokenize it by producing a stream of tokens | |
| each token is a TokenType or a tuple (TokenType,Value) | |
| e.g. "(1+2)*3" should give [LPAREN, (NUMBER_LITERAL,1), PLUS, (NUMBER_LITERAL,2), R_PAREN, STAR, (NUMBER_LITERAL,3)] | |
| """ | |
| tokens = [] | |
| # split code up at symbols, numbers, and whitespace | |
| for s in re.split(r"\s+|(\d+)|([\(\)+*])", code): | |
| if s is None: | |
| continue | |
| if s.isnumeric(): | |
| tokens.append(TokenInfo(TokenType.NUMBER_LITERAL, s)) | |
| elif s == "(": | |
| tokens.append(TokenInfo(TokenType.LPAREN, s)) | |
| elif s == ")": | |
| tokens.append(TokenInfo(TokenType.RPAREN, s)) | |
| elif s == "+": | |
| tokens.append(TokenInfo(TokenType.PLUS, s)) | |
| elif s == "*": | |
| tokens.append(TokenInfo(TokenType.STAR, s)) | |
| elif s == "": | |
| # empty strings will fail to parse later on | |
| pass | |
| else: | |
| raise SyntaxError(f"unknown token '{s}'") | |
| return tokens | |
| #################################### | |
| ##### PARSER ###### | |
| #################################### | |
| class ASTNode(IntEnum): | |
| NUMBER = 0x00 | |
| OPERATION = 0x01 | |
| class ASTOperation(IntEnum): | |
| ADD = 0x00 | |
| MUL = 0x01 | |
| def parse(tokens): | |
| """parses the program into an ast | |
| using the grammar: | |
| expr -> aexpr | aexpr "+" expr | |
| aexpr -> bexpr | bexpr "*" aexpr | |
| bexpr -> INT | "(" expr ")" | |
| """ | |
| tokens = list(tokens) | |
| def get_next_token(): | |
| try: | |
| return tokens.pop(0) | |
| except IndexError: | |
| raise SyntaxError("unexpected EOF") | |
| def parse_expr(): | |
| lhs = parse_aexpr() | |
| if tokens and tokens[0].type == TokenType.PLUS: | |
| t = get_next_token() | |
| assert t.type == TokenType.PLUS | |
| rhs = parse_expr() | |
| return (ASTNode.OPERATION, ASTOperation.ADD, lhs, rhs) | |
| else: | |
| return lhs | |
| def parse_aexpr(): | |
| lhs = parse_bexpr() | |
| if tokens and tokens[0].type == TokenType.STAR: | |
| t = get_next_token() | |
| assert t.type == TokenType.STAR | |
| rhs = parse_expr() | |
| return (ASTNode.OPERATION, ASTOperation.MUL, lhs, rhs) | |
| else: | |
| return lhs | |
| def parse_bexpr(): | |
| t = get_next_token() | |
| if t.type == TokenType.NUMBER_LITERAL: | |
| return (ASTNode.NUMBER, int(t.string)) | |
| elif t.type == TokenType.LPAREN: | |
| expr = parse_expr() | |
| t = get_next_token() | |
| if t.type == TokenType.RPAREN: | |
| return expr | |
| else: | |
| raise SyntaxError(f"invalid syntax on {t.string}") | |
| else: | |
| raise SyntaxError(f"invalid syntax on {t.string}") | |
| ast = parse_expr() | |
| if tokens: | |
| raise SyntaxError("expected EOF") | |
| return ast | |
| #################################### | |
| ##### Bytecode Compiler ###### | |
| #################################### | |
| class Bytecode(IntEnum): | |
| """our bytecode optypes | |
| a program is a sequence of bytes | |
| each pair of bytes is one bytecode "instruction" | |
| the first byte is the opcode (one of the below) | |
| the second is the 1-byte argument. | |
| """ | |
| NUM = 0x00 | |
| ADD = 0x01 | |
| MUL = 0x02 | |
| def compile(ast): | |
| """ compiles our parsed AST into bytecode | |
| """ | |
| program = bytearray() | |
| def compile_astnode(node): | |
| op = node[0] | |
| if op == ASTNode.NUMBER: | |
| b = node[1] | |
| program.append(Bytecode.NUM) | |
| if 0 <= b <= 255: | |
| program.append(b) | |
| else: | |
| raise SyntaxError("literals must be integers in the range 0-255") | |
| elif op == ASTNode.OPERATION: | |
| operator = node[1] | |
| lhs = node[2] | |
| rhs = node[3] | |
| compile_astnode(lhs) | |
| compile_astnode(rhs) | |
| # operations bytecode have empty arg | |
| if operator == ASTOperation.ADD: | |
| program.append(Bytecode.ADD) | |
| program.append(0) | |
| elif operator == ASTOperation.MUL: | |
| program.append(Bytecode.MUL) | |
| program.append(0) | |
| compile_astnode(ast) | |
| return bytes(program) | |
| def instructions(bytecode): | |
| """helper function to get the list of the instructions given a bytecode program | |
| as (opcode byte,arg byte) pairs | |
| """ | |
| instrs = [] | |
| for i in range(0, len(bytecode), 2): | |
| opcode = bytecode[i] | |
| arg = bytecode[i+1] | |
| instrs.append((opcode,arg)) | |
| return instrs | |
| def dis(bytecode): | |
| """given a bytecode program, "disassemble" it to some human-readable text | |
| """ | |
| lines = [] | |
| for (op, arg) in instructions(bytecode): | |
| name = Bytecode(op).name | |
| if op == Bytecode.NUM: | |
| lines.append(f"{name} {arg}") | |
| else: | |
| lines.append(f"{name}") | |
| return ";".join(lines) | |
| #################################### | |
| ##### Interpreter ###### | |
| #################################### | |
| def interpret(bytecode): | |
| """given a bytecode program, run it and return the result | |
| """ | |
| stack = [] | |
| for (op, arg) in instructions(bytecode): | |
| if op == Bytecode.NUM: | |
| stack.append(arg) | |
| elif op == Bytecode.ADD: | |
| rhs = stack.pop() | |
| lhs = stack.pop() | |
| stack.append(lhs+rhs) | |
| elif op == Bytecode.MUL: | |
| rhs = stack.pop() | |
| lhs = stack.pop() | |
| stack.append(lhs*rhs) | |
| # we say the result of the program is the value from the stack | |
| if len(stack) == 1: | |
| value = stack.pop() | |
| return value | |
| else: | |
| # if there is more than 1 thing on the stack at the end | |
| # then we say it is not a valid program | |
| # (as an example) | |
| raise ValueError(f"invalid stack at end of program: {stack}") | |
| #################################### | |
| ##### REPL ###### | |
| #################################### | |
| def main(): | |
| """main repl | |
| read-eval-print-loop | |
| """ | |
| while True: | |
| line = input("> ") | |
| try: | |
| tokens = tokenize(line) | |
| print(f"[{tokens=}]") | |
| ast = parse(tokens) | |
| print(f"[{ast=}]") | |
| except SyntaxError as e: | |
| print(f"! syntax error: {e}") | |
| continue | |
| try: | |
| bytecode = compile(ast) | |
| print(f"[{bytecode=}]") | |
| print(f"[{dis(bytecode)=}]") | |
| except Exception as e: | |
| print(f"! compilation error: {e}") | |
| continue | |
| try: | |
| print(interpret(bytecode)) | |
| except Exception as e: | |
| print(f"! runtime error: {e}") | |
| continue | |
| if __name__ == "__main__": | |
| main() | |
| EXAMPLE_OUTPUT = """\ | |
| > 1+2 | |
| [tokens=[<__main__.TokenInfo object at 0x7feb44f5a070>, <__main__.TokenInfo object at 0x7feb44f5a310>, <__main__.TokenInfo object at 0x7feb44f5a490>]] | |
| [ast=(<ASTNode.OPERATION: 1>, <ASTOperation.ADD: 0>, (<ASTNode.NUMBER: 0>, 1), (<ASTNode.NUMBER: 0>, 2))] | |
| [bytecode=b'\x00\x01\x00\x02\x01\x00'] | |
| [dis(bytecode)='NUM 1;NUM 2;ADD'] | |
| 3 | |
| > 1+2*3 | |
| [tokens=[<__main__.TokenInfo object at 0x7feb44f972b0>, <__main__.TokenInfo object at 0x7feb44f97310>, <__main__.TokenInfo object at 0x7feb44f03970>, <__main__.TokenInfo object at 0x7feb44f03940>, <__main__.TokenInfo object at 0x7feb44f03a60>]] | |
| [ast=(<ASTNode.OPERATION: 1>, <ASTOperation.ADD: 0>, (<ASTNode.NUMBER: 0>, 1), (<ASTNode.OPERATION: 1>, <ASTOperation.MUL: 1>, (<ASTNode.NUMBER: 0>, 2), (<ASTNode.NUMBER: 0>, 3)))] | |
| [bytecode=b'\x00\x01\x00\x02\x00\x03\x02\x00\x01\x00'] | |
| [dis(bytecode)='NUM 1;NUM 2;NUM 3;MUL;ADD'] | |
| 7 | |
| > (1+2)*3 | |
| [tokens=[<__main__.TokenInfo object at 0x7feb44f03820>, <__main__.TokenInfo object at 0x7feb44f036a0>, <__main__.TokenInfo object at 0x7feb44f03a00>, <__main__.TokenInfo object at 0x7feb44f03a90>, <__main__.TokenInfo object at 0x7feb44f03b20>, <__main__.TokenInfo object at 0x7feb44f03bb0>, <__main__.TokenInfo object at 0x7feb44f03c40>]] | |
| [ast=(<ASTNode.OPERATION: 1>, <ASTOperation.MUL: 1>, (<ASTNode.OPERATION: 1>, <ASTOperation.ADD: 0>, (<ASTNode.NUMBER: 0>, 1), (<ASTNode.NUMBER: 0>, 2)), (<ASTNode.NUMBER: 0>, 3))] | |
| [bytecode=b'\x00\x01\x00\x02\x01\x00\x00\x03\x02\x00'] | |
| [dis(bytecode)='NUM 1;NUM 2;ADD;NUM 3;MUL'] | |
| 9 | |
| """ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment