Created
April 14, 2018 07:29
-
-
Save Michi83/7f7a9d3b32941699e490dea001a7d180 to your computer and use it in GitHub Desktop.
Implementation of Tiny BASIC using recursive descent
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import deque | |
# This is a demonstration of the recursive descent algorithm used to implement | |
# interpreters for programming languages. As an example, Tiny BASIC is
# implemented here but of course the same technique can also be used to | |
# implement other languages as well. | |
# | |
# https://en.wikipedia.org/wiki/Tiny_BASIC | |
# | |
# It is always a good idea to break up a project into several simple steps and | |
# so we will implement our interpreter in these three steps: | |
# - Step 1 - Tokenization | |
# - Step 2 - Parsing | |
# - Step 3 - Interpretation | |
# Step 1 - Tokenization | |
# | |
# First we need to break up the code into "tokens", e.g. numbers, strings, | |
# operators... We begin by reading the first character of the code. That will | |
# usually be enough to tell which type of token it is. Then we read more | |
# characters until we find one that cannot possibly be part of the same token. | |
# We repeat this process for the rest of the code until we reach the end. | |
# | |
# For instance, if the first character is a digit, we read more characters | |
# until we find one that is not a digit. | |
# Each token has a type (e.g. "NUMBER") and some also have a value (e.g. the | |
# numeric value of a number token). | |
class Token:
    """A lexical token (and, after parsing, a syntax-tree node).

    Each token has a type (e.g. "NUMBER") and an optional value (e.g. the
    numeric value of a number token).  The parser later attaches extra
    attributes such as .left, .right, .next and .line dynamically, so this
    class must not use __slots__.
    """

    def __init__(self, type, value=None):
        self.type = type
        self.value = value

    def __repr__(self):
        # Aids debugging; tokens otherwise print as opaque object addresses.
        if self.value is None:
            return "Token(%r)" % self.type
        return "Token(%r, %r)" % (self.type, self.value)
# Breaks up a line of code into tokens, puts them in a queue and returns it.
def tokenize(line):
    """Split one line of Tiny BASIC source into a deque of Token objects.

    A terminating "CR" token is always appended.  Malformed input raises an
    Exception whose message mimics classic BASIC error reporting.
    """
    KEYWORDS = {"CLEAR", "END", "GOSUB", "GOTO", "IF", "INPUT", "LET", "LIST",
                "PRINT", "RETURN", "RUN", "THEN"}
    SINGLE_CHARS = {"(", ")", "*", "+", ",", "-", "/"}

    def is_digit(c):
        return "0" <= c <= "9"

    def is_letter(c):
        return "a" <= c <= "z" or "A" <= c <= "Z"

    line = line + "\r"  # Sentinel so every scan below terminates.
    queue = deque()
    pos = 0
    while True:
        start = pos
        char = line[pos]
        # Carriage return: end of line, emit the CR token and stop.
        if char == "\r":
            queue.append(Token("CR"))
            return queue
        # Spaces separate tokens but produce none.
        if char == " ":
            while line[pos] == " ":
                pos += 1
        # String literal: everything up to the matching quote.
        elif char == "\"":
            pos += 1
            while line[pos] != "\"":
                if line[pos] == "\r":
                    raise Exception("MATCHING QUOTATION MARK NOT FOUND")
                pos += 1
            pos += 1
            queue.append(Token("STRING", line[start + 1:pos - 1]))
        # Single-character operators and punctuation.
        elif char in SINGLE_CHARS:
            pos += 1
            queue.append(Token(char))
        # Integer literals.
        elif is_digit(char):
            while is_digit(line[pos]):
                pos += 1
            queue.append(Token("NUMBER", int(line[start:pos])))
        # Relational operators, possibly two characters long.
        elif char in {"<", "=", ">"}:
            pos += 1
            if (char == "<" and line[pos] in {"=", ">"}) or \
                    (char == ">" and line[pos] in {"<", "="}):
                pos += 1
            op = line[start:pos]
            # >< is an alias for <>.
            queue.append(Token("<>" if op == "><" else op))
        # Keywords and single-letter variables (case-insensitive).
        elif is_letter(char):
            while is_letter(line[pos]):
                pos += 1
            word = line[start:pos].upper()
            if word in KEYWORDS:
                queue.append(Token(word))
            elif len(word) == 1:
                queue.append(Token("VAR", word))
            else:
                raise Exception("UNRECOGNIZED KEYWORD %s" % word)
        else:
            raise Exception("UNRECOGNIZED TOKEN %s" % char)
# Step 2 - Parsing | |
# | |
# Now that we have tokens, we need to relate them to one another, i.e. we need | |
# to construct a syntax tree. | |
# | |
# In order to accomplish this, we must define "grammar rules" for the language, | |
# that describe how language constructs are composed of other language | |
# constructs and individual tokens. For instance, if a language only contains | |
# numbers, plus signs and multiplication signs the grammar could be as follows: | |
# | |
# - A sum consists of a product followed by a plus sign and another product | |
# zero or more times. | |
# - A product consists of a number followed by a multiplication sign and | |
# another number zero or more times. | |
# | |
# Note how these rules already handle operator precedence. A sum contains | |
# products but products contain numbers. Thus, individual numbers bind more | |
# closely to products than to sums. | |
# | |
# Since defining the grammar in words is verbose and error-prone, there is a | |
# symbolic notation for grammar rules called EBNF: | |
# https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form | |
# | |
# Tiny BASIC's grammar in EBNF-like notation, as implemented here, is: | |
# line = [number] statement "CR" | |
# statement = "PRINT" expr-list | |
# "IF" expression relop expression "THEN" statement | |
# "GOTO" expression | |
# "INPUT" var-list | |
# "LET" var "=" expression | |
# "GOSUB" expression | |
# "RETURN" | |
# "CLEAR" | |
# "LIST" | |
# "RUN" | |
# "END" | |
# expr-list = (string | expression) {"," (string | expression)} | |
# var-list = var {"," var} | |
# expression = ["+" | "-"] term {("+" | "-") term} | |
# term = factor {("*" | "/") factor} | |
# factor = var | number | "(" expression ")" | |
# relop = "<" | "<=" | "<>" | "=" | ">" | ">=" | |
# | |
# (Square brackets denote optional parts, curly braces denote parts repeated | |
# zero or more times, vertical bars denote alternatives.) | |
# | |
# With our grammar defined, the recursive descent algorithm comes into play. | |
# For each grammar rule we implement a function that groups tokens according to | |
# its eponymous rule and returns part of a syntax tree. These functions will | |
# frequently call one another and themselves recursively. | |
# Constructs a syntax tree for a given line of code and returns its root token.
def parse(line):
    """Parse one line of Tiny BASIC into a syntax tree (recursive descent).

    Returns the root token: either a NUMBER token for a stored program line
    (with the statement, if any, in .right and the source text in .line) or
    the statement token itself for immediate execution.
    """
    queue = tokenize(line)

    def peek():
        # Type of the next unconsumed token.
        return queue[0].type

    # Fetch the next token from the token queue and optionally check its type.
    def advance(*expected):
        if expected and peek() not in expected:
            raise Exception("UNEXPECTED TOKEN %s" % peek())
        return queue.popleft()

    # Functions implementing grammar rules.
    def statement():
        # Keywords that take no arguments parse to a bare token.
        if peek() in {"RETURN", "CLEAR", "LIST", "RUN", "END"}:
            return advance()
        node = advance("PRINT", "IF", "GOTO", "INPUT", "LET", "GOSUB")
        if node.type == "PRINT":
            node.right = expr_list()
        elif node.type == "IF":
            # The relational operator becomes the left child, with the two
            # compared expressions hanging off it.
            first = expression()
            node.left = relop()
            node.left.left = first
            node.left.right = expression()
            advance("THEN")
            node.right = statement()
        elif node.type in {"GOTO", "GOSUB"}:
            node.right = expression()
        elif node.type == "INPUT":
            node.right = var_list()
        else:  # LET
            node.left = advance("VAR")
            advance("=")
            node.right = expression()
        return node

    def string_or_expression():
        # PRINT accepts string literals as well as numeric expressions.
        return advance() if peek() == "STRING" else expression()

    def expr_list():
        items = [string_or_expression()]
        while peek() == ",":
            advance()
            items.append(string_or_expression())
        return items

    def var_list():
        names = [advance("VAR")]
        while peek() == ",":
            advance()
            names.append(advance("VAR"))
        return names

    def expression():
        if peek() == "+":
            # Unary plus has no effect and is simply skipped.
            advance()
            node = term()
        elif peek() == "-":
            # -x is rewritten as 0 - x.
            node = advance()
            node.left = Token("NUMBER", 0)
            node.right = term()
        else:
            node = term()
        while peek() in {"+", "-"}:
            queue[0].left = node
            node = advance()
            node.right = term()
        return node

    def term():
        node = factor()
        while peek() in {"*", "/"}:
            queue[0].left = node
            node = advance()
            node.right = factor()
        return node

    def factor():
        if peek() in {"VAR", "NUMBER"}:
            return advance()
        advance("(")
        node = expression()
        advance(")")
        return node

    def relop():
        return advance("<", "<=", "<>", "=", ">", ">=")

    if peek() == "NUMBER":
        root = advance()
        if peek() != "CR":
            root.right = statement()
            root.line = line  # Keep the source text for LIST.
    else:
        root = statement()
    advance("CR")
    return root
# Step 3 - Interpretation | |
# | |
# Now that we have a syntax tree, we need a function to recursively traverse | |
# the tree and perform the appropriate action at each node, sometimes also | |
# returning results. | |
# Some instructions return "signal objects" instructing the system to do | |
# something special. Each signal has a type, e.g. "GOTO", and some also have a | |
# value, e.g. a line number. | |
class Signal:
    """Control-flow signal returned by interpret() to the run loop.

    Each signal has a type, e.g. "GOTO", and some also carry a value,
    e.g. a target line number.
    """

    def __init__(self, type, value=None):
        self.type = type
        self.value = value

    def __repr__(self):
        # Aids debugging when signals surface in tracebacks or logs.
        if self.value is None:
            return "Signal(%r)" % self.type
        return "Signal(%r, %r)" % (self.type, self.value)
def interpret(token):
    """Recursively evaluate one syntax-tree node.

    Expression nodes return their value; statement nodes perform their side
    effects and may return a Signal object instructing the run loop.
    """
    kind = token.type
    # Every binary operator evaluates its left operand, then its right one.
    if kind in {"*", "+", "-", "/", "<", "<=", "<>", "=", ">", ">="}:
        a = interpret(token.left)
        b = interpret(token.right)
        if kind == "*":
            return a * b
        if kind == "+":
            return a + b
        if kind == "-":
            return a - b
        if kind == "/":
            if b == 0:
                raise Exception("DIVISION BY ZERO")
            return a // b  # Tiny BASIC arithmetic is integer-only.
        if kind == "<":
            return a < b
        if kind == "<=":
            return a <= b
        if kind == "<>":
            return a != b
        if kind == "=":
            return a == b
        if kind == ">":
            return a > b
        return a >= b
    if kind == "CLEAR":
        # Forget both the stored program and all variables.
        program.next = None
        variables.clear()
    elif kind in {"END", "RETURN", "RUN"}:
        return Signal(kind)
    elif kind in {"GOSUB", "GOTO"}:
        return Signal(kind, interpret(token.right))
    elif kind == "IF":
        if interpret(token.left):
            return interpret(token.right)
    elif kind == "INPUT":
        for target in token.right:
            # Re-prompt until the user types a valid integer.
            while True:
                try:
                    variables[target.value] = int(input("? "))
                    break
                except ValueError:
                    print("INVALID INPUT")
    elif kind == "LET":
        variables[token.left.value] = interpret(token.right)
    elif kind == "LIST":
        node = program.next
        while node is not None:
            print(node.line)
            node = node.next
    elif kind in {"NUMBER", "STRING"}:
        return token.value
    elif kind == "PRINT":
        for item in token.right:
            print(interpret(item), end="")
        print()
    elif kind == "VAR":
        # Unset variables read as 0.
        return variables.get(token.value, 0)
# The user program is a linked list of instructions, ordered by line number. It
# always contains a dummy instruction with the line number -1 at the beginning,
# simplifying insertions and deletions.
program = Token("NUMBER", -1)  # Dummy head node; stored lines hang off .next.
program.next = None
variables = {}  # Maps single-letter variable names to their numeric values.
# Inserts a new instruction into the program.
def insert(new_token):
    """Link new_token into the program list at its line-number position,
    replacing any existing instruction with the same line number."""
    # Walk to the last node whose line number precedes the new one.
    node = program
    while node.next is not None and node.next.value < new_token.value:
        node = node.next
    successor = node.next
    if successor is not None and successor.value == new_token.value:
        # Same line number: the new instruction replaces the old one.
        new_token.next = successor.next
    else:
        new_token.next = successor
    node.next = new_token
# Delete an instruction with a particular line number.
def delete(number):
    """Remove the instruction stored under the given line number, if any."""
    # Walk to the node right before where the line number would live.
    node = program
    while node.next is not None and node.next.value < number:
        node = node.next
    successor = node.next
    # If the successor's line number matches, unlink it; otherwise do nothing.
    if successor is not None and successor.value == number:
        node.next = successor.next
def run():
    """Execute the stored program from its first line until it finishes,
    honoring END, GOTO, GOSUB and RETURN signals."""
    # Find the instruction with exactly this line number.
    def find(number):
        node = program.next
        while node is not None and node.value < number:
            node = node.next
        if node is None or node.value != number:
            raise Exception("LINE NUMBER %s NOT FOUND" % number)
        return node

    call_stack = []  # Return addresses for the GOSUB/RETURN mechanism.
    instruction = program.next
    while instruction is not None:
        # Interpret the current line, then default to the next stored line.
        outcome = interpret(instruction.right)
        instruction = instruction.next
        # Signals override the default successor.
        if not isinstance(outcome, Signal):
            continue
        if outcome.type == "END":
            return
        if outcome.type == "GOSUB":
            call_stack.append(instruction)
            instruction = find(outcome.value)
        elif outcome.type == "GOTO":
            instruction = find(outcome.value)
        elif outcome.type == "RETURN":
            if not call_stack:
                raise Exception("RETURN WITHOUT GOSUB")
            instruction = call_stack.pop()
# Read-eval-print loop: numbered lines are stored in the program, unnumbered
# statements are executed immediately.
while True:
    try:
        line = input("> ")
    except EOFError:
        # End of input (Ctrl-D or a piped-in script).  Previously this was
        # swallowed by the broad except below and the loop retried input()
        # forever, printing the same error; exit cleanly instead.
        break
    try:
        token = parse(line)
        if token.type == "NUMBER":
            if hasattr(token, "right"):
                insert(token)  # Numbered line with a statement: store it.
            else:
                delete(token.value)  # Bare line number: delete that line.
        else:
            result = interpret(token)
            if isinstance(result, Signal) and result.type == "RUN":
                run()
    except Exception as exception:
        # BASIC-style error reporting: print the message and keep going.
        print(exception)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment