Skip to content

Instantly share code, notes, and snippets.

@Michi83
Created April 14, 2018 07:29
Show Gist options
  • Save Michi83/7f7a9d3b32941699e490dea001a7d180 to your computer and use it in GitHub Desktop.
Save Michi83/7f7a9d3b32941699e490dea001a7d180 to your computer and use it in GitHub Desktop.
Implementation of Tiny BASIC using recursive descent
from collections import deque
# This is a demonstration of the recursive descent algorithm used to implement
# interpreters for a programming languages. As an example, Tiny BASIC is
# implemented here but of course the same technique can also be used to
# implement other languages as well.
#
# https://en.wikipedia.org/wiki/Tiny_BASIC
#
# It is always a good idea to break up a project into several simple steps and
# so we will implement our interpreter in these three steps:
# - Step 1 - Tokenization
# - Step 2 - Parsing
# - Step 3 - Interpretation
# Step 1 - Tokenization
#
# First we need to break up the code into "tokens", e.g. numbers, strings,
# operators... We begin by reading the first character of the code. That will
# usually be enough to tell which type of token it is. Then we read more
# characters until we find one that cannot possibly be part of the same token.
# We repeat this process for the rest of the code until we reach the end.
#
# For instance, if the first character is a digit, we read more characters
# until we find one that is not a digit.
# Each token has a type (e.g. "NUMBER") and some also have a value (e.g. the
# numeric value of a number token).
class Token:
def __init__(self, type, value=None):
self.type = type
self.value = value
# Breaks up a line of code into tokens, puts them in a queue and returns it.
def tokenize(line):
def digit():
return line[to] >= "0" and line[to] <= "9"
def letter():
return lowercase() or uppercase()
def lowercase():
return line[to] >= "a" and line[to] <= "z"
def uppercase():
return line[to] >= "A" and line[to] <= "Z"
keywords = {"CLEAR", "END", "GOSUB", "GOTO", "IF", "INPUT", "LET", "LIST",
"PRINT", "RETURN", "RUN", "THEN"}
tokens = deque()
to = 0
line += "\r"
while True:
from_ = to
# Carriage return.
if line[to] == "\r":
tokens.append(Token("CR"))
break
# Skip spaces.
elif line[to] == " ":
to += 1
while line[to] == " ":
to += 1
# Strings.
elif line[to] == "\"":
to += 1
while line[to] != "\"":
if line[to] == "\r":
raise Exception("MATCHING QUOTATION MARK NOT FOUND")
to += 1
to += 1
tokens.append(Token("STRING", line[from_ + 1:to - 1]))
# Single character tokens.
elif line[to] in {"(", ")", "*", "+", ",", "-", "/"}:
to += 1
tokens.append(Token(line[from_]))
# Numbers.
elif digit():
to += 1
while digit():
to += 1
tokens.append(Token("NUMBER", int(line[from_:to])))
# Relational operators.
elif line[to] in {"<", "=", ">"}:
to += 1
if line[from_] == "<" and line[to] in {"=", ">"}:
to += 1
elif line[from_] == ">" and line[to] in {"<", "="}:
to += 1
substring = line[from_:to]
# >< is an alias for <>.
if substring == "><":
substring = "<>"
tokens.append(Token(substring))
# Keywords and variables.
elif letter():
to += 1
while letter():
to += 1
substring = line[from_:to].upper()
if substring in keywords:
tokens.append(Token(substring))
elif len(substring) == 1:
tokens.append(Token("VAR", substring))
else:
raise Exception("UNRECOGNIZED KEYWORD %s" % substring)
else:
raise Exception("UNRECOGNIZED TOKEN %s" % line[to])
return tokens
# Step 2 - Parsing
#
# Now that we have tokens, we need to relate them to one another, i.e. we need
# to construct a syntax tree.
#
# In order to accomplish this, we must define "grammar rules" for the language,
# that describe how language constructs are composed of other language
# constructs and individual tokens. For instance, if a language only contains
# numbers, plus signs and multiplication signs the grammar could be as follows:
#
# - A sum consists of a product followed by a plus sign and another product
# zero or more times.
# - A product consists of a number followed by a multiplication sign and
# another number zero or more times.
#
# Note how these rules already handle operator precedence. A sum contains
# products but products contain numbers. Thus, individual numbers bind more
# closely to products than to sums.
#
# Since defining the grammar in words is verbose and error-prone, there is a
# symbolic notation for grammar rules called EBNF:
# https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form
#
# Tiny BASIC's grammar in EBNF-like notation, as implemented here, is:
# line = [number] statement "CR"
# statement = "PRINT" expr-list
# "IF" expression relop expression "THEN" statement
# "GOTO" expression
# "INPUT" var-list
# "LET" var "=" expression
# "GOSUB" expression
# "RETURN"
# "CLEAR"
# "LIST"
# "RUN"
# "END"
# expr-list = (string | expression) {"," (string | expression)}
# var-list = var {"," var}
# expression = ["+" | "-"] term {("+" | "-") term}
# term = factor {("*" | "/") factor}
# factor = var | number | "(" expression ")"
# relop = "<" | "<=" | "<>" | "=" | ">" | ">="
#
# (Square brackets denote optional parts, curly braces denote parts repeated
# zero or more times, vertical bars denote alternatives.)
#
# With our grammar defined, the recursive descent algorithm comes into play.
# For each grammar rule we implement a function that groups tokens according to
# its eponymous rule and returns part of a syntax tree. These functions will
# frequently call one another and themselves recursively.
# Constructs a syntax tree for a given line of code and returns its root token.
def parse(line):
# Fetch the next token from the token queue and optionally check its type.
def advance(*expected):
if len(expected) == 0 or tokens[0].type in expected:
return tokens.popleft()
else:
raise Exception("UNEXPECTED TOKEN %s" % tokens[0].type)
# Functions implementing grammar rules.
def statement():
if tokens[0].type == "PRINT":
token = advance()
token.right = expr_list()
elif tokens[0].type == "IF":
token = advance()
temp = expression()
token.left = relop()
token.left.left = temp
token.left.right = expression()
advance("THEN")
token.right = statement()
elif tokens[0].type == "GOTO":
token = advance()
token.right = expression()
elif tokens[0].type == "INPUT":
token = advance()
token.right = var_list()
elif tokens[0].type == "LET":
token = advance()
token.left = advance("VAR")
advance("=")
token.right = expression()
elif tokens[0].type == "GOSUB":
token = advance()
token.right = expression()
elif tokens[0].type == "RETURN":
token = advance()
elif tokens[0].type == "CLEAR":
token = advance()
elif tokens[0].type == "LIST":
token = advance()
elif tokens[0].type == "RUN":
token = advance()
elif tokens[0].type == "END":
token = advance()
else:
raise Exception("UNEXPECTED TOKEN %s" % tokens[0].type)
return token
def expr_list():
token_list = []
if tokens[0].type == "STRING":
token_list.append(advance())
else:
token_list.append(expression())
while tokens[0].type == ",":
advance()
if tokens[0].type == "STRING":
token_list.append(advance())
else:
token_list.append(expression())
return token_list
def var_list():
token_list = [advance("VAR")]
while tokens[0].type == ",":
advance()
token_list.append(advance("VAR"))
return token_list
def expression():
# Unary plus has no effect and is simply skipped.
if tokens[0].type == "+":
advance()
token = term()
# -x is rewritten as 0 - x.
elif tokens[0].type == "-":
token = advance()
token.left = Token("NUMBER", 0)
token.right = term()
else:
token = term()
while tokens[0].type in {"+", "-"}:
tokens[0].left = token
token = advance()
token.right = term()
return token
def term():
token = factor()
while tokens[0].type in {"*", "/"}:
tokens[0].left = token
token = advance()
token.right = factor()
return token
def factor():
if tokens[0].type in {"VAR", "NUMBER"}:
token = advance()
else:
advance("(")
token = expression()
advance(")")
return token
def relop():
return advance("<", "<=", "<>", "=", ">", ">=")
tokens = tokenize(line)
if tokens[0].type == "NUMBER":
token = advance()
if tokens[0].type != "CR":
token.right = statement()
token.line = line
else:
token = statement()
advance("CR")
return token
# Step 3 - Interpretation
#
# Now that we have a syntax tree, we need a function to recursively traverse
# the tree and perform the appropriate action at each node, sometimes also
# returning results.
# Some instructions return "signal objects" instructing the system to do
# something special. Each signal has a type, e.g. "GOTO", and some also have a
# value, e.g. a line number.
class Signal:
def __init__(self, type, value=None):
self.type = type
self.value = value
def interpret(token):
if token.type == "*":
left = interpret(token.left)
right = interpret(token.right)
return left * right
elif token.type == "+":
left = interpret(token.left)
right = interpret(token.right)
return left + right
elif token.type == "-":
left = interpret(token.left)
right = interpret(token.right)
return left - right
elif token.type == "/":
left = interpret(token.left)
right = interpret(token.right)
if right == 0:
raise Exception("DIVISION BY ZERO")
return left // right
elif token.type == "<":
left = interpret(token.left)
right = interpret(token.right)
return left < right
elif token.type == "<=":
left = interpret(token.left)
right = interpret(token.right)
return left <= right
elif token.type == "<>":
left = interpret(token.left)
right = interpret(token.right)
return left != right
elif token.type == "=":
left = interpret(token.left)
right = interpret(token.right)
return left == right
elif token.type == ">":
left = interpret(token.left)
right = interpret(token.right)
return left > right
elif token.type == ">=":
left = interpret(token.left)
right = interpret(token.right)
return left >= right
elif token.type == "CLEAR":
program.next = None
variables.clear()
elif token.type == "END":
return Signal("END")
elif token.type == "GOSUB":
return Signal("GOSUB", interpret(token.right))
elif token.type == "GOTO":
return Signal("GOTO", interpret(token.right))
elif token.type == "IF":
if interpret(token.left):
return interpret(token.right)
elif token.type == "INPUT":
for child in token.right:
while True:
try:
variables[child.value] = int(input("? "))
break
except ValueError:
print("INVALID INPUT")
elif token.type == "LET":
variables[token.left.value] = interpret(token.right)
elif token.type == "LIST":
subtoken = program.next
while subtoken is not None:
print(subtoken.line)
subtoken = subtoken.next
elif token.type == "NUMBER":
return token.value
elif token.type == "PRINT":
for child in token.right:
print(interpret(child), end="")
print()
elif token.type == "RETURN":
return Signal("RETURN")
elif token.type == "RUN":
return Signal("RUN")
elif token.type == "STRING":
return token.value
elif token.type == "VAR":
if token.value in variables:
return variables[token.value]
else:
return 0
# The user program is a linked list of instructions, ordered by line number. It
# always contains a dummy instruction with the line number -1 at the beginning,
# simplifying insertions and deletions.
program = Token("NUMBER", -1)
program.next = None
variables = {}
# Inserts a new instruction into the program.
def insert(new_token):
# Find the last instruction whose line number is less than the new
# instruction's line number.
token = program
while token.next is not None and token.next.value < new_token.value:
token = token.next
# Insert the new instruction between the instruction found in the previous
# step and its successor or replace the successor, as appropriate.
if token.next is None or token.next.value > new_token.value:
new_token.next = token.next
else:
new_token.next = token.next.next
token.next = new_token
# Delete an instruction with a particular line number.
def delete(number):
# See first comment of the insert function.
token = program
while token.next is not None and token.next.value < number:
token = token.next
# If the instruction's successor's line number matches, unchain it.
if token.next is not None and token.next.value == number:
token.next = token.next.next
def run():
# Find an instruction with a particular line number.
def find(number):
token = program.next
while token is not None and token.value < number:
token = token.next
if token is not None and token.value == number:
return token
else:
raise Exception("LINE NUMBER %s NOT FOUND" % number)
stack = [] # The call stack for the gosub-return-mechanism.
token = program.next
while token is not None:
# Interpret the current line of code.
result = interpret(token.right)
# Advance to the next line of code.
token = token.next
# Handle signals.
if isinstance(result, Signal):
if result.type == "END":
return
elif result.type == "GOSUB":
stack.append(token)
token = find(result.value)
elif result.type == "GOTO":
token = find(result.value)
elif result.type == "RETURN":
if len(stack) == 0:
raise Exception("RETURN WITHOUT GOSUB")
token = stack.pop()
while True:
try:
line = input("> ")
token = parse(line)
if token.type == "NUMBER":
if hasattr(token, "right"):
insert(token)
else:
delete(token.value)
else:
result = interpret(token)
if isinstance(result, Signal) and result.type == "RUN":
run()
except Exception as exception:
print(exception)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment