Skip to content

Instantly share code, notes, and snippets.

@eamars
Created November 28, 2016 14:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save eamars/3c28959e9dcff70ac567f5755558640d to your computer and use it in GitHub Desktop.
Save eamars/3c28959e9dcff70ac567f5755558640d to your computer and use it in GitHub Desktop.
import re
import sys
# Restrictions:
# Integer constants must be short.
# Stack size must not exceed 1024.
# Integer is the only type.
# Logical operators cannot be nested.
class Scanner:
'''The interface comprises the methods lookahead and consume.
Other methods should not be called from outside of this class.'''
def __init__(self, input_file):
'''Reads the whole input_file to input_string, which remains constant.
current_char_index counts how many characters of input_string have
been consumed.
current_token holds the most recently found token and the
corresponding part of input_string.'''
# source code of the program to be compiled
self.input_string = input_file.read()
# index where the unprocessed part of input_string starts
self.current_char_index = 0
# a pair (most recently read token, matched substring of input_string)
self.current_token = self.get_token()
def skip_white_space(self):
'''Consumes all characters in input_string up to the next
non-white-space character.'''
# white space: [ \t\n\r\f\v]
WHITE_SPACE = ['\n', '\t', '\n', '\r', '\f', '\v', ' ']
while (self.current_char_index < len(self.input_string)) and self.input_string[self.current_char_index] in WHITE_SPACE:
self.current_char_index += 1
def get_token(self):
'''Returns the next token and the part of input_string it matched.
The returned token is None if there is no next token.
The characters up to the end of the token are consumed.'''
self.skip_white_space()
# find the longest prefix of input_string that matches a token
token, longest = None, ''
for (t, r) in Token.token_regexp:
match = re.match(r, self.input_string[self.current_char_index:])
if match and match.end() > len(longest):
token, longest = t, match.group()
# consume the token by moving the index to the end of the matched part
self.current_char_index += len(longest)
return (token, longest)
def lookahead(self):
'''Returns the next token without consuming it.
Returns None if there is no next token.'''
return self.current_token[0]
def consume(self, *tokens):
'''Returns the next token and consumes it, if it is in tokens.
Raises an exception otherwise.
If the token is a number or an identifier, its value is returned
instead of the token.'''
if self.current_token[0] not in tokens:
raise Exception("Invalid Syntax")
ret = None
if self.current_token[0] == Token.ID or self.current_token[0] == Token.NUM:
ret = self.current_token[1]
else:
ret = self.current_token[0]
self.current_token = self.get_token()
return ret
class Token:
# The following enumerates all tokens.
DO = 'DO'
ELSE = 'ELSE'
END = 'END'
IF = 'IF'
THEN = 'THEN'
WHILE = 'WHILE'
SEM = 'SEM'
BEC = 'BEC'
LESS = 'LESS'
EQ = 'EQ'
GRTR = 'GRTR'
LEQ = 'LEQ'
NEQ = 'NEQ'
GEQ = 'GEQ'
ADD = 'ADD'
SUB = 'SUB'
MUL = 'MUL'
DIV = 'DIV'
LPAR = 'LPAR'
RPAR = 'RPAR'
NUM = 'NUM'
ID = 'ID'
WRITE = 'WRITE'
READ = 'READ'
AND = 'AND'
OR = 'OR'
NOT = 'NOT'
# The following list gives the regular expression to match a token.
# The order in the list matters for mimicking Flex behaviour.
# Longer matches are preferred over shorter ones.
# For same-length matches, the first in the list is preferred.
token_regexp = [
(DO, 'do'),
(ELSE, 'else'),
(END, 'end'),
(IF, 'if'),
(THEN, 'then'),
(WHILE, 'while'),
(SEM, ';'),
(BEC, ':='),
(LESS, '<'),
(EQ, '='),
(GRTR, '>'),
(LEQ, '<='),
(NEQ, '!='),
(GEQ, '>='),
(ADD, '[+]'), # + is special in regular expressions
(SUB, '-'),
(MUL, '[*]'),
(DIV, '/'),
(LPAR, '[(]'), # ( is special in regular expressions
(RPAR, '[)]'), # ) is special in regular expressions
(NUM, '[0-9]+'), # NUM in regular expression, inc dot
(WRITE, 'write'),
(READ, 'read'),
(AND, 'and'),
(OR, 'or'),
(NOT, 'not'),
(ID, '[a-z]+'),
]
class Symbol_Table:
'''A symbol table maps identifiers to locations.'''
def __init__(self):
self.symbol_table = {}
def size(self):
'''Returns the number of entries in the symbol table.'''
return len(self.symbol_table)
def location(self, identifier):
'''Returns the location of an identifier. If the identifier is not in
the symbol table, it is entered with a new location. Locations are
numbered sequentially starting with 0.'''
if identifier in self.symbol_table:
return self.symbol_table[identifier]
index = len(self.symbol_table)
self.symbol_table[identifier] = index
return index
class Label:
def __init__(self):
self.current_label = 0
def next(self):
'''Returns a new, unique label.'''
self.current_label += 1
return 'l' + str(self.current_label)
def indent(s, level):
return ' '*level + s + '\n'
# Each of the following classes is a kind of node in the abstract syntax tree.
# indented(level) returns a string that shows the tree levels by indentation.
# code() returns a string with JVM bytecode implementing the tree fragment.
# true_code/false_code(label) jumps to label if the condition is/is not true.
# Execution of the generated code leaves the value of expressions on the stack.
class Program_AST:
def __init__(self, program):
self.program = program
def __repr__(self):
return repr(self.program)
def indented(self, level):
return self.program.indented(level)
def code(self):
program = self.program.code()
local = symbol_table.size()
java_scanner = symbol_table.location('Java Scanner')
return '.class public Program\n' + \
'.super java/lang/Object\n' + \
'.method public <init>()V\n' + \
'aload_0\n' + \
'invokenonvirtual java/lang/Object/<init>()V\n' + \
'return\n' + \
'.end method\n' + \
'.method public static main([Ljava/lang/String;)V\n' + \
'.limit locals ' + str(local) + '\n' + \
'.limit stack 1024\n' + \
'new java/util/Scanner\n' + \
'dup\n' + \
'getstatic java/lang/System.in Ljava/io/InputStream;\n' + \
'invokespecial java/util/Scanner.<init>(Ljava/io/InputStream;)V\n' + \
'astore ' + str(java_scanner) + '\n' + \
program + \
'return\n' + \
'.end method\n'
class Statements_AST:
def __init__(self, statements):
self.statements = statements
def __repr__(self):
result = repr(self.statements[0])
for st in self.statements[1:]:
result += '; ' + repr(st)
return result
def indented(self, level):
result = indent('Statement(s)', level)
for st in self.statements:
result += st.indented(level+1)
return result
def code(self):
result = ''
for st in self.statements:
result += st.code()
return result
class If_AST:
def __init__(self, condition, then):
self.condition = condition
self.then = then
def __repr__(self):
return 'if ' + repr(self.condition) + ' then ' + \
repr(self.then) + ' end'
def indented(self, level):
return indent('If-Then', level) + \
self.condition.indented(level+1) + \
self.then.indented(level+1)
def code(self):
# Rewrite
l1 = label_generator.next()
return self.condition.false_code(l1) + \
self.then.code() + \
l1 + ':\n'
class If_Else_AST:
def __init__(self, condition, if_then, else_then):
self.condition = condition
self.if_then = if_then
self.else_then = else_then
def __repr__(self):
return 'if ' + repr(self.condition) + ' then ' + \
repr(self.if_then) + ' else ' + \
repr(self.else_then) + ' end'
def indented(self, level):
return indent('If-Then-Else', level) + \
self.condition.indented(level + 1) + \
self.if_then.indented(level + 1) + \
self.else_then.indented(level + 1)
def code(self):
# Rewrite
l1 = label_generator.next()
l2 = label_generator.next()
return self.condition.false_code(l1) + \
self.if_then.code() + \
'goto ' + l2 + '\n' + \
l1 + ':\n' + \
self.else_then.code() + \
l2 + ':\n'
class While_AST:
def __init__(self, condition, body):
self.condition = condition
self.body = body
def __repr__(self):
return 'while ' + repr(self.condition) + ' do ' + \
repr(self.body) + ' end'
def indented(self, level):
return indent('While-Do', level) + \
self.condition.indented(level+1) + \
self.body.indented(level+1)
def code(self):
l1 = label_generator.next()
l2 = label_generator.next()
return l1 + ':\n' + \
self.condition.false_code(l2) + \
self.body.code() + \
'goto ' + l1 + '\n' + \
l2 + ':\n'
class Assign_AST:
def __init__(self, identifier, expression):
self.identifier = identifier
self.expression = expression
def __repr__(self):
return repr(self.identifier) + ':=' + repr(self.expression)
def indented(self, level):
return indent('Assign', level) + \
self.identifier.indented(level+1) + \
self.expression.indented(level+1)
def code(self):
loc = symbol_table.location(self.identifier.identifier)
return self.expression.code() + \
'istore ' + str(loc) + '\n'
class Write_AST:
def __init__(self, expression):
self.expression = expression
def __repr__(self):
return 'write ' + repr(self.expression)
def indented(self, level):
return indent('Write', level) + self.expression.indented(level+1)
def code(self):
return 'getstatic java/lang/System/out Ljava/io/PrintStream;\n' + \
self.expression.code() + \
'invokestatic java/lang/String/valueOf(I)Ljava/lang/String;\n' + \
'invokevirtual java/io/PrintStream/println(Ljava/lang/String;)V\n'
class Read_AST:
def __init__(self, identifier):
self.identifier = identifier
def __repr__(self):
return 'read ' + repr(self.identifier)
def indented(self, level):
return indent('Read', level) + self.identifier.indented(level+1)
def code(self):
java_scanner = symbol_table.location('Java Scanner')
loc = symbol_table.location(self.identifier.identifier)
return 'aload ' + str(java_scanner) + '\n' + \
'invokevirtual java/util/Scanner.nextInt()I\n' + \
'istore ' + str(loc) + '\n'
class Comparison_AST:
def __init__(self, left, op, right):
self.left = left
self.op = op
self.right = right
def __repr__(self):
op = { Token.LESS:'<', Token.EQ:'=', Token.GRTR:'>',
Token.LEQ:'<=', Token.NEQ:'!=', Token.GEQ:'>=' }
return repr(self.left) + op[self.op] + repr(self.right)
def indented(self, level):
return indent(self.op, level) + \
self.left.indented(level+1) + \
self.right.indented(level+1)
def true_code(self, label):
op = { Token.LESS:'if_icmplt', Token.EQ:'if_icmpeq',
Token.GRTR:'if_icmpgt', Token.LEQ:'if_icmple',
Token.NEQ:'if_icmpne', Token.GEQ:'if_icmpge' }
return self.left.code() + \
self.right.code() + \
op[self.op] + ' ' + label + '\n'
def false_code(self, label):
# Negate each comparison because of jump to "false" label.
op = { Token.LESS:'if_icmpge', Token.EQ:'if_icmpne',
Token.GRTR:'if_icmple', Token.LEQ:'if_icmpgt',
Token.NEQ:'if_icmpeq', Token.GEQ:'if_icmplt' }
return self.left.code() + \
self.right.code() + \
op[self.op] + ' ' + label + '\n'
class Expression_AST:
def __init__(self, left, op, right):
self.left = left
self.op = op
self.right = right
def __repr__(self):
op = { Token.ADD:'+', Token.SUB:'-', Token.MUL:'*', Token.DIV:'/' }
return '(' + repr(self.left) + op[self.op] + repr(self.right) + ')'
def indented(self, level):
return indent(self.op, level) + \
self.left.indented(level+1) + \
self.right.indented(level+1)
def code(self):
op = { Token.ADD:'iadd', Token.SUB:'isub',
Token.MUL:'imul', Token.DIV:'idiv' }
return self.left.code() + \
self.right.code() + \
op[self.op] + '\n'
class Number_AST:
def __init__(self, number):
self.number = number
def __repr__(self):
return self.number
def indented(self, level):
return indent(self.number, level)
def code(self): # works only for short numbers
return 'sipush ' + self.number + '\n'
class Identifier_AST:
def __init__(self, identifier):
self.identifier = identifier
def __repr__(self):
return self.identifier
def indented(self, level):
return indent(self.identifier, level)
def code(self):
loc = symbol_table.location(self.identifier)
return 'iload ' + str(loc) + '\n'
class BooleanExpression_AST:
def __init__(self, left, op, right):
self.left = left
self.op = op
self.right = right
def __repr__(self):
op = { Token.AND:'and', Token.OR:'or'}
return '(' + repr(self.left) + op[self.op] + repr(self.right) + ')'
def indented(self, level):
return indent(self.op, level) + \
self.left.indented(level+1) + \
self.right.indented(level+1)
def false_code(self, label):
if self.op == Token.AND:
return self.left.false_code(label) + \
self.right.false_code(label)
elif self.op == Token.OR:
l1 = label_generator.next()
return self.left.true_code(l1) + \
self.right.false_code(label) + \
l1 + ':\n'
def true_code(self, label):
if self.op == Token.AND:
l1 = label_generator.next()
return self.left.false_code(l1) + \
self.right.true_code(label) + \
l1 + ':\n'
elif self.op == Token.OR:
return self.left.true_code(label) + \
self.right.true_code(label)
class BooleanNot_AST:
def __init__(self, booleanfactor):
self.booleanfactor = booleanfactor
def __repr__(self):
return '(' + repr(self.left) + 'not' + repr(self.right) + ')'
def indented(self, level):
return indent('not', level) + \
self.booleanfactor.indented(level+1)
def false_code(self, label):
return self.booleanfactor.true_code(label)
def true_code(self, label):
return self.booleanfactor.false_code(label)
# The following methods comprise the recursive-descent parser.
def program():
# Program = Statements
sts = statements()
return Program_AST(sts)
def statements():
# Statements = Statement (; Statement)*
# Include first statement and use while loop to scan any
# further statments
result = [statement()]
while scanner.lookahead() == Token.SEM:
scanner.consume(Token.SEM)
st = statement()
result.append(st)
return Statements_AST(result)
def statement():
# Statement = If | While | Assignment | Write | Read
if scanner.lookahead() == Token.IF:
return if_statement()
elif scanner.lookahead() == Token.WHILE:
return while_statement()
elif scanner.lookahead() == Token.ID:
return assignment()
elif scanner.lookahead() == Token.WRITE:
return write_statement()
elif scanner.lookahead() == Token.READ:
return read_statement()
else: # error
return scanner.consume(Token.IF, Token.WHILE, Token.ID, \
Token.WRITE, Token.READ)
def if_statement():
# If = if Comparison then Statements (end | else Statements end)
scanner.consume(Token.IF)
condition = boolean_expression()
scanner.consume(Token.THEN)
then = statements()
# if Comparison then Statements end
if scanner.lookahead() == Token.END:
scanner.consume(Token.END)
return If_AST(condition, then)
elif scanner.lookahead() == Token.ELSE:
scanner.consume(Token.ELSE)
else_then = statements()
scanner.consume(Token.END)
return If_Else_AST(condition, then, else_then)
else: # error
return scanner.consume(Token.END, Token.ELSE)
def while_statement():
# While = while Comparison do Statements end
scanner.consume(Token.WHILE)
condition = boolean_expression()
scanner.consume(Token.DO)
body = statements()
scanner.consume(Token.END)
return While_AST(condition, body)
def boolean_expression():
be = boolean_term()
while scanner.lookahead() == Token.OR:
op = scanner.consume(Token.OR)
tree = boolean_term()
be = BooleanExpression_AST(be, op, tree)
return be
def boolean_term():
bt = boolean_factor()
while scanner.lookahead() == Token.AND:
op = scanner.consume(Token.AND)
tree = boolean_factor()
bt = BooleanExpression_AST(bt, op, tree)
return bt
def boolean_factor():
if scanner.lookahead() == Token.NOT:
op = scanner.consume(Token.NOT)
bf = boolean_factor()
return BooleanNot_AST(bf)
else:
return comparison()
def assignment():
# Assignment = identifier := Expression
ident = identifier()
scanner.consume(Token.BEC)
expr = expression()
return Assign_AST(ident, expr)
def read_statement():
scanner.consume(Token.READ)
ident = identifier()
return Read_AST(ident)
def write_statement():
scanner.consume(Token.WRITE)
expr = expression()
return Write_AST(expr)
def comparison():
# Comparison = Expression Relation Expression
left = expression()
op = scanner.consume(Token.LESS, Token.EQ, Token.GRTR,
Token.LEQ, Token.NEQ, Token.GEQ)
right = expression()
return Comparison_AST(left, op, right)
def expression():
# Expression = Term ((+ | -) Term)*
result = term()
while scanner.lookahead() in [Token.ADD, Token.SUB]:
op = scanner.consume(Token.ADD, Token.SUB)
tree = term()
result = Expression_AST(result, op, tree)
return result
def term():
# Term = Factor ((* | /) Factor)*
result = factor()
while scanner.lookahead() in [Token.MUL, Token.DIV]:
op = scanner.consume(Token.MUL, Token.DIV)
tree = factor()
result = Expression_AST(result, op, tree)
return result
def factor():
# Factor = (Expression) | number | identifier
if scanner.lookahead() == Token.LPAR:
scanner.consume(Token.LPAR)
result = expression()
scanner.consume(Token.RPAR)
return result
elif scanner.lookahead() == Token.NUM:
value = scanner.consume(Token.NUM)
return Number_AST(value)
elif scanner.lookahead() == Token.ID:
return identifier()
else: # error
return scanner.consume(Token.LPAR, Token.NUM, Token.ID)
def identifier():
# Anything except for reserved words and symbols
value = scanner.consume(Token.ID)
return Identifier_AST(value)
# Initialise scanner, symbol table and label generator.
scanner = Scanner(sys.stdin)
symbol_table = Symbol_Table()
symbol_table.location('Java Scanner') # fix a location for the Java Scanner
label_generator = Label()
# Uncomment the following to test the scanner without the parser.
# Show all tokens in the input.
#
# token = scanner.lookahead()
# while token != None:
# print(scanner.consume(token))
# token = scanner.lookahead()
# exit()
# Call the parser.
ast = program()
if scanner.lookahead() != None:
raise Exception('end of input expected but token ' + repr(scanner.lookahead()) + ' found')
# Uncomment the following to test the parser without the code generator.
# Show the syntax tree with levels indicated by indentation.
#
#print(ast.indented(0), end='')
#exit()
# Call the code generator.
# Translate the abstract syntax tree to JVM bytecode.
# It can be assembled to a class file by Jasmin: http://jasmin.sourceforge.net/
print(ast.code(), end='')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment