alexaleluia12/compiler.py

## compiler.py
# font
# http://chimera.labs.oreilly.com/books/1230000000393/ch02.html#_writing_a_simple_recursive_descent_parser

"""
grammar

    expr ::= expr + term
         |   expr - term
         |   term

    term ::= term * factor
         |   term / factor
         |   factor

    factor ::= ( expr )
           |   NUM
"""

import re
import collections

# Token specification
NUM    = r'(?P<NUM>\d+)'
PLUS   = r'(?P<PLUS>\+)'
MINUS  = r'(?P<MINUS>-)'
TIMES  = r'(?P<TIMES>\*)'
DIVIDE = r'(?P<DIVIDE>/)'
LPAREN = r'(?P<LPAREN>\()'
RPAREN = r'(?P<RPAREN>\))'
WS     = r'(?P<WS>\s+)'

master_pat = re.compile('|'.join([NUM, PLUS, MINUS, TIMES,
                                  DIVIDE, LPAREN, RPAREN, WS]))

# Tokenizer
Token = collections.namedtuple('Token', ['type','value'])

def generate_tokens(text):
    scanner = master_pat.scanner(text)
    # iter(callable, sentinel) callable is called until it returns the sentinel.
    for m in iter(scanner.match, None):
        tok = Token(m.lastgroup, m.group())
        if tok.type != 'WS':
            yield tok

# Parser
class ExpressionEvaluator:
    '''
    Implementation of a recursive descent parser.   Each method
    implements a single grammar rule.  Use the ._accept() method
    to test and accept the current lookahead token.  Use the ._expect()
    method to exactly match and discard the next token on on the input
    (or raise a SyntaxError if it doesn't match).
    '''

    def parse(self,text):
        self.tokens = generate_tokens(text)
        self.tok = None             # Last symbol consumed
        self.nexttok = None         # Next symbol tokenized
        self._advance()             # Load first lookahead token
        return self.expr()

    def _advance(self):
        'Advance one token ahead'
        self.tok, self.nexttok = self.nexttok, next(self.tokens, None)

    def _accept(self,toktype):
        'Test and consume the next token if it matches toktype'
        if self.nexttok and self.nexttok.type == toktype:
            self._advance()
            return True
        else:
            return False

    def _expect(self,toktype):
        'Consume next token if it matches toktype or raise SyntaxError'
        if not self._accept(toktype):
            raise SyntaxError('Expected ' + toktype)

    # Grammar rules follow

    def expr(self):
        "expression ::= term { ('+'|'-') term }*"

        exprval = self.term()
        while self._accept('PLUS') or self._accept('MINUS'):
            op = self.tok.type
            right = self.term()
            if op == 'PLUS':
                exprval += right
            elif op == 'MINUS':
                exprval -= right
        return exprval

    def term(self):
        "term ::= factor { ('*'|'/') factor }*"

        termval = self.factor()
        while self._accept('TIMES') or self._accept('DIVIDE'):
            op = self.tok.type
            right = self.factor()
            if op == 'TIMES':
                termval *= right
            elif op == 'DIVIDE':
                termval /= right
        return termval

    def factor(self):
        "factor ::= NUM | ( expr )"

        if self._accept('NUM'):
            return int(self.tok.value)
        elif self._accept('LPAREN'):
            exprval = self.expr()
            self._expect('RPAREN')
            return exprval
        else:
            raise SyntaxError('Expected NUMBER or LPAREN')

# use
"""
>>> e = ExpressionEvaluator()
>>> e.parse('2')
2
>>> e.parse('2 + 3')
5
>>> e.parse('2 + 3 * 4')
14
>>> e.parse('2 + (3 + 4) * 5')
37
>>> e.parse('2 + (3 + * 4)')
"""

# to build a parse tree

class ExpressionTreeBuilder(ExpressionEvaluator):
    def expr(self):
        "expression ::= term { ('+'|'-') term }"

        exprval = self.term()
        while self._accept('PLUS') or self._accept('MINUS'):
            op = self.tok.type
            right = self.term()
            if op == 'PLUS':
                exprval = ('+', exprval, right)
            elif op == 'MINUS':
                exprval = ('-', exprval, right)
        return exprval

    def term(self):
        "term ::= factor { ('*'|'/') factor }"

        termval = self.factor()
        while self._accept('TIMES') or self._accept('DIVIDE'):
            op = self.tok.type
            right = self.factor()
            if op == 'TIMES':
                termval = ('*', termval, right)
            elif op == 'DIVIDE':
                termval = ('/', termval, right)
        return termval

    def factor(self):
        'factor ::= NUM | ( expr )'

        if self._accept('NUM'):
            return int(self.tok.value)
        elif self._accept('LPAREN'):
            exprval = self.expr()
            self._expect('RPAREN')
            return exprval
        else:
            raise SyntaxError('Expected NUMBER or LPAREN')

#use
"""
>>> e = ExpressionTreeBuilder()
>>> e.parse('2 + 3')
('+', 2, 3)
>>> e.parse('2 + 3 * 4')
('+', 2, ('*', 3, 4))
>>> e.parse('2 + (3 + 4) * 5')
('+', 2, ('*', ('+', 3, 4), 5))
>>> e.parse('2 + 3 + 4')
('+', ('+', 2, 3), 4)
>>>
"""
	# font
	# http://chimera.labs.oreilly.com/books/1230000000393/ch02.html#_writing_a_simple_recursive_descent_parser

	"""
	grammar

	expr ::= expr + term
	\| expr - term
	\| term

	term ::= term * factor
	\| term / factor
	\| factor

	factor ::= ( expr )
	\| NUM
	"""

	import re
	import collections

	# Token specification
	NUM = r'(?P<NUM>\d+)'
	PLUS = r'(?P<PLUS>\+)'
	MINUS = r'(?P<MINUS>-)'
	TIMES = r'(?P<TIMES>\*)'
	DIVIDE = r'(?P<DIVIDE>/)'
	LPAREN = r'(?P<LPAREN>\()'
	RPAREN = r'(?P<RPAREN>\))'
	WS = r'(?P<WS>\s+)'

	master_pat = re.compile('\|'.join([NUM, PLUS, MINUS, TIMES,
	DIVIDE, LPAREN, RPAREN, WS]))

	# Tokenizer
	Token = collections.namedtuple('Token', ['type','value'])

	def generate_tokens(text):
	scanner = master_pat.scanner(text)
	# iter(callable, sentinel) callable is called until it returns the sentinel.
	for m in iter(scanner.match, None):
	tok = Token(m.lastgroup, m.group())
	if tok.type != 'WS':
	yield tok

	# Parser
	class ExpressionEvaluator:
	'''
	Implementation of a recursive descent parser. Each method
	implements a single grammar rule. Use the ._accept() method
	to test and accept the current lookahead token. Use the ._expect()
	method to exactly match and discard the next token on on the input
	(or raise a SyntaxError if it doesn't match).
	'''

	def parse(self,text):
	self.tokens = generate_tokens(text)
	self.tok = None # Last symbol consumed
	self.nexttok = None # Next symbol tokenized
	self._advance() # Load first lookahead token
	return self.expr()

	def _advance(self):
	'Advance one token ahead'
	self.tok, self.nexttok = self.nexttok, next(self.tokens, None)

	def _accept(self,toktype):
	'Test and consume the next token if it matches toktype'
	if self.nexttok and self.nexttok.type == toktype:
	self._advance()
	return True
	else:
	return False

	def _expect(self,toktype):
	'Consume next token if it matches toktype or raise SyntaxError'
	if not self._accept(toktype):
	raise SyntaxError('Expected ' + toktype)

	# Grammar rules follow

	def expr(self):
	"expression ::= term { ('+'\|'-') term }*"

	exprval = self.term()
	while self._accept('PLUS') or self._accept('MINUS'):
	op = self.tok.type
	right = self.term()
	if op == 'PLUS':
	exprval += right
	elif op == 'MINUS':
	exprval -= right
	return exprval

	def term(self):
	"term ::= factor { (''\|'/') factor }"

	termval = self.factor()
	while self._accept('TIMES') or self._accept('DIVIDE'):
	op = self.tok.type
	right = self.factor()
	if op == 'TIMES':
	termval *= right
	elif op == 'DIVIDE':
	termval /= right
	return termval

	def factor(self):
	"factor ::= NUM \| ( expr )"

	if self._accept('NUM'):
	return int(self.tok.value)
	elif self._accept('LPAREN'):
	exprval = self.expr()
	self._expect('RPAREN')
	return exprval
	else:
	raise SyntaxError('Expected NUMBER or LPAREN')

	# use
	"""
	>>> e = ExpressionEvaluator()
	>>> e.parse('2')
	2
	>>> e.parse('2 + 3')
	5
	>>> e.parse('2 + 3 * 4')
	14
	>>> e.parse('2 + (3 + 4) * 5')
	37
	>>> e.parse('2 + (3 + * 4)')
	"""

	# to build a parse tree

	class ExpressionTreeBuilder(ExpressionEvaluator):
	def expr(self):
	"expression ::= term { ('+'\|'-') term }"

	exprval = self.term()
	while self._accept('PLUS') or self._accept('MINUS'):
	op = self.tok.type
	right = self.term()
	if op == 'PLUS':
	exprval = ('+', exprval, right)
	elif op == 'MINUS':
	exprval = ('-', exprval, right)
	return exprval

	def term(self):
	"term ::= factor { ('*'\|'/') factor }"

	termval = self.factor()
	while self._accept('TIMES') or self._accept('DIVIDE'):
	op = self.tok.type
	right = self.factor()
	if op == 'TIMES':
	termval = ('*', termval, right)
	elif op == 'DIVIDE':
	termval = ('/', termval, right)
	return termval

	def factor(self):
	'factor ::= NUM \| ( expr )'

	if self._accept('NUM'):
	return int(self.tok.value)
	elif self._accept('LPAREN'):
	exprval = self.expr()
	self._expect('RPAREN')
	return exprval
	else:
	raise SyntaxError('Expected NUMBER or LPAREN')

	#use
	"""
	>>> e = ExpressionTreeBuilder()
	>>> e.parse('2 + 3')
	('+', 2, 3)
	>>> e.parse('2 + 3 * 4')
	('+', 2, ('*', 3, 4))
	>>> e.parse('2 + (3 + 4) * 5')
	('+', 2, ('*', ('+', 3, 4), 5))
	>>> e.parse('2 + 3 + 4')
	('+', ('+', 2, 3), 4)
	>>>
	"""