Last active
December 16, 2015 17:58
-
-
Save jagt/5473794 to your computer and use it in GitHub Desktop.
Simple LL(1) lexer and parser for a random data structure
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Simple LL(1) lexer and parser for a random data structure | |
# Based on description on book Language Implementation Patterns | |
class Lexer(object): | |
WS, NAME, STRING, NUM, LBRACK, RBRACK, LBRACE, RBRACE, COMMA, EQUAL, COMMENT,END = range(12); | |
EOF = '' # StringIO gives '' when nothing to read | |
def __init__(self, inio): | |
self.inio = inio | |
self.consume() | |
def consume(self): | |
self.char = self.inio.read(1) | |
def match(self, c): | |
if (self.char == c): | |
self.consume() | |
else: | |
raise RuntimeError("Expecting %s, getting %s" % (c, self.char)) | |
def next_token(self): | |
while (self.char != Lexer.EOF): | |
while self.char != Lexer.EOF and self.char in ' \n\t\r': self.consume() # consume all white space | |
if self.char == '[': self.consume(); return (Lexer.LBRACK, '[') | |
elif self.char == ']': self.consume(); return (Lexer.RBRACK, ']') | |
elif self.char == '{': self.consume(); return (Lexer.LBRACE, '{') | |
elif self.char == '}': self.consume(); return (Lexer.RBRACE, '}') | |
elif self.char == '=': self.consume(); return (Lexer.EQUAL, '=') | |
elif self.char == ',': self.consume(); return (Lexer.COMMA, ',') | |
elif self.char.isalpha(): | |
namels = [self.char]; self.consume() | |
while self.char.isalpha() or self.char.isdigit() or self.char == '_': | |
namels.append(self.char) | |
self.consume() | |
return (Lexer.NAME, ''.join(namels)) | |
elif self.char.isdigit() or self.char == '.': | |
numls = [self.char]; self.consume() | |
while self.char.isdigit() or self.char == '.': | |
numls.append(self.char) | |
self.consume() | |
return (Lexer.NUM, ''.join(numls)) | |
elif self.char in '"\'': | |
strls = [self.char]; self.consume(); | |
while self.char != strls[0]: | |
strls.append(self.char) | |
self.consume() | |
strls.append(self.char) # add closing char | |
self.consume() | |
return (Lexer.STRING, ''.join(strls)) | |
elif self.char == '/': | |
self.consume() | |
self.match('/') | |
while self.char != '\n': | |
self.consume() | |
return (Lexer.COMMENT, "#comment") | |
else: | |
raise RuntimeError("Illegal character %s" % self.char) | |
return (Lexer.END, '') | |
class Parser(object): | |
class ParseTreeNode(list): | |
def __init__(self, name): | |
list.__init__(self) | |
self.name = name | |
def __str__(self): | |
return self.name + ':' + list.__str__(self) | |
def __repr__(self): | |
return self.name + ':' + list.__repr__(self) | |
def __init__(self, lexer): | |
self.lexer = lexer | |
self.consume() | |
self.parse_tree_root = None | |
self.parse_tree_current = None | |
def consume(self): | |
self.tok = lexer.next_token() | |
if self.test(Lexer.COMMENT): # ignore as whitespace | |
self.consume() | |
def match(self, tokid): | |
if self.test(tokid): | |
val = self.tok[1] | |
self.consume(); | |
return val | |
else: | |
self.error(tokid) | |
def test(self, tokid): | |
return self.tok[0] == tokid | |
def print_parse(func): | |
def _wrapper(self): | |
print '# parsing %s' % func.func_name | |
val = func(self) | |
print '# end parsing %s' % func.func_name | |
return val | |
_wrapper.func_name = func.func_name | |
return _wrapper | |
# build very basic parse tree through decorator | |
def build_parse_tree(func): | |
def _wrapper(self): | |
node = Parser.ParseTreeNode(func.func_name) | |
if self.parse_tree_root is None: | |
self.parse_tree_current = self.parse_tree_root = node | |
else: | |
self.parse_tree_current.append(node) | |
before = self.parse_tree_current | |
self.parse_tree_current = node | |
val = func(self) | |
self.parse_tree_current = before | |
return val | |
_wrapper.func_name = func.func_name | |
return _wrapper | |
def error(self, tokid): | |
raise RuntimeError("Expecting tokken %d, getting %d" % (tokid, self.tok[0])) | |
@build_parse_tree | |
@print_parse | |
def record(self): | |
name = self.match(Lexer.NAME) | |
self.match(Lexer.EQUAL) | |
value = self.value() | |
return (name , value) | |
@build_parse_tree | |
@print_parse | |
def literal(self): | |
if self.test(Lexer.STRING): | |
return self.match(Lexer.STRING) | |
elif self.test(Lexer.NUM): | |
num = self.match(Lexer.NUM) | |
if '.' in num: | |
return float(num) | |
else: | |
return int(num) | |
else: | |
self.error(-1) | |
@build_parse_tree | |
@print_parse | |
def list(self): | |
ls = [] | |
self.match(Lexer.LBRACK) | |
if self.test(Lexer.RBRACK): | |
self.match(Lexer.RBRACK) | |
else: | |
ls.append(self.literal()) | |
while self.test(Lexer.COMMA): | |
self.match(Lexer.COMMA) | |
ls.append(self.literal()) | |
self.match(Lexer.RBRACK) | |
return ls; | |
@build_parse_tree | |
@print_parse | |
def value(self): | |
if self.test(Lexer.LBRACE): | |
value = {} | |
self.match(Lexer.LBRACE); | |
while self.test(Lexer.NAME): | |
record = self.record(); | |
value[record[0]] = record[1] | |
self.match(Lexer.RBRACE); | |
elif self.test(Lexer.LBRACK): | |
value = self.list() | |
elif self.test(Lexer.STRING) or self.test(Lexer.NUM): | |
value = self.literal(); | |
else: | |
self.error(-1) | |
return value | |
def start(self): | |
self.parse_tree_root = self.parse_tree_current = None | |
return self.record() | |
if __name__ == '__main__': | |
s = ''' | |
NAME = { | |
KEY1 = "Value" | |
KEY2 = ["These are my twisted words", 253, 20.0] | |
NEST_NAME = { | |
// single line comment | |
KEY3 = "Value" | |
KEY4 = [] | |
} | |
KEY5 = [253, "Daily Days"] | |
KEY8 = ["foo", 253] | |
} | |
''' | |
from StringIO import StringIO | |
from pprint import pprint | |
lexer = Lexer(StringIO(s)) | |
parser = Parser(lexer) | |
pprint(parser.start()) | |
pprint(parser.parse_tree_root) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment