Skip to content

Instantly share code, notes, and snippets.

@jagt
Last active December 16, 2015 17:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jagt/5473794 to your computer and use it in GitHub Desktop.
Save jagt/5473794 to your computer and use it in GitHub Desktop.
Simple LL(1) lexer and parser for a random data structure
# Simple LL(1) lexer and parser for a random data structure
# Based on description on book Language Implementation Patterns
class Lexer(object):
WS, NAME, STRING, NUM, LBRACK, RBRACK, LBRACE, RBRACE, COMMA, EQUAL, COMMENT,END = range(12);
EOF = '' # StringIO gives '' when nothing to read
def __init__(self, inio):
self.inio = inio
self.consume()
def consume(self):
self.char = self.inio.read(1)
def match(self, c):
if (self.char == c):
self.consume()
else:
raise RuntimeError("Expecting %s, getting %s" % (c, self.char))
def next_token(self):
while (self.char != Lexer.EOF):
while self.char != Lexer.EOF and self.char in ' \n\t\r': self.consume() # consume all white space
if self.char == '[': self.consume(); return (Lexer.LBRACK, '[')
elif self.char == ']': self.consume(); return (Lexer.RBRACK, ']')
elif self.char == '{': self.consume(); return (Lexer.LBRACE, '{')
elif self.char == '}': self.consume(); return (Lexer.RBRACE, '}')
elif self.char == '=': self.consume(); return (Lexer.EQUAL, '=')
elif self.char == ',': self.consume(); return (Lexer.COMMA, ',')
elif self.char.isalpha():
namels = [self.char]; self.consume()
while self.char.isalpha() or self.char.isdigit() or self.char == '_':
namels.append(self.char)
self.consume()
return (Lexer.NAME, ''.join(namels))
elif self.char.isdigit() or self.char == '.':
numls = [self.char]; self.consume()
while self.char.isdigit() or self.char == '.':
numls.append(self.char)
self.consume()
return (Lexer.NUM, ''.join(numls))
elif self.char in '"\'':
strls = [self.char]; self.consume();
while self.char != strls[0]:
strls.append(self.char)
self.consume()
strls.append(self.char) # add closing char
self.consume()
return (Lexer.STRING, ''.join(strls))
elif self.char == '/':
self.consume()
self.match('/')
while self.char != '\n':
self.consume()
return (Lexer.COMMENT, "#comment")
else:
raise RuntimeError("Illegal character %s" % self.char)
return (Lexer.END, '')
class Parser(object):
class ParseTreeNode(list):
def __init__(self, name):
list.__init__(self)
self.name = name
def __str__(self):
return self.name + ':' + list.__str__(self)
def __repr__(self):
return self.name + ':' + list.__repr__(self)
def __init__(self, lexer):
self.lexer = lexer
self.consume()
self.parse_tree_root = None
self.parse_tree_current = None
def consume(self):
self.tok = lexer.next_token()
if self.test(Lexer.COMMENT): # ignore as whitespace
self.consume()
def match(self, tokid):
if self.test(tokid):
val = self.tok[1]
self.consume();
return val
else:
self.error(tokid)
def test(self, tokid):
return self.tok[0] == tokid
def print_parse(func):
def _wrapper(self):
print '# parsing %s' % func.func_name
val = func(self)
print '# end parsing %s' % func.func_name
return val
_wrapper.func_name = func.func_name
return _wrapper
# build very basic parse tree through decorator
def build_parse_tree(func):
def _wrapper(self):
node = Parser.ParseTreeNode(func.func_name)
if self.parse_tree_root is None:
self.parse_tree_current = self.parse_tree_root = node
else:
self.parse_tree_current.append(node)
before = self.parse_tree_current
self.parse_tree_current = node
val = func(self)
self.parse_tree_current = before
return val
_wrapper.func_name = func.func_name
return _wrapper
def error(self, tokid):
raise RuntimeError("Expecting tokken %d, getting %d" % (tokid, self.tok[0]))
@build_parse_tree
@print_parse
def record(self):
name = self.match(Lexer.NAME)
self.match(Lexer.EQUAL)
value = self.value()
return (name , value)
@build_parse_tree
@print_parse
def literal(self):
if self.test(Lexer.STRING):
return self.match(Lexer.STRING)
elif self.test(Lexer.NUM):
num = self.match(Lexer.NUM)
if '.' in num:
return float(num)
else:
return int(num)
else:
self.error(-1)
@build_parse_tree
@print_parse
def list(self):
ls = []
self.match(Lexer.LBRACK)
if self.test(Lexer.RBRACK):
self.match(Lexer.RBRACK)
else:
ls.append(self.literal())
while self.test(Lexer.COMMA):
self.match(Lexer.COMMA)
ls.append(self.literal())
self.match(Lexer.RBRACK)
return ls;
@build_parse_tree
@print_parse
def value(self):
if self.test(Lexer.LBRACE):
value = {}
self.match(Lexer.LBRACE);
while self.test(Lexer.NAME):
record = self.record();
value[record[0]] = record[1]
self.match(Lexer.RBRACE);
elif self.test(Lexer.LBRACK):
value = self.list()
elif self.test(Lexer.STRING) or self.test(Lexer.NUM):
value = self.literal();
else:
self.error(-1)
return value
def start(self):
self.parse_tree_root = self.parse_tree_current = None
return self.record()
if __name__ == '__main__':
s = '''
NAME = {
KEY1 = "Value"
KEY2 = ["These are my twisted words", 253, 20.0]
NEST_NAME = {
// single line comment
KEY3 = "Value"
KEY4 = []
}
KEY5 = [253, "Daily Days"]
KEY8 = ["foo", 253]
}
'''
from StringIO import StringIO
from pprint import pprint
lexer = Lexer(StringIO(s))
parser = Parser(lexer)
pprint(parser.start())
pprint(parser.parse_tree_root)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment