Skip to content

Instantly share code, notes, and snippets.

@tangentstorm
Created April 17, 2016 17:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tangentstorm/d015c830b17a768aa13b63683db4a751 to your computer and use it in GitHub Desktop.
Save tangentstorm/d015c830b17a768aa13b63683db4a751 to your computer and use it in GitHub Desktop.
"""
>>> ebnf_sc.scan('x = A | b')
[('x', 'IDENT', 0), ('=', '=', 2), ('A', 'IDENT', 4), ('|', '|', 6), ('b', 'IDENT', 8)]
"""
from collections import namedtuple
from warnings import warn
import string
def T(tag, doc, args):
"""Creates a new tuple type."""
res = namedtuple(tag, args)
if doc: res.__doc__+=' : '+doc
return res
def TB(tag, doc, args=['body']): return T(tag, doc, args)
def TI(tag, doc, args=['item']): return T(tag, doc, args)
def TN(tag, doc, args=['name']): return T(tag, doc, args)
def T2(tag, doc, args=['name','body']): return T(tag, doc, args)
Gram = T('Gram', 'contains grammar rules (may inherit from `bases`).',
['name', 'bases', 'doc', 'body'])
Def = T('Def', 'define a named rule.', ['name','body'])
Ref = TN('Ref', 'refer to (invoke) a named rule')
Any = T('Any', 'match anything', [])
Not = TI('Not', 'fail if the pattern would match, but do not consume')
Skip = TI('Skip', 'match the pattern, but hide it from other rules')
Emp = T('Emp', 'empty pattern (always matches)', [])
Emp = Emp() # since it doesn't need arguments
Lit = TI('Lit', 'match literal character (using ==)')
Str = TI('Str', 'match a string of literals')
Seq = TB('Seq', 'match a sequence of patterns')
Grp = TB('Grp', 'same as Seq, but renders in parentheses')
Alt = TB('Alt', 'match any of the alternatives')
Rep = TI('Rep', 'match 1 or more repetitions.')
Opt = TI('Opt', 'match 0 or 1 repetitions.')
Orp = TI('Orp', 'match 0 or more repetitions.')
Var = T2('Var', 'save matched string in a variable.')
Val = TN('Val', 'match against the saved value.')
New = T2('New', 'build a new class/tuple instance')
Arg = TB('Arg', 'pass matched data as arg to containing "New"')
def node_type(node):
return node.__class__.__name__
class Dispatcher:
"""Provides a simple generic dispatch mechanism based on method names"""
def _find_handler(self, prefix, node):
"""(prefix, namedtuple) -> callable"""
return getattr(self, '_'.join([prefix, node_type(node)]), self._unhandled)
def _unhandled(self, node, *a, **kw):
"""Warn about unrecognized node types. (Just for development.)"""
raise ValueError("no handler found for %s" % node_type(node))
def dispatch(self, prefix, node, *a, **kw):
"""Find and invoke a handler for the given node."""
h = self._find_handler(prefix, node)
return h(node, *a, **kw)
class Cursor:
def __init__(self, seq:[any]):
self.seq = seq # sequence (probably a string)
self.val = None # current value
self.pos = -1 # current position
self.fwd()
def fwd(self)->any:
"""Move forward in the sequence and return the next item."""
end = len(self.seq)
self.pos = min(self.pos+1, end)
self.val = None if self.pos == end else self.seq[self.pos]
return self
def at_end(self)->bool:
"""Are we at the end of the sequence?"""
return self.val is None
Match = namedtuple("Match", ['txt', 'pos'])
Match.__doc__ = "Match Result"
class Fail:
"""Value to indicate failure."""
def __repr__(self):
return "FAIL"
FAIL = Fail()
class M(namedtuple("M", ['val', 'cur', 'env'])):
"""Internal Match State"""
@property
def matched(self):
return self.val is not FAIL
class Matcher(Dispatcher):
"""
A simple matcher for regular languages.
>>> m = Matcher(Alt([Lit("x"), Lit("y")])) # like re.compile("x|y")
>>> m.match('xyz') # should match "x" at position 0
Match(txt='x', pos=0)
>>> m.match("zyx") # should fail to match
FAIL
>>> Matcher(Emp).match("hello")
Match(txt='', pos=0)
>>> m = Matcher(Seq([Lit("a"), Alt([Lit("a"), Lit("b")])]))
>>> m.match("ab")
Match(txt='ab', pos=0)
>>> m.match("ac")
FAIL
>>> Matcher(Str("hello")).match("hello")
Match(txt='hello', pos=0)
>>> Matcher(Rep(Lit("a"))).match("aaabbbccc")
Match(txt='aaa', pos=0)
>>> m = Matcher(Opt(Lit("a")))
>>> m.match("abc")
Match(txt='a', pos=0)
>>> m.match("xyz")
Match(txt='', pos=0)
>>> m = Matcher(Orp(Lit("a")))
>>> m.match("aaabc")
Match(txt='aaa', pos=0)
>>> m.match("xyz")
Match(txt='', pos=0)
"""
def __init__(self, node):
self.root = node
def _match(self, node, cur, env):
"""returns a match state tuple (the `M` class)"""
return self.dispatch('match', node, cur, env)
def match(self, s:str):
cur = Cursor(s)
env = {}
return self._match(self.root, cur, env).val
# class Matcher:
def match_Lit(self, node, cur, env):
return (M(Match(cur.val, cur.pos), cur.fwd(), env) if cur.val == node.item
else M(FAIL, cur, env))
# class Matcher:
def match_Alt(self, node, cur, env):
for item in node.body:
m = self._match(item, cur, env)
if m.matched: return m
return m # last failure
# class Matcher:
def match_Emp(self, node, cur, env):
return M(Match("", cur.pos), cur, env)
def _join(self, matches):
"""helper to join match results for Seq and Str"""
if matches is FAIL: return FAIL
else: return Match(''.join(v.txt for v in matches), matches[0].pos)
def match_Seq(self, node, cur, env):
vals = []
for item in node.body:
res = self._match(item, cur, env)
if res.val is FAIL: return M(FAIL, res.cur, env)
else:
val, cur, env = res
vals.append(val)
return M(self._join(vals), res.cur, env)
def match_Str(self, node, cur, env):
return self._match(Seq([Lit(c) for c in node.item]), cur, env)
def match_Rep(self, node, cur, env):
vals = []
while True:
res = self._match(node.item, cur, env)
if res.val is FAIL: break
else:
val, cur, env = res
vals.append(val)
return M(self._join(vals or FAIL), cur, env)
def match_Opt(self, node, cur, env):
return self._match(Alt([node.item, Emp]), cur, env)
def match_Orp(self, node, cur, env):
return self._match(Opt(Rep(node.item)), cur, env)
class Scanner:
def __init__(self, rules: [(str, namedtuple)]):
self.order = [rule[0] for rule in rules] # test rules in given order
self.rules = dict(rules)
# default whitespace handler:
if '_' not in rules:
self.order.insert(0, '_')
self.rules['_'] = Alt([Lit(chr(i)) for i in range(33)])
def gen_tokens(self, txt):
cur = Cursor(txt)
env = {}
matcher = Matcher(Emp)
while not cur.at_end():
for rule in self.order:
m = matcher._match(self.rules[rule], cur, env)
if m.matched:
match, cur, env = m
if rule != '_':
yield (match.txt, rule, match.pos)
break
else: continue
else:
raise ValueError("unrecognized character at position %i : '%s'"
% (cur.pos, cur.val))
def scan(self, txt):
return list(self.gen_tokens(txt))
ebnf_src = '''\
main = { rule } .
rule = IDENT "=" expr "." .
expr = term { "|" term } .
term = factor { factor } .
factor = IDENT | STRING | "{" expr "}" | "[" expr "]" | "(" expr ")" .
'''
ECHR, SQ, DQ = ['\\', "'", '"']
LETTER = Alt([Lit(ch) for ch in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'])
STRCHR = Alt([Seq([Lit(ECHR), Alt([ Lit(ECHR), Lit(DQ) ])]),
Alt([Lit(ch) for ch in map(chr, range(32,127)) if ch not in '"\\'])])
ebnf_sc = Scanner([(ch, Lit(ch)) for ch in "{([=|.])}"]
+ [('IDENT', Rep(LETTER)),
('STRING', Alt([Seq([Lit(DQ), Rep(STRCHR), Lit(DQ)])]))])
base = Gram('ebnf', [], "rules common to all grammars", [
Def('main', Orp('token')),
Def('token',Seq([Skip(Orp(Ref('space'))),
Alt([Ref('STRING'), Ref('NUMBER'),
Ref('IDENT'), Ref('DELIM'),
Rep(Not(Ref('space')))])])),
Def('space', Orp('White')),
# character classes:
Def('White', Alt([chr(c) for c in range(33)])),
Def('Upper', Alt(list(string.ascii_uppercase))),
Def('Lower', Alt(list(string.ascii_lowercase))),
Def('Alpha', Alt([Ref('Lower'), Ref('Upper')])),
Def('Under', Lit('_')),
Def('Neg', Lit('-')),
Def('Digit', Alt([Lit(c) for c in string.digits])),
Def('Hexit', Alt([Ref('Digit')]+[Lit(c) for c in 'abcdefABCDEF'])),
Def('Alnum', Alt([Ref('Under'), Ref('Alpha'), Ref('Digit')])),
# simple patterns:
Def('IDENT', Seq([Alt([Ref('Under'),Ref('Alpha')]), Orp(Ref('Alnum'))])),
Def('NUMBER',Seq([Opt(Ref('Neg')), Rep(Ref('Digit')),
Orp([Ref('Under'),
Ref('Digit'),Ref('Digit'),Ref('Digit')])])),
Def('STRING', Seq([Lit(DQ), Rep(Ref('STRCHR')), Lit(DQ)])),
Def('STRCHR', Alt([Seq([Lit(ECHR), Alt([ Lit(ECHR), Lit(DQ) ])]),
Not(DQ) ])),
Def('DELIM', Alt(list('(){}[]'))),
])
ebnf = Gram('ebnf', [base], "ebnf meta-grammar (for parsing grammars)", [
Def('main', Orp(Ref('rule'))),
Def('rule', Seq([Var('name', Ref('IDENT')),
Lit('='), Ref('expr'), Lit('.') ])),
Def('expr', Seq([ Ref('term'), Orp([Lit('|'), Ref('term') ]) ])),
Def('term', Seq([ Ref('factor'), Rep(Ref('factor')) ])),
Def('factor', Alt([Ref('IDENT'), Ref('STRING'),
Ref('rep'), Ref('opt'), Ref('grp')])),
Def('rep', Seq([Lit('{'), New(Rep, Ref('expr')), Lit('}')])), # 'x*'
Def('opt', Seq([Lit('['), New(Opt, Ref('expr')), Lit(']')])), # 'x?'
Def('grp', Seq([Lit('('), New(Grp, Ref('expr')), Lit(')')])), # '(x)'
])
HOME = {} # arbitrary dictionary object
class World(dict):
def __init__(self, proto=HOME):
super(World, self).__init__()
self.proto = proto
def __getattr__(self, name):
# called when attribute has no local definition.
return getattr(self.proto, name)
def __getitem__(self, key):
if key in self.keys(): return super(World, self)[key]
else: return self.proto[key]
def changed(self, key, val):
"""Forks a new world, with one key changed."""
res = World(self)
res[key] = val
return res
class Grin(Dispatcher):
"""Grammar Interpreter"""
def __init__(self, root):
super(Grin,self).__init__(root)
self.init(root)
def parse(self, src):
self.env = World()
self.src, self.pos, self.ch = src, 0, ''
self.page, self.line, self.col = 0, 0, 0
# (inside `class Grin`...)
def match_Ref(self, node, cur, env):
# pass in fresh World, then discard changes
res = self.match(self.defs[node.name], cur, World())
if res.val is FAIL: return (FAIL, env)
else: return (res.val, res[1], env)
def match_Not(self, node, cur, env):
res = self.match(node.item, cur, env)
if res.val is FAIL: return (None, cur, res[1])
else: return (FAIL, res[1])
def match_Var(self, node, cur, env):
res = self.match(node.item, cur, env)
if res.val is FAIL: return res
else: return (res.val, cur, env.changed(node.name, res.val))
def match_Act(self, node, cur, env):
raise NotImplementedError('no semantic actions yet.')
def match_Box(self, node, cur, env):
raise NotImplementedError('no tree matching yet.')
# (still inside `class Grin`...)
def init(self, node):
return self.dispatch('init', node)
def init_Gram(self, node):
self.defs = {}
for child in node.body: self.init(child)
def init_Def(self, node):
self.defs[node.name] = node
if __name__=="__main__":
print(Grin(ebnf).parse(ebnf_src))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment