Created
April 17, 2016 17:44
-
-
Save tangentstorm/d015c830b17a768aa13b63683db4a751 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
>>> ebnf_sc.scan('x = A | b') | |
[('x', 'IDENT', 0), ('=', '=', 2), ('A', 'IDENT', 4), ('|', '|', 6), ('b', 'IDENT', 8)] | |
""" | |
from collections import namedtuple | |
from warnings import warn | |
import string | |
def T(tag, doc, args): | |
"""Creates a new tuple type.""" | |
res = namedtuple(tag, args) | |
if doc: res.__doc__+=' : '+doc | |
return res | |
def TB(tag, doc, args=['body']): return T(tag, doc, args) | |
def TI(tag, doc, args=['item']): return T(tag, doc, args) | |
def TN(tag, doc, args=['name']): return T(tag, doc, args) | |
def T2(tag, doc, args=['name','body']): return T(tag, doc, args) | |
Gram = T('Gram', 'contains grammar rules (may inherit from `bases`).', | |
['name', 'bases', 'doc', 'body']) | |
Def = T('Def', 'define a named rule.', ['name','body']) | |
Ref = TN('Ref', 'refer to (invoke) a named rule') | |
Any = T('Any', 'match anything', []) | |
Not = TI('Not', 'fail if the pattern would match, but do not consume') | |
Skip = TI('Skip', 'match the pattern, but hide it from other rules') | |
Emp = T('Emp', 'empty pattern (always matches)', []) | |
Emp = Emp() # since it doesn't need arguments | |
Lit = TI('Lit', 'match literal character (using ==)') | |
Str = TI('Str', 'match a string of literals') | |
Seq = TB('Seq', 'match a sequence of patterns') | |
Grp = TB('Grp', 'same as Seq, but renders in parentheses') | |
Alt = TB('Alt', 'match any of the alternatives') | |
Rep = TI('Rep', 'match 1 or more repetitions.') | |
Opt = TI('Opt', 'match 0 or 1 repetitions.') | |
Orp = TI('Orp', 'match 0 or more repetitions.') | |
Var = T2('Var', 'save matched string in a variable.') | |
Val = TN('Val', 'match against the saved value.') | |
New = T2('New', 'build a new class/tuple instance') | |
Arg = TB('Arg', 'pass matched data as arg to containing "New"') | |
def node_type(node): | |
return node.__class__.__name__ | |
class Dispatcher: | |
"""Provides a simple generic dispatch mechanism based on method names""" | |
def _find_handler(self, prefix, node): | |
"""(prefix, namedtuple) -> callable""" | |
return getattr(self, '_'.join([prefix, node_type(node)]), self._unhandled) | |
def _unhandled(self, node, *a, **kw): | |
"""Warn about unrecognized node types. (Just for development.)""" | |
raise ValueError("no handler found for %s" % node_type(node)) | |
def dispatch(self, prefix, node, *a, **kw): | |
"""Find and invoke a handler for the given node.""" | |
h = self._find_handler(prefix, node) | |
return h(node, *a, **kw) | |
class Cursor: | |
def __init__(self, seq:[any]): | |
self.seq = seq # sequence (probably a string) | |
self.val = None # current value | |
self.pos = -1 # current position | |
self.fwd() | |
def fwd(self)->any: | |
"""Move forward in the sequence and return the next item.""" | |
end = len(self.seq) | |
self.pos = min(self.pos+1, end) | |
self.val = None if self.pos == end else self.seq[self.pos] | |
return self | |
def at_end(self)->bool: | |
"""Are we at the end of the sequence?""" | |
return self.val is None | |
Match = namedtuple("Match", ['txt', 'pos']) | |
Match.__doc__ = "Match Result" | |
class Fail: | |
"""Value to indicate failure.""" | |
def __repr__(self): | |
return "FAIL" | |
FAIL = Fail() | |
class M(namedtuple("M", ['val', 'cur', 'env'])): | |
"""Internal Match State""" | |
@property | |
def matched(self): | |
return self.val is not FAIL | |
class Matcher(Dispatcher): | |
""" | |
A simple matcher for regular languages. | |
>>> m = Matcher(Alt([Lit("x"), Lit("y")])) # like re.compile("x|y") | |
>>> m.match('xyz') # should match "x" at position 0 | |
Match(txt='x', pos=0) | |
>>> m.match("zyx") # should fail to match | |
FAIL | |
>>> Matcher(Emp).match("hello") | |
Match(txt='', pos=0) | |
>>> m = Matcher(Seq([Lit("a"), Alt([Lit("a"), Lit("b")])])) | |
>>> m.match("ab") | |
Match(txt='ab', pos=0) | |
>>> m.match("ac") | |
FAIL | |
>>> Matcher(Str("hello")).match("hello") | |
Match(txt='hello', pos=0) | |
>>> Matcher(Rep(Lit("a"))).match("aaabbbccc") | |
Match(txt='aaa', pos=0) | |
>>> m = Matcher(Opt(Lit("a"))) | |
>>> m.match("abc") | |
Match(txt='a', pos=0) | |
>>> m.match("xyz") | |
Match(txt='', pos=0) | |
>>> m = Matcher(Orp(Lit("a"))) | |
>>> m.match("aaabc") | |
Match(txt='aaa', pos=0) | |
>>> m.match("xyz") | |
Match(txt='', pos=0) | |
""" | |
def __init__(self, node): | |
self.root = node | |
def _match(self, node, cur, env): | |
"""returns a match state tuple (the `M` class)""" | |
return self.dispatch('match', node, cur, env) | |
def match(self, s:str): | |
cur = Cursor(s) | |
env = {} | |
return self._match(self.root, cur, env).val | |
# class Matcher: | |
def match_Lit(self, node, cur, env): | |
return (M(Match(cur.val, cur.pos), cur.fwd(), env) if cur.val == node.item | |
else M(FAIL, cur, env)) | |
# class Matcher: | |
def match_Alt(self, node, cur, env): | |
for item in node.body: | |
m = self._match(item, cur, env) | |
if m.matched: return m | |
return m # last failure | |
# class Matcher: | |
def match_Emp(self, node, cur, env): | |
return M(Match("", cur.pos), cur, env) | |
def _join(self, matches): | |
"""helper to join match results for Seq and Str""" | |
if matches is FAIL: return FAIL | |
else: return Match(''.join(v.txt for v in matches), matches[0].pos) | |
def match_Seq(self, node, cur, env): | |
vals = [] | |
for item in node.body: | |
res = self._match(item, cur, env) | |
if res.val is FAIL: return M(FAIL, res.cur, env) | |
else: | |
val, cur, env = res | |
vals.append(val) | |
return M(self._join(vals), res.cur, env) | |
def match_Str(self, node, cur, env): | |
return self._match(Seq([Lit(c) for c in node.item]), cur, env) | |
def match_Rep(self, node, cur, env): | |
vals = [] | |
while True: | |
res = self._match(node.item, cur, env) | |
if res.val is FAIL: break | |
else: | |
val, cur, env = res | |
vals.append(val) | |
return M(self._join(vals or FAIL), cur, env) | |
def match_Opt(self, node, cur, env): | |
return self._match(Alt([node.item, Emp]), cur, env) | |
def match_Orp(self, node, cur, env): | |
return self._match(Opt(Rep(node.item)), cur, env) | |
class Scanner: | |
def __init__(self, rules: [(str, namedtuple)]): | |
self.order = [rule[0] for rule in rules] # test rules in given order | |
self.rules = dict(rules) | |
# default whitespace handler: | |
if '_' not in rules: | |
self.order.insert(0, '_') | |
self.rules['_'] = Alt([Lit(chr(i)) for i in range(33)]) | |
def gen_tokens(self, txt): | |
cur = Cursor(txt) | |
env = {} | |
matcher = Matcher(Emp) | |
while not cur.at_end(): | |
for rule in self.order: | |
m = matcher._match(self.rules[rule], cur, env) | |
if m.matched: | |
match, cur, env = m | |
if rule != '_': | |
yield (match.txt, rule, match.pos) | |
break | |
else: continue | |
else: | |
raise ValueError("unrecognized character at position %i : '%s'" | |
% (cur.pos, cur.val)) | |
def scan(self, txt): | |
return list(self.gen_tokens(txt)) | |
ebnf_src = '''\ | |
main = { rule } . | |
rule = IDENT "=" expr "." . | |
expr = term { "|" term } . | |
term = factor { factor } . | |
factor = IDENT | STRING | "{" expr "}" | "[" expr "]" | "(" expr ")" . | |
''' | |
ECHR, SQ, DQ = ['\\', "'", '"'] | |
LETTER = Alt([Lit(ch) for ch in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ']) | |
STRCHR = Alt([Seq([Lit(ECHR), Alt([ Lit(ECHR), Lit(DQ) ])]), | |
Alt([Lit(ch) for ch in map(chr, range(32,127)) if ch not in '"\\'])]) | |
ebnf_sc = Scanner([(ch, Lit(ch)) for ch in "{([=|.])}"] | |
+ [('IDENT', Rep(LETTER)), | |
('STRING', Alt([Seq([Lit(DQ), Rep(STRCHR), Lit(DQ)])]))]) | |
base = Gram('ebnf', [], "rules common to all grammars", [ | |
Def('main', Orp('token')), | |
Def('token',Seq([Skip(Orp(Ref('space'))), | |
Alt([Ref('STRING'), Ref('NUMBER'), | |
Ref('IDENT'), Ref('DELIM'), | |
Rep(Not(Ref('space')))])])), | |
Def('space', Orp('White')), | |
# character classes: | |
Def('White', Alt([chr(c) for c in range(33)])), | |
Def('Upper', Alt(list(string.ascii_uppercase))), | |
Def('Lower', Alt(list(string.ascii_lowercase))), | |
Def('Alpha', Alt([Ref('Lower'), Ref('Upper')])), | |
Def('Under', Lit('_')), | |
Def('Neg', Lit('-')), | |
Def('Digit', Alt([Lit(c) for c in string.digits])), | |
Def('Hexit', Alt([Ref('Digit')]+[Lit(c) for c in 'abcdefABCDEF'])), | |
Def('Alnum', Alt([Ref('Under'), Ref('Alpha'), Ref('Digit')])), | |
# simple patterns: | |
Def('IDENT', Seq([Alt([Ref('Under'),Ref('Alpha')]), Orp(Ref('Alnum'))])), | |
Def('NUMBER',Seq([Opt(Ref('Neg')), Rep(Ref('Digit')), | |
Orp([Ref('Under'), | |
Ref('Digit'),Ref('Digit'),Ref('Digit')])])), | |
Def('STRING', Seq([Lit(DQ), Rep(Ref('STRCHR')), Lit(DQ)])), | |
Def('STRCHR', Alt([Seq([Lit(ECHR), Alt([ Lit(ECHR), Lit(DQ) ])]), | |
Not(DQ) ])), | |
Def('DELIM', Alt(list('(){}[]'))), | |
]) | |
ebnf = Gram('ebnf', [base], "ebnf meta-grammar (for parsing grammars)", [ | |
Def('main', Orp(Ref('rule'))), | |
Def('rule', Seq([Var('name', Ref('IDENT')), | |
Lit('='), Ref('expr'), Lit('.') ])), | |
Def('expr', Seq([ Ref('term'), Orp([Lit('|'), Ref('term') ]) ])), | |
Def('term', Seq([ Ref('factor'), Rep(Ref('factor')) ])), | |
Def('factor', Alt([Ref('IDENT'), Ref('STRING'), | |
Ref('rep'), Ref('opt'), Ref('grp')])), | |
Def('rep', Seq([Lit('{'), New(Rep, Ref('expr')), Lit('}')])), # 'x*' | |
Def('opt', Seq([Lit('['), New(Opt, Ref('expr')), Lit(']')])), # 'x?' | |
Def('grp', Seq([Lit('('), New(Grp, Ref('expr')), Lit(')')])), # '(x)' | |
]) | |
HOME = {} # arbitrary dictionary object | |
class World(dict): | |
def __init__(self, proto=HOME): | |
super(World, self).__init__() | |
self.proto = proto | |
def __getattr__(self, name): | |
# called when attribute has no local definition. | |
return getattr(self.proto, name) | |
def __getitem__(self, key): | |
if key in self.keys(): return super(World, self)[key] | |
else: return self.proto[key] | |
def changed(self, key, val): | |
"""Forks a new world, with one key changed.""" | |
res = World(self) | |
res[key] = val | |
return res | |
class Grin(Dispatcher): | |
"""Grammar Interpreter""" | |
def __init__(self, root): | |
super(Grin,self).__init__(root) | |
self.init(root) | |
def parse(self, src): | |
self.env = World() | |
self.src, self.pos, self.ch = src, 0, '' | |
self.page, self.line, self.col = 0, 0, 0 | |
# (inside `class Grin`...) | |
def match_Ref(self, node, cur, env): | |
# pass in fresh World, then discard changes | |
res = self.match(self.defs[node.name], cur, World()) | |
if res.val is FAIL: return (FAIL, env) | |
else: return (res.val, res[1], env) | |
def match_Not(self, node, cur, env): | |
res = self.match(node.item, cur, env) | |
if res.val is FAIL: return (None, cur, res[1]) | |
else: return (FAIL, res[1]) | |
def match_Var(self, node, cur, env): | |
res = self.match(node.item, cur, env) | |
if res.val is FAIL: return res | |
else: return (res.val, cur, env.changed(node.name, res.val)) | |
def match_Act(self, node, cur, env): | |
raise NotImplementedError('no semantic actions yet.') | |
def match_Box(self, node, cur, env): | |
raise NotImplementedError('no tree matching yet.') | |
# (still inside `class Grin`...) | |
def init(self, node): | |
return self.dispatch('init', node) | |
def init_Gram(self, node): | |
self.defs = {} | |
for child in node.body: self.init(child) | |
def init_Def(self, node): | |
self.defs[node.name] = node | |
if __name__=="__main__": | |
print(Grin(ebnf).parse(ebnf_src)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment