Last active
April 30, 2017 09:34
-
-
Save mgd020/3fc49d169196017895aa9f310760457d to your computer and use it in GitHub Desktop.
Python implementation of Earley recogniser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import absolute_import, division, print_function, unicode_literals | |
from collections import namedtuple | |
from itertools import chain | |
class Rule(namedtuple('Rule', ['lhs', 'rhs'])):
    """A grammar production lhs -> rhs, with rhs normalized to a tuple."""

    def __new__(cls, lhs, rhs):
        # Coerce rhs to a tuple so every Rule is hashable and set-friendly.
        rhs_tuple = tuple(rhs)
        return super(Rule, cls).__new__(cls, lhs, rhs_tuple)
class Item(namedtuple('Item', ['rule', 'index', 'origin'])):
    """An Earley item: a rule, a dot position into its rhs, and the
    index of the state set where recognition of the rule began."""

    def __str__(self):
        # Render the rhs with a '.' marking the current dot position.
        dotted = self.rule.rhs[:self.index] + ('.',) + self.rule.rhs[self.index:]
        return '{} -> {} ({})'.format(self.rule.lhs, ' '.join(dotted), self.origin)

    def symbol(self):
        """Return the symbol just after the dot; IndexError when complete."""
        return self.rule.rhs[self.index]

    def next(self):
        """Return a new Item with the dot advanced by one symbol."""
        return self._replace(index=self.index + 1)
def set_add(s, e):
    """Add element to set, returning True if it was not already in the set."""
    before = len(s)
    s.add(e)
    # The set only grew if e was genuinely new.
    return len(s) > before
def earley(grammar, tokens, verbose=False):
    """
    Earley recogniser.

    grammar: a list of Rules; grammar[0] is treated as the start rule
    tokens: an iterable of input tokens
    verbose: if True, print each state set's items as they are processed

    Returns True when the whole token sequence is derivable from the
    start rule, False otherwise.
    """
    start_item = Item(rule=grammar[0], index=0, origin=0)
    s = [{start_item}]  # s[k] is the Earley state set for input position k

    # Index rules by left-hand side for the prediction step.
    lhs_rules = {}  # {Rule.lhs: [Rule]}
    for rule in grammar:
        lhs_rules.setdefault(rule.lhs, []).append(rule)

    # Process each token, then a trailing None sentinel so the final state
    # set still receives its completion/prediction pass.
    for k, token in enumerate(chain(tokens, (None,))):
        sk = s[k]
        items = list(sk)  # work list; appended to as items are discovered
        sk1 = set()
        s.append(sk1)
        for item in items:
            try:
                symbol = item.symbol()
            except IndexError:  # completion: the item's rule is fully matched
                # For same-position (zero-length) completions the origin set
                # is the one currently being built, so scan the live work
                # list instead of the frozen s[item.origin].
                oitems = items if item.origin == k else s[item.origin]
                for oitem in oitems:
                    try:
                        if oitem.symbol() == item.rule.lhs:
                            nitem = oitem.next()
                            if set_add(sk, nitem):
                                items.append(nitem)
                    except IndexError:
                        pass  # origin rule is complete, so it can't possibly match
            else:
                try:
                    rules = lhs_rules[symbol]
                except KeyError:  # scanning: terminal symbol, match the token
                    if symbol == token:
                        sk1.add(item.next())
                else:  # prediction: expand the non-terminal's rules
                    for rule in rules:
                        nitem = Item(rule, 0, k)
                        if set_add(sk, nitem):
                            items.append(nitem)
        if verbose:
            for item in items:
                print(item)
            if token is not None:
                print('s[%d]:' % k, token)
        # Compare against the None sentinel explicitly: falsy tokens such as
        # 0 or '' are real input and must still trigger the early failure.
        if token is not None and not sk1:
            return False
    # Accept iff the start rule was fully recognised from position 0.
    end_item = start_item._replace(index=len(start_item.rule.rhs))
    return end_item in s[k]
# Demo grammar: S -> A; A -> (empty) | B; B -> A.
# The nullable, mutually recursive rules mean it derives only the empty string.
g = [Rule(lhs, rhs) for lhs, rhs in (('S', 'A'), ('A', ''), ('A', 'B'), ('B', 'A'))]

# The recogniser must reject any non-empty input and accept the empty one.
assert earley(g, 'a', True) is False
assert earley(g, '', True) is True
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment