Created
September 20, 2014 13:20
-
-
Save veer66/44d1adec9545a647bb0a to your computer and use it in GitHub Desktop.
Simple Earley Parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import sys | |
import re | |
import copy | |
import pprint | |
pp = pprint.PrettyPrinter(indent=4) | |
class EarleyParser(object): | |
def __init__(self): | |
pass | |
def run_cmd(self): | |
if len(sys.argv) != 2: | |
print("Usage: python", sys.argv[0], "<grammar in json format>", | |
file=sys.stderr) | |
exit(1) | |
self.grammar = json.load(open(sys.argv[1])) | |
for line in sys.stdin: | |
chart = self.parse(re.split('[\s+]', re.split('[\r\n]', line)[0])) | |
pp.pprint(chart) | |
def add_state(self, rule, c, i, p=0): | |
state = copy.copy(rule) | |
state["i"] = i | |
state["p"] = p | |
state_repr = json.dumps(state) | |
if state_repr not in self.idx[c]: | |
self.chart[c].append(state) | |
self.idx[c][state_repr] = True | |
def parse(self, words): | |
self.words = words | |
self.idx = [{} for i in range(len(self.words) +1)] | |
self.chart = [[] for i in range(len(self.words) + 1)] | |
self.add_state(self.grammar["start"], c=0, i=0) | |
for i in range(len(self.words) + 1): | |
for state in self.chart[i]: | |
if self.is_incomplete(state): | |
if self.is_next_nonterminal(state): | |
self.predict(state, i) | |
else: | |
self.scan(state, i) | |
else: | |
self.complete(state, i) | |
return self.chart | |
def is_incomplete(self, state): | |
return state["p"] < len(state["r"]) | |
def is_next_nonterminal(self, state): | |
return re.match('[A-Z]', state["r"][state["p"]]) is not None | |
def predict(self, state, j): | |
b = state["r"][state["p"]] | |
for rule in filter(lambda rule: rule["l"] == b, self.grammar["rules"]): | |
self.add_state(rule, c=j, i=j) | |
def scan(self, state, j): | |
b = state["r"][state["p"]] | |
w = self.words[j] | |
if b in self.grammar["dix"][w]: | |
self.add_state({"l": b, "r": [w]}, c=j+1, i=j, p=1) | |
def complete(self, state, k): | |
b = state["l"] | |
j = state["i"] | |
for _state in self.chart[j]: | |
if self.is_incomplete(_state) and _state["r"][_state["p"]] == b: | |
self.add_state(_state, c=k, p=_state["p"]+1, i=_state["i"]) | |
parser = EarleyParser() | |
parser.run_cmd() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment