-
-
Save judofyr/80bd41893985f49b7550294b295d3afe to your computer and use it in GitHub Desktop.
Parser for CFG based on Glushkov's construction algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/build |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pylab import * | |
data = loadtxt('build/complexity.txt') | |
n = len(data) | |
# Fit a polynom on the first half | |
fst = data[:n//2] | |
p2 = polyfit(fst[:, 0], fst[:, 1], 2) | |
p3 = polyfit(fst[:, 0], fst[:, 1], 3) | |
plot(data[:, 0], data[:, 1], label="Data") | |
plot(data[:, 0], polyval(p2, data[:, 0]), label="n^2, fitted") | |
plot(data[:, 0], polyval(p3, data[:, 0]), label="n^3, fitted") | |
legend(loc="best") | |
show() | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require_relative 'glush' | |
require 'benchmark' | |
# This has an exponential amount of parse trees, | |
# but should be recognized in O(n^k) where k<=3. | |
S = Grammar.build { | |
lazy \ | |
def s | |
c("n") | (s >> c("+") >> s) | |
end | |
s | |
} | |
(50..130).each do |i| | |
input = "n" + ("+n")*i | |
print i, " " | |
time = Benchmark.measure { | |
m = Matcher.new(S) | |
m.parse(input) or raise "failed to parse" | |
} | |
puts time.real | |
$stdout.flush | |
end | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'set' | |
require 'pp' | |
class Parser | |
attr_accessor :is_empty | |
def empty? | |
if defined?(@is_empty) | |
@is_empty | |
else | |
raise "empty is not computed, #{self.class}" | |
end | |
end | |
def >>(other) | |
Seq.new(self, other) | |
end | |
def |(other) | |
Alt.new(self, other) | |
end | |
def many | |
many1.maybe | |
end | |
def many1 | |
Many1.new(self) | |
end | |
def maybe | |
Maybe.new(self) | |
end | |
end | |
class Char < Parser | |
def initialize(char) | |
@char = char | |
end | |
def matches?(char) | |
@char == char | |
end | |
def calculate_empty(b) | |
@is_empty = false | |
end | |
def first_set | |
Set[self] | |
end | |
def last_set | |
Set[self] | |
end | |
def each_pair | |
end | |
end | |
class Eps < Parser | |
def empty? | |
true | |
end | |
def calculate_empty(b) | |
@is_empty = true | |
end | |
def first_set | |
Set[] | |
end | |
def last_set | |
Set[] | |
end | |
def each_pair | |
end | |
end | |
class Alt < Parser | |
def initialize(left, right) | |
@left = left | |
@right = right | |
end | |
def calculate_empty(b) | |
@is_empty = @left.calculate_empty(b) | @right.calculate_empty(b) | |
end | |
def first_set | |
@first_set ||= @left.first_set | @right.first_set | |
end | |
def last_set | |
@last_set ||= @left.last_set | @right.last_set | |
end | |
def each_pair(&blk) | |
@left.each_pair(&blk) | |
@right.each_pair(&blk) | |
end | |
end | |
class Seq < Parser | |
def initialize(left, right) | |
@left = left | |
@right = right | |
end | |
def calculate_empty(b) | |
@is_empty = @left.calculate_empty(b) & @right.calculate_empty(b) | |
end | |
def first_set | |
@first_set ||= @left.empty? ? (@left.first_set | @right.first_set) : @left.first_set | |
end | |
def last_set | |
@last_set ||= @right.empty? ? (@left.last_set | @right.last_set) : @right.last_set | |
end | |
def each_pair(&blk) | |
@left.each_pair(&blk) | |
@right.each_pair(&blk) | |
@left.last_set.each do |a| | |
@right.first_set.each do |b| | |
yield a, b | |
end | |
end | |
end | |
end | |
class Maybe < Parser | |
def initialize(child) | |
@child = child | |
end | |
def calculate_empty(b) | |
@child.calculate_empty(b) | |
@is_empty = true | |
end | |
def first_set | |
@child.first_set | |
end | |
def last_set | |
@child.last_set | |
end | |
def each_pair(&blk) | |
@child.each_pair(&blk) | |
end | |
end | |
class Many1 < Parser | |
def initialize(child) | |
@child = child | |
end | |
def calculate_empty(b) | |
@is_empty = @child.calculate_empty(b) | |
end | |
def first_set | |
@child.first_set | |
end | |
def last_set | |
@child.last_set | |
end | |
def each_pair(&blk) | |
@child.each_pair(&blk) | |
@child.last_set.each do |a| | |
@child.first_set.each do |b| | |
yield a, b | |
end | |
end | |
end | |
end | |
class Rule | |
def initialize(&blk) | |
@code = blk | |
end | |
def call | |
RuleCall.new(self) | |
end | |
def force | |
@force ||= @code.call | |
end | |
def each_pair(&blk) | |
force.each_pair(&blk) | |
force.last_set.each do |a| | |
# This is maybe a bit hacky since a Rule strictly speaking isn't | |
# a parser. This is here to represent the fact that the last_set | |
# will complete this rule. | |
yield a, self | |
end | |
end | |
end | |
class RuleCall < Parser | |
attr_reader :rule | |
def initialize(rule) | |
@rule = rule | |
end | |
def calculate_empty(b) | |
@is_empty = b.include?(@rule) | |
end | |
def first_set | |
Set[self] | |
end | |
def last_set | |
Set[self] | |
end | |
def each_pair(&blk) | |
end | |
end | |
class Grammar | |
def self.build(&blk) | |
new(&blk) | |
end | |
attr_reader :transitions, :start_parser | |
def initialize(&blk) | |
@rules = [] | |
@start_parser = instance_eval(&blk) | |
if !@start_parser.is_a?(Parser) | |
raise "block must return a parser" | |
end | |
_compute_empty | |
_compute_transitions | |
end | |
def c(c) | |
Char.new(c) | |
end | |
def eps | |
Eps.new | |
end | |
def call(n) | |
case n | |
when Symbol | |
send(n) | |
else | |
n.call | |
end | |
end | |
def sep_by(p, sep) | |
sep_by1(p, sep).maybe | |
end | |
def sep_by1(p, sep) | |
call(p) >> (call(sep) >> call(p)).many | |
end | |
def end_by(p, sep) | |
end_by1(p, sep).maybe | |
end | |
def end_by1(p, sep) | |
(call(p) >> call(sep)).many1 | |
end | |
def lazy(name) | |
old = method(name) | |
rule = Rule.new { old.call } | |
@rules << rule | |
define_singleton_method(name) { rule.call } | |
nil | |
end | |
private | |
def _compute_empty | |
empty_rules = Set.new | |
begin | |
before = empty_rules.size | |
@rules.each do |rule| | |
is_empty = rule.force.calculate_empty(empty_rules) | |
empty_rules << rule if is_empty | |
end | |
after = empty_rules.size | |
end while before != after | |
@start_parser.calculate_empty(empty_rules) | |
end | |
def _compute_transitions | |
@transitions = Hash.new { |h, k| h[k] = Set.new } | |
@rules.each do |rule| | |
rule.each_pair do |a, b| | |
@transitions[a] << b | |
end | |
end | |
@start_parser.each_pair do |a, b| | |
@transitions[a] << b | |
end | |
@start_parser.last_set.each do |lst| | |
@transitions[lst] << :eof | |
end | |
end | |
end | |
# The left_pos refers to the position where the current rule started. | |
State = Struct.new(:parser, :left_pos) | |
Call = Struct.new(:rule, :left_pos) | |
class CallState | |
attr_reader :returns | |
def initialize | |
@returns = [] | |
@finished = Set.new | |
end | |
def add_return(state) | |
@returns << state | |
end | |
def finish(pos) | |
@returns.freeze | |
if @finished.include?(pos) | |
false | |
else | |
@finished << pos | |
true | |
end | |
end | |
end | |
class Matcher | |
def initialize(grammar) | |
@parser = grammar.start_parser | |
@transitions = grammar.transitions | |
@call_states = {} | |
end | |
def parse(input) | |
if input.empty? | |
return @parser.empty? | |
end | |
active_states = nil | |
input.each_char.each_with_index do |char, idx| | |
if active_states.nil? | |
active_states = parse_first(char, idx) | |
else | |
active_states = parse_next(active_states, char, idx) | |
end | |
# quick return when there's no more active states | |
return false if active_states.empty? | |
end | |
# parse the end-of-string marker | |
active_states = parse_next(active_states, nil, input.size) | |
active_states.any? { |s| s.parser == :eof } | |
end | |
def parse_first(char, pos) | |
new_states = [] | |
@parser.first_set.each do |target| | |
transition(target, char, pos, nil) do |new_state| | |
new_states << new_state | |
end | |
end | |
new_states | |
end | |
def parse_next(active_states, char, pos) | |
new_states = [] | |
transition_from_states(active_states, char, pos) do |new_state| | |
new_states << new_state | |
end | |
new_states | |
end | |
def transition_from_states(states, char, pos, &blk) | |
states.each do |state| | |
@transitions[state.parser].each do |target| | |
transition(target, char, pos, state.left_pos, &blk) | |
end | |
end | |
end | |
# Attempt to move into the parser-node, yielding states | |
def transition(parser, char, pos, left_pos, &blk) | |
case parser | |
when :eof | |
if char.nil? | |
yield State.new(:eof, left_pos) | |
end | |
when Char | |
if parser.matches?(char) | |
yield State.new(parser, left_pos) | |
end | |
when RuleCall | |
call = Call.new(parser.rule, pos) | |
call_state = @call_states[call] | |
if !call_state | |
# This is the first time this rule has been called at this posiotn | |
# Setup the returns array | |
@call_states[call] = call_state = CallState.new | |
# ... and kick off the first actions | |
parser.rule.force.first_set.each do |fst| | |
transition(fst, char, pos, pos, &blk) | |
end | |
end | |
# and push the state which we want to complete after the rule succeeds | |
call_state.add_return(State.new(parser, left_pos)) | |
when Rule | |
# This implies that a rule is complete | |
# Find the call which it corresponds to | |
call = Call.new(parser, left_pos) | |
call_state = @call_states.fetch(call) | |
if call_state.finish(pos) | |
# Now we can transition out | |
transition_from_states(call_state.returns, char, pos, &blk) | |
end | |
else | |
raise "ehm: #{parser.class}" | |
end | |
end | |
end | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
task :default => :test | |
desc "Run tests" | |
task :test do | |
sh 'ruby', 'test.rb' | |
end | |
file "build" do | |
mkdir "build" | |
end | |
file "build/complexity.txt" => "build" do | |
sh 'ruby complexity.rb | tee build/complexity.txt' | |
end | |
task :complexity => "build/complexity.txt" do | |
sh 'python3', 'complexity.py' | |
end | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require_relative 'glush' | |
require 'minitest/autorun' | |
class TestParser < Minitest::Test | |
def recognize?(parser, str) | |
m = Matcher.new(parser) | |
m.parse(str) | |
end | |
## Matched parens | |
PAREN = Grammar.build { | |
lazy \ | |
def paren | |
eps | (c("(") >> paren >> c(")")) | |
end | |
paren | |
} | |
def test_paren_empty | |
assert recognize?(PAREN, "") | |
end | |
def test_paren_one | |
assert recognize?(PAREN, "()") | |
end | |
def test_paren_two | |
assert recognize?(PAREN, "(())") | |
end | |
def test_paren_close_missing | |
refute recognize?(PAREN, "(()") | |
end | |
def test_paren_close_extra | |
refute recognize?(PAREN, "())") | |
end | |
## n+n | |
S = Grammar.build { | |
lazy \ | |
def s | |
c("n") | (s >> c("+") >> s) | |
end | |
s | |
} | |
def test_s_one | |
assert recognize?(S, "n") | |
end | |
def test_s_two | |
assert recognize?(S, "n+n") | |
refute recognize?(S, "n+") | |
end | |
def test_s_three | |
assert recognize?(S, "n+n+n") | |
refute recognize?(S, "n+n+") | |
end | |
## Combinators | |
A = Grammar.build { c("a").many } | |
AP = Grammar.build { c("a").many1 } | |
def test_rep | |
assert recognize?(A, "") | |
assert recognize?(A, "a") | |
assert recognize?(A, "aaaaa") | |
refute recognize?(AP, "") | |
assert recognize?(AP, "a") | |
assert recognize?(AP, "aaaaa") | |
end | |
ONE_LST = Grammar.build { | |
def one | |
c("1") | |
end | |
def sep | |
c(",") >> c(" ").many | |
end | |
sep_by(:one, :sep) | |
} | |
def test_sep_by | |
assert recognize?(ONE_LST, "") | |
assert recognize?(ONE_LST, "1") | |
assert recognize?(ONE_LST, "1,1") | |
assert recognize?(ONE_LST, "1, 1") | |
refute recognize?(ONE_LST, "1, ") | |
end | |
ONE_LST_END = Grammar.build { | |
def one | |
c("1") | |
end | |
def sep | |
c(",") >> c(" ").many | |
end | |
end_by(:one, :sep) | |
} | |
def test_end_by | |
assert recognize?(ONE_LST_END, "") | |
refute recognize?(ONE_LST_END, "1") | |
assert recognize?(ONE_LST_END, "1,") | |
assert recognize?(ONE_LST_END, "1, 1,") | |
assert recognize?(ONE_LST_END, "1, ") | |
end | |
end | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment