Created
November 14, 2013 04:30
-
-
Save nurse/7461379 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'nokogiri' | |
# XMLLexer - SAX with scanner like API
#
# Runs a Nokogiri SAX parse inside a Fiber so that the push-style parser
# callbacks can be consumed pull-style, one event at a time.
#
# An event is an Array whose first element is the callback name as a Symbol
# (e.g. :start_element, :characters, :end_element) followed by the values the
# DocProxy handler forwarded for that callback.
class XMLLexer
  # path - file name handed to Nokogiri::HTML::SAX::Parser#parse_file.
  #
  # Parsing does not start until the first event is requested.
  def initialize(path)
    @f = Fiber.new do
      parser = Nokogiri::HTML::SAX::Parser.new(DocProxy.new)
      parser.parse_file(path)
      raise EOFError
    end
    @buf = []
    @eof = false
  end

  # Returns the next event and moves the pointer past it.
  #
  # Raises EOFError once the parser has delivered every event.
  def next
    return @buf.shift unless @buf.empty?

    pull
  end

  # Returns the next event without moving the pointer.
  #
  # Raises EOFError once the parser has delivered every event.
  def peek
    @buf << pull if @buf.empty?
    @buf.first
  end

  # Consumes the next event and checks it against the given expectation.
  #
  # Returns nil when the event matches, otherwise raises RuntimeError.
  # The event is consumed in both cases.
  def verify(*arg)
    validate(self.next, arg)
    nil
  end

  # Returns true and consumes the item if given condition is correct.
  # Otherwise returns false and keep the item in the buffer.
  def check(*arg)
    ary = peek
    begin
      validate(ary, arg)
    rescue RuntimeError
      return false
    end
    self.next
    true
  end

  # Gets text from the XML's text node.
  #
  # The next event must be :characters (RuntimeError otherwise). Following
  # :characters events are appended to it, and <br> start/end pairs between
  # them are consumed and skipped. Stops in front of the first other event.
  def gets
    ary = self.next
    raise "'characters' is expected, but '#{ary[0]}'" if ary[0] != :characters

    # Work on a copy so the String owned by the event is never modified.
    str = ary[1].dup
    loop do
      ary = peek
      case ary[0]
      when :characters
        self.next
        str << ary[1]
      when :start_element
        break unless ary[1] == 'br'

        self.next
        verify :end_element, 'br'
      else
        break
      end
    end
    str
  end

  private

  # Resumes the parser fiber and returns the event it yields.
  #
  # Remembers that EOFError was seen so later calls keep raising EOFError
  # instead of the FiberError a terminated fiber would produce.
  def pull
    raise EOFError if @eof

    begin
      @f.resume
    rescue EOFError
      @eof = true
      raise
    end
  end

  # Raises RuntimeError unless the event +ary+ satisfies the expectation +arg+.
  def validate(ary, arg) # :nodoc:
    type = arg[0]
    raise "'#{type}' is expected, but '#{ary.inspect}'" if type != ary[0]

    case type
    when :characters
      validate_characters(ary, arg[1])
    when :start_element
      validate_start_element(ary, arg)
    else
      # Any other event type: compare as many leading fields as were given.
      result = ary[0, arg.size]
      raise "'#{arg}' is expected, but '#{result}'" if arg != result
    end
  end

  # Checks the text of a :characters event against :blank, a Regexp, a String
  # or nil (no constraint).
  def validate_characters(ary, expected) # :nodoc:
    text = ary[1]
    case expected
    when :blank
      raise "blank is expected but '#{text.inspect}'" unless /\A\s*\z/ =~ text
    when Regexp
      raise "characters:'#{text.inspect}' is expected to match '#{expected}'" unless expected =~ text
    when String
      raise "'#{expected}' is expected but '#{text.inspect}'" if expected != text
    when nil
      # no constraint on the text
    else
      raise 'invalid expectation'
    end
  end

  # Checks the element name and, when arg[2] is a Hash, the listed attributes
  # of a :start_element event. Hash values that are neither Regexp nor String
  # are ignored.
  def validate_start_element(ary, arg) # :nodoc:
    return if arg.size == 1
    raise "start_element: '#{arg[1]}' is expected, but '#{ary[1]}'" if arg[1] != ary[1]
    return if arg.size == 2
    return unless arg[2].is_a?(Hash)

    arg[2].each_pair do |k, v|
      next unless v.is_a?(Regexp) || v.is_a?(String)

      pair = ary[2].assoc(k)
      val = pair && pair[1]
      # Always raise with an explicit message: a bare `raise` would re-raise
      # whatever exception the caller is currently rescuing.
      raise "start_element:#{arg[1]} attribute '#{k}' is expected, but missing" unless val

      case v
      when Regexp
        raise "start_element:#{arg[1]} '#{k}'='#{val}' is not match /#{v}/" unless v =~ val
      when String
        raise "start_element:#{arg[1]} '#{k}'='#{val}' != '#{v}'" if v != val
      end
    end
  end
end
# DocProxy - SAX document handler that turns every parser callback into an
# event handed to whoever resumed the current Fiber.
#
# Each event is an Array: the callback name as a Symbol, followed by the
# callback arguments that are forwarded.
class DocProxy < Nokogiri::XML::SAX::Document
  def cdata_block(str)
    emit(__method__, str)
  end

  def characters(str)
    emit(__method__, str)
  end

  def comment(str)
    emit(__method__, str)
  end

  def end_document
    emit(__method__)
  end

  def end_element(name)
    emit(__method__, name)
  end

  def end_element_namespace(name, prefix, uri)
    emit(__method__, name, prefix, uri)
  end

  def error(str)
    emit(__method__, str)
  end

  def start_document
    emit(__method__)
  end

  def start_element(name, attrs)
    emit(__method__, name, attrs)
  end

  # Only the name and attributes are forwarded; prefix, url and ns are dropped.
  def start_element_namespace(name, attrs, prefix, url, ns)
    emit(__method__, name, attrs)
  end

  def warning(str)
    emit(__method__, str)
  end

  def xmldecl(version, encoding, standalone)
    emit(__method__, version, encoding, standalone)
  end

  private

  # Suspends the current Fiber, handing +event+ (always an Array) to the
  # resumer.
  def emit(*event)
    Fiber.yield(event)
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment