Skip to content

Instantly share code, notes, and snippets.

@nurse
Created November 14, 2013 04:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nurse/7461379 to your computer and use it in GitHub Desktop.
Save nurse/7461379 to your computer and use it in GitHub Desktop.
require 'nokogiri'
# XMLLexer - SAX with a scanner-like (pull) API.
#
# The SAX parser runs inside a Fiber. DocProxy suspends that fiber on every
# SAX callback and hands one event to the consumer. An event is an Array whose
# first element is the callback name as a Symbol, followed by the callback's
# arguments, e.g. [:characters, "text"] or [:start_element, name, attrs].
# Attribute matching below treats attrs as a list of [name, value] pairs.
class XMLLexer
  # Raised when an event does not satisfy an expectation. It is a RuntimeError
  # so existing `rescue RuntimeError` / bare `rescue` handlers keep working.
  class MismatchError < RuntimeError; end

  # path:: path of the document handed to the SAX parser.
  #
  # Nothing is parsed until the first #next or #peek resumes the fiber.
  def initialize(path)
    @f = Fiber.new do
      parser = Nokogiri::HTML::SAX::Parser.new(DocProxy.new)
      parser.parse_file(path)
      # parse_file has returned, so there are no more events; the exception
      # surfaces in whichever #next / #peek call resumed the fiber.
      raise EOFError
    end
    @buf = []
  end

  # Returns the next event and consumes it.
  # Raises EOFError when the parser has no more events.
  def next
    # shift (not pop): #peek stores the look-ahead at the head of the buffer.
    return @buf.shift unless @buf.empty?

    @f.resume
  end

  # Returns the next event without consuming it.
  # Raises EOFError when the parser has no more events.
  def peek
    return @buf[0] unless @buf.empty?

    @buf[0] = @f.resume
  end

  # Consumes the next event and checks it against the expectation +arg+
  # (event type, optionally followed by values to match).
  # Returns nil on success; raises MismatchError (a RuntimeError) otherwise.
  def verify(*arg)
    validate(self.next, arg)
    nil
  end

  # Returns true and consumes the next event if it satisfies the expectation.
  # Otherwise returns false and keeps the event in the buffer.
  #
  # Only mismatches are turned into false; an unusable expectation
  # ('invalid expectation') is a caller bug and is raised.
  def check(*arg)
    validate(peek, arg)
    self.next
    true
  rescue MismatchError
    false
  end

  # Reads text: consumes a :characters event plus any directly following
  # :characters events, skipping <br> elements in between, and returns the
  # concatenated text. Raises MismatchError if the next event is not
  # :characters.
  def gets
    ary = self.next
    raise MismatchError, "'characters' is expected, but '#{ary[0]}'" if ary[0] != :characters

    # Work on a copy so the event's own string is never mutated (and frozen
    # strings are accepted).
    str = ary[1].dup
    loop do
      ary = peek
      if ary[0] == :characters
        self.next
        str << ary[1]
      elsif ary[0] == :start_element && ary[1] == 'br'
        self.next
        verify :end_element, 'br'
      else
        break
      end
    end
    str
  end

  private

  # Raises MismatchError unless the event +ary+ satisfies the expectation
  # +arg+. Raises a plain RuntimeError for an expectation it cannot interpret.
  def validate(ary, arg) # :nodoc:
    type = arg[0]
    mismatch "'#{type}' is expected, but '#{ary.inspect}'" if type != ary[0]
    case type
    when :characters
      validate_characters(ary, arg)
    when :start_element
      validate_start_element(ary, arg)
    else
      # Any other event: its leading elements must equal the expectation.
      result = ary[0, arg.size]
      mismatch "'#{arg}' is expected, but '#{result}'" if arg != result
    end
  end

  # Checks the text of a :characters event against :blank, a Regexp, a String,
  # or nil (no constraint).
  def validate_characters(ary, arg) # :nodoc:
    text = ary[1]
    case arg[1]
    when :blank
      mismatch "blank is expected but '#{text.inspect}'" unless /\A\s*\z/ =~ text
    when Regexp
      unless arg[1] =~ text
        mismatch "characters:'#{text.inspect}' is expected to match '#{arg[1]}'"
      end
    when String
      mismatch "'#{arg[1]}' is expected but '#{text.inspect}'" if arg[1] != text
    when nil
      # no constraint on the text
    else
      raise 'invalid expectation'
    end
  end

  # Checks element name and, when a Hash is given as third expectation,
  # each listed attribute against a Regexp or String.
  def validate_start_element(ary, arg) # :nodoc:
    return if arg.size == 1

    mismatch "start_element: '#{arg[1]}' is expected, but '#{ary[1]}'" if arg[1] != ary[1]
    return unless arg[2].is_a?(Hash)

    attrs = Array(ary[2]) # tolerate an event without an attribute list
    arg[2].each_pair do |k, v|
      case v
      when Regexp
        val = attribute_value(attrs, k, arg[1])
        mismatch "start_element:#{arg[1]} '#{k}'='#{val}' is not match /#{v}/" unless v =~ val
      when String
        val = attribute_value(attrs, k, arg[1])
        mismatch "start_element:#{arg[1]} '#{k}'='#{val}' != '#{v}'" if v != val
      end
    end
  end

  # Returns the value of attribute +key+ from the [name, value] pairs in
  # +attrs+; a missing attribute or a nil value is a mismatch.
  def attribute_value(attrs, key, element) # :nodoc:
    pair = attrs.assoc(key)
    val = pair && pair[1]
    mismatch "start_element:#{element} attribute '#{key}' is missing or has no value" unless val
    val
  end

  # Single place where expectation failures are raised.
  def mismatch(message) # :nodoc:
    raise MismatchError, message
  end
end
# SAX document handler that turns each callback into an event and hands it to
# whoever resumed the current Fiber. An event is an Array: the callback name
# as a Symbol, followed by the arguments that callback forwards.
class DocProxy < Nokogiri::XML::SAX::Document
  def cdata_block(str)
    emit(:cdata_block, str)
  end

  def characters(str)
    emit(:characters, str)
  end

  def comment(str)
    emit(:comment, str)
  end

  def end_document
    emit(:end_document)
  end

  def end_element(name)
    emit(:end_element, name)
  end

  def end_element_namespace(name, prefix, uri)
    emit(:end_element_namespace, name, prefix, uri)
  end

  def error(str)
    emit(:error, str)
  end

  def start_document
    emit(:start_document)
  end

  def start_element(name, attrs)
    emit(:start_element, name, attrs)
  end

  # Only the name and attributes are forwarded; the namespace details are
  # not part of the event.
  def start_element_namespace(name, attrs, _prefix, _uri, _ns)
    emit(:start_element_namespace, name, attrs)
  end

  def warning(str)
    emit(:warning, str)
  end

  def xmldecl(version, encoding, standalone)
    emit(:xmldecl, version, encoding, standalone)
  end

  private

  # Suspends the current fiber, passing +event+ (an Array) to the resumer.
  def emit(*event)
    Fiber.yield(event)
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment