public
Last active

Ox vs Nokogiri: DOM and SAX parsing comparison

  • Download Gist
benchmark
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
# I'm no benchmark guru. Just did a bunch of:
$ time ruby <filename>
 
# Note: This is just an 80mb XML file with 38,000 nodes.
 
ox_dom.rb 4.56s user 0.78s system 93% cpu 5.714 total (550mb)
ox_dom.rb 4.58s user 0.79s system 87% cpu 6.126 total (550mb)
ox_dom.rb 4.60s user 0.80s system 87% cpu 6.140 total (550mb)
nokogiri_dom.rb 11.75s user 1.02s system 94% cpu 13.518 total (895mb)
nokogiri_dom.rb 11.36s user 1.02s system 93% cpu 13.211 total (895mb)
nokogiri_dom.rb 11.57s user 1.03s system 94% cpu 13.326 total (900mb)
ox_sax.rb 4.37s user 0.41s system 81% cpu 5.848 total (12.3mb)
ox_sax.rb 4.47s user 0.43s system 91% cpu 5.339 total (11.8mb)
ox_sax.rb 4.29s user 0.41s system 82% cpu 5.724 total (12.0mb)
nokogiri_sax.rb 23.01s user 0.52s system 98% cpu 23.782 total (11.9mb)
nokogiri_sax.rb 23.40s user 0.55s system 98% cpu 24.319 total (11.8mb)
nokogiri_sax.rb 23.18s user 0.54s system 98% cpu 24.022 total (11.7mb)
nokogiri_dom_parsing.rb
Ruby
1 2 3 4 5 6 7 8 9 10 11
require "nokogiri"
 
# Build the full DOM for user.xml, then print the userid, username and email
# of every <row> element.
#
# The block form of File.open closes the handle as soon as Nokogiri has read
# the file; a bare Kernel#open left it open for the rest of the run.
doc = File.open("user.xml") { |file| Nokogiri::XML(file) }

doc.search("//row").each do |post|
  # A plain local is enough here: the hash is rebuilt for every row.
  user = {
    userid: post.at("userid").text.to_i,
    username: post.at("username").text,
    email: post.at("email").text
  }
  puts user[:userid], user[:username], user[:email]
end
nokogiri_sax_parsing.rb
Ruby
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
require "nokogiri"
# SAX handler that prints the userid, username and email of every <row>.
#
# Only the fields of the row currently being read are held in @user; the
# hash is replaced each time a new <row> opens.
class UsersDocument < Nokogiri::XML::SAX::Document
  # Element names whose text content is collected for each row.
  USER_ATTR = %w[userid username email].freeze

  # Starts a fresh record when a <row> opens and remembers which element
  # the following character data belongs to.
  def start_element(name, attrs = [])
    @user = {} if name == "row"
    @current_node = name
  end

  # Appends character data to the field currently being read.
  #
  # The parser may deliver one contiguous run of text in several calls, so
  # the chunks are concatenated rather than assigned (assignment would keep
  # only the last chunk). Text seen outside a <row> is ignored.
  def characters(string)
    return unless @user && USER_ATTR.include?(@current_node)

    (@user[@current_node] ||= +"") << string
  end

  # Forgets the current element so whitespace between sibling tags is not
  # attributed to the element that just closed, and prints the record once
  # its <row> closes.
  def end_element(name)
    @current_node = nil
    return unless name == "row"

    puts @user["userid"], @user["username"], @user["email"]
  end
end
 
# Stream user.xml through the SAX handler; rows are printed as they are read.
Nokogiri::XML::SAX::Parser.new(UsersDocument.new).parse_file("user.xml")
ox_dom_parsing.rb
Ruby
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
# Check out Ox's tests to see how to iterate through it: https://github.com/ohler55/ox/blob/master/test/parse_cmp.rb
 
require "ox"
require "open-uri"
 
# Element names copied out of each <row>; frozen so the shared constant
# cannot be mutated by accident.
USER_ATTR = %w[userid username email].freeze
 
# Returns the first direct child of +node+ that is a String, or nil when
# none of its children is a String.
def first_text(node)
  node.nodes.find { |child| child.is_a?(String) }
end
 
# Converts a field element into the value stored for it:
# "userid" becomes an Integer, "username" and "email" stay as their first
# text child, and any other element name yields nil.
def get_value(node)
  tag = node.name
  return first_text(node).to_i if tag == "userid"

  first_text(node) if %w[username email].include?(tag)
end
 
# Read the whole file and build Ox's in-memory tree.
# File.read opens, reads and closes the file in one call; the previous
# open(...).read never closed the handle it created.
doc = Ox.parse(File.read("user.xml"))
root = doc.root
# Rows are the children of the root's first child (<users> in the sample).
rows = root.nodes.first.nodes

rows.each do |row|
  # A plain local is enough here: the hash is rebuilt for every row.
  user = {}
  row.nodes.each do |node|
    user[node.name] = get_value(node) if USER_ATTR.include?(node.name)
  end
  puts user["userid"], user["username"], user["email"]
end
ox_sax_parsing.rb
Ruby
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
require "ox"
require "open-uri"
 
# Ox SAX handler that prints the userid, username and email of each <row>.
class Handler < Ox::Sax
  # Element names (as symbols) whose values are collected for each row.
  USER_ATTR = %i[userid username email]
  # Conversion method sent to the value object for each collected element.
  ATTR_MAP = { userid: :as_i, username: :as_s, email: :as_s }

  # Starts a new record on <row> and remembers the element being read.
  def start_element(name)
    @user = {} if name == :row
    @current_node = name
  end

  # Stores the converted value when the current element is a collected one.
  def value(value)
    field = @current_node
    @user[field] = value.send(ATTR_MAP[field]) if USER_ATTR.include?(field)
  end

  # Prints the collected record when its <row> closes.
  def end_element(name)
    puts(*@user.values_at(:userid, :username, :email)) if name == :row
  end
end
 
# Stream user.xml through the handler; the block form closes the file
# when parsing finishes.
File.open("user.xml") { |io| Ox.sax_parse(Handler.new, io) }
user.xml
XML
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
<?xml version="1.0"?>
<!-- 80mb file, 38,000 row nodes -->
<data>
<users>
<row>
<userid>1</userid>
<username>danneu</username>
<email>danrodneu@gmail.com</email>
<dozens>etc.</dozens>
<more>etc.</more>
<nodes>etc.</nodes>
</row>
<row>
<userid>2</userid>
...
</row>
...
</users>
</data>

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.