#! /usr/bin/env ruby
require 'rubygems'
gem 'nokogiri', '>=1.0.6'
gem 'hpricot', '>=0.6.170'
require 'open-uri'
require 'benchmark'
require 'nokogiri'
require 'hpricot'
# Benchmark driver: for each [iterations, source] pair, load the HTML once and
# time three ways of processing it (plain regex substitution, Nokogiri,
# Hpricot), each repeated `ntimes` times. After the Benchmark table, a summary
# with the average wall-clock time per iteration for the two parser libraries
# is printed.
#
# `source` is either a local file path or an http(s) URL.
[
  [1000, "#{File.dirname(__FILE__)}/sample_post.html"],
  [10, "http://slashdot.com/"],
].each do |ntimes, uri|
  # Read the document explicitly by kind of source:
  #   * URLs go through open-uri's URI#read. Passing a URL to Kernel#open was
  #     deprecated in Ruby 2.7 and removed in 3.0, so `open(uri)` would fail
  #     there with "No such file or directory".
  #   * Local paths use File.read, which closes the file handle itself
  #     (`open(path).read` left it open until garbage collection).
  html = uri =~ %r{\Ahttps?://}i ? URI.parse(uri).read : File.read(uri)
  summary = []

  puts "For an html snippet #{html.size} bytes long ..."

  Benchmark.bm(20) do |x|
    # Regex baseline. gsub is non-destructive and its results are discarded;
    # only the time spent matching is of interest here.
    x.report("regex * #{ntimes}") do
      ntimes.times do
        html.gsub(/(.*)<\/a>/i, '\2') # broken regex
        html.gsub(/<(script|noscript|object|embed|style|frameset|frame|iframe)[>\s\S]*<\/\1>/, '')
        html
      end
    end

    # Nokogiri: parse, operate on the link text nodes, drop script-like
    # elements, then serialize. Wall-clock time is sampled around the report
    # so the per-iteration average can be shown in the summary.
    stime = Time.now
    x.report("nokogiri * #{ntimes}") do
      ntimes.times do
        doc = Nokogiri::HTML(html)
        doc.search("a/text()").wrap("")
        doc.search("script","noscript","object","embed","style","frameset","frame","iframe").unlink
        doc.inner_html
      end
    end
    etime = Time.now
    summary << ("it took an average of %.4f seconds for Nokogiri to parse and operate on an HTML snippet #{html.size} bytes long" % ((etime - stime) / ntimes))

    # Hpricot: the same sequence of steps using Hpricot's API.
    stime = Time.now
    x.report("hpricot * #{ntimes}") do
      ntimes.times do
        doc = Hpricot(html)
        doc.search("a/text()").wrap("")
        doc.search(["script","noscript","object","embed","style","frameset","frame","iframe"]).remove
        doc.inner_html
      end
    end
    etime = Time.now
    summary << ("it took an average of %.4f seconds for Hpricot to parse and operate on an HTML snippet #{html.size} bytes long" % ((etime - stime) / ntimes))
  end

  puts
  puts summary
  puts
end