#! /usr/bin/env ruby
#
# Benchmarks three ways of rewriting an HTML document: plain regular
# expressions, Nokogiri and Hpricot. Each approach is run against a local
# sample file (sample_post.html, next to this script) and against a page
# fetched over HTTP, and the average time per iteration for the two parsers
# is printed after each benchmark table.

require 'rubygems'
gem 'nokogiri', '>=1.0.6'
gem 'hpricot', '>=0.6.170'
require 'open-uri'
require 'benchmark'
require 'nokogiri'
require 'hpricot'

# Reads the whole document found at +uri+ (a local path or a URL) and returns
# it as a String.
#
# The block form of +open+ is used so the underlying handle is always closed.
# URI.open is preferred when it is publicly available, because a bare
# Kernel#open no longer accepts URLs on Ruby 3.0+; older Rubies fall back to
# the open-uri enhanced Kernel.open.
def read_document(uri)
  opener = URI.respond_to?(:open) ? URI : Kernel
  opener.open(uri, &:read)
end

# Builds one line of the summary for +library+.
#
# timing     - the Benchmark::Tms returned by the corresponding report.
# ntimes     - number of iterations that were measured.
# byte_count - size of the benchmarked document in bytes.
def average_summary(library, timing, ntimes, byte_count)
  format('it took an average of %.4f seconds for %s to parse and operate on an HTML snippet %d bytes long',
         timing.real / ntimes, library, byte_count)
end

[
  [1000, "#{File.dirname(__FILE__)}/sample_post.html"],
  [10, "http://slashdot.com/"],
].each do |ntimes, uri|
  html = read_document(uri)
  # String#size counts characters; the messages below report bytes.
  byte_count = html.bytesize
  summary = []

  puts "For an html snippet #{byte_count} bytes long ..."
  Benchmark.bm(20) do |x|
    x.report("regex * #{ntimes}") do
      ntimes.times do
        # gsub is non-destructive, so every iteration starts from the same
        # input and the results are intentionally discarded.
        # NOTE(review): '\2' refers to a second capture group that the
        # pattern does not define; the original author already flagged it.
        html.gsub(/(.*)<\/a>/i, '\2') # broken regex
        html.gsub(/<(script|noscript|object|embed|style|frameset|frame|iframe)[>\s\S]*<\/\1>/, '')
      end
    end

    # x.report returns the Benchmark::Tms for its block, so the averages are
    # computed from the measured time itself rather than from wall-clock
    # timestamps taken around the report call.
    nokogiri_timing = x.report("nokogiri * #{ntimes}") do
      ntimes.times do
        doc = Nokogiri::HTML(html)
        doc.search("a/text()").wrap("")
        doc.search("script", "noscript", "object", "embed", "style", "frameset", "frame", "iframe").unlink
        doc.inner_html
      end
    end
    summary << average_summary('Nokogiri', nokogiri_timing, ntimes, byte_count)

    hpricot_timing = x.report("hpricot * #{ntimes}") do
      ntimes.times do
        doc = Hpricot(html)
        doc.search("a/text()").wrap("")
        doc.search(["script", "noscript", "object", "embed", "style", "frameset", "frame", "iframe"]).remove
        doc.inner_html
      end
    end
    summary << average_summary('Hpricot', hpricot_timing, ntimes, byte_count)
  end

  puts
  puts summary
  puts
end