Skip to content

Instantly share code, notes, and snippets.

@guybrush
Created January 30, 2011 04:53
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save guybrush/802552 to your computer and use it in GitHub Desktop.
Save guybrush/802552 to your computer and use it in GitHub Desktop.
testing performance of parsing xml/html with nodejs
//
// this is about parsing-performance of xml/html-parsers
// to be more precise, i just want to look for specific nodes/attributes
//
// i am testing
//
// * [htmlparser](https://github.com/tautologistics/node-htmlparser)
// * [html5](https://github.com/aredridel/html5)
// * [sax](https://github.com/isaacs/sax-js)
// * [jsdom](https://github.com/tmpvar/jsdom) + sizzle/jquery
//
// the output i get is:
//
// htmlparser done in 73 ms - memory: 2.09375 mb RSS - found 200 items
// html5 done in 1727 ms - memory: 15.60546875 mb RSS - found 0 items (TODO)
// sax done in 100 ms - memory: 0.0390625 mb RSS - found 200 items
// jsdom/jquery done in 728 ms - memory: 0.1171875 mb RSS - found 200 items
var Seq = require('seq')
, Step = require('step')
, events = require('events')
, request = require('request')
, saxLib = require('sax')
, html = require('htmlparser')
, html5 = require('html5')
, jsdom = require('jsdom')
, uri = 'http://twitter.com/statuses/user_timeline/18975861.rss'
, lookfor = 'item'
, n = 10
//------------------------------------------------ request xml and test all
// Fetch the feed once, then run each parser benchmark against the same body,
// strictly one after another (each test calls Seq's `this` when it is done).
request({uri: uri}, function (err, res, body) {
  // Without a body there is nothing to benchmark; report the failure instead
  // of handing `undefined` to every parser.
  if (err) {
    console.error('request for %s failed:', uri, err);
    return;
  }
  Seq()
    .seq(function () { testHtml(body, this); })
    .seq(function () { testHtml5(body, this); })
    .seq(function () { testSax(body, this); })
    .seq(function () { testJsdom(body, this); })
    .seq(function () { console.log('all done'); });
});
//------------------------------------------------ tautologistic's htmlparser
/**
 * Benchmark tautologistics' htmlparser: parse `body` n times in a row and
 * count the tags named `lookfor` in every resulting DOM.
 *
 * The reported item count is the total over all n runs, and the reported
 * memory is the RSS growth since the start of the test, in megabytes.
 *
 * @param {string} body - markup to parse
 * @param {Function} cb - called without arguments once all n runs are done,
 *   or with the handler's error as first argument if a run reports one
 */
function testHtml(body, cb) {
  var startedAt = Date.now();
  var rssBefore = process.memoryUsage().rss;
  var done = 0;
  var items = 0;
  var htmlParser;

  // Recursively count every tag named `lookfor` in a list of DOM nodes.
  function countItems(nodes) {
    for (var i = 0, len = nodes.length; i < len; i++) {
      var node = nodes[i];
      if (node.type === 'tag' && node.name === lookfor) items++;
      if (node.children && node.children.length) countItems(node.children);
    }
  }

  var htmlHandler = new html.DefaultHandler(function (err, dom) {
    // Stop at the first failed run instead of walking a DOM that may be missing.
    if (err) {
      cb(err);
      return;
    }
    countItems(dom);
    if (++done === n) {
      console.log(
        'htmlparser done in %s ms - memory: %s mb RSS - found %s items',
        Date.now() - startedAt,
        (process.memoryUsage().rss - rssBefore) / 1048576,
        items
      );
      cb();
    } else {
      // Next run is started from the handler of the previous one.
      htmlParser.parseComplete(body);
    }
  });

  htmlParser = new html.Parser(htmlHandler);
  htmlParser.parseComplete(body);
}
//------------------------------------------------ aredridel's html5
/**
 * Benchmark aredridel's html5 parser by feeding `body` to it n times through
 * an EventEmitter ('data' followed by 'end' for every run).
 *
 * Nothing counts matching nodes yet, so the reported item count is always 0
 * (hence the "(TODO)" in the output line).
 *
 * @param {string} body - markup to parse
 * @param {Function} cb - called without arguments after the last run
 */
function testHtml5(body, cb) {
  var parser = new html5.Parser();
  var em = new events.EventEmitter();
  var startedAt = Date.now();
  var rssBefore = process.memoryUsage().rss;
  var items = 0; // TODO: never incremented, no node lookup implemented yet

  // Author's note: the 'done' event is in the docs but may not be implemented
  // yet, so completion is not driven by it.
  parser.on('done', function () {
    console.log('html5-done');
  });

  parser.parse(em);
  // NOTE(review): the same emitter is reused for every run; confirm the
  // parser tolerates repeated 'end' events.
  // Use the shared run count `n` like the other benchmarks (was hard-coded 10).
  for (var run = 0; run < n; run++) {
    em.emit('data', body);
    em.emit('end');
  }

  console.log(
    'html5 done in %s ms - memory: %s mb RSS - found %s items (TODO)',
    Date.now() - startedAt,
    (process.memoryUsage().rss - rssBefore) / 1048576,
    items
  );
  cb();
}
//------------------------------------------------ isaac's sax-parser
/**
 * Benchmark isaacs' sax parser: write `body` to one strict-mode parser and
 * close it, repeated until n 'end' callbacks have been seen, counting every
 * opened tag named `lookfor`.
 *
 * The loop relies on `onend` firing synchronously during `close()`; the
 * counter it checks is only advanced inside that callback.
 *
 * @param {string} body - markup to parse
 * @param {Function} cb - called without arguments after the n-th 'end'
 */
function testSax(body, cb) {
  var parser = saxLib.parser(true);
  var startedAt = Date.now();
  var rssBefore = process.memoryUsage().rss;
  var finished = 0;
  var matches = 0;

  // Parse errors are only logged.
  parser.onerror = function (err) {
    console.log(err);
  };

  parser.onopentag = function (node) {
    if (node.name === lookfor) {
      matches += 1;
    }
  };

  parser.onend = function () {
    finished += 1;
    if (finished !== n) {
      return;
    }
    console.log(
      'sax done in %s ms - memory: %s mb RSS - found %s items',
      Date.now() - startedAt,
      (process.memoryUsage().rss - rssBefore) / 1048576,
      matches
    );
    cb();
  };

  while (finished < n) {
    parser.write(body).close();
  }
}
//------------------------------------------------ tmpvar's jsdom + sizzle/jquery
/**
 * Benchmark tmpvar's jsdom with jQuery: once jQuery has been attached to a
 * fresh window, set `body` as the body's HTML n times and count the elements
 * matching the `lookfor` selector after each run.
 *
 * Timing and memory measurement start inside the jQueryify callback, so
 * window creation and loading jQuery are not part of the reported numbers.
 *
 * @param {string} body - markup to insert into the document body
 * @param {Function} cb - called without arguments after the last run
 */
function testJsdom(body, cb) {
  var win = jsdom.jsdom().createWindow();
  var jqueryUrl = 'http://code.jquery.com/jquery.min.js';

  function runBenchmark() {
    var startedAt = Date.now();
    var rssBefore = process.memoryUsage().rss;
    var matches = 0;

    for (var run = 1; run <= n; run++) {
      win.$('body').html(body);
      matches += win.$(lookfor).length;
    }

    console.log(
      'jsdom/jquery done in %s ms - memory: %s mb RSS - found %s items',
      Date.now() - startedAt,
      (process.memoryUsage().rss - rssBefore) / 1048576,
      matches
    );
    cb();
  }

  jsdom.jQueryify(win, jqueryUrl, runBenchmark);
}
@tmpvar
Copy link

tmpvar commented Jan 30, 2011

which version of jsdom are you using? I would try HEAD

@guybrush
Copy link
Author

with HEAD (6019785d) i get

sax done in 117 ms - memory: 1.96484375 mb RSS
htmlparser done in 47 ms - memory: 0.5703125 mb RSS
jsdom/jquery done in 924 ms - memory: 5.82421875 mb RSS   

good job! :D

anyway this test/code is very incomplete and may not be worth anything - just wanted to make a snapshot

@tmpvar
Copy link

tmpvar commented Jan 30, 2011

cool, looks like I managed to chop off 1/3 of your execution time! I can't wait to spend some real time optimizing.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment