Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
testing performance of parsing xml/html with nodejs
//
// this benchmarks the parsing performance of xml/html parsers
// to be more precise, i just want to look for specific nodes/attributes
//
// i am testing
//
// * [htmlparser](https://github.com/tautologistics/node-htmlparser)
// * [html5](https://github.com/aredridel/html5)
// * [sax](https://github.com/isaacs/sax-js)
// * [jsdom](https://github.com/tmpvar/jsdom) + sizzle/jquery
//
// the output i get is:
//
// htmlparser done in 73 ms - memory: 2.09375 mb RSS - found 200 items
// html5 done in 1727 ms - memory: 15.60546875 mb RSS - found 0 items (TODO)
// sax done in 100 ms - memory: 0.0390625 mb RSS - found 200 items
// jsdom/jquery done in 728 ms - memory: 0.1171875 mb RSS - found 200 items
// Modules used by the benchmark plus its configuration.
// NOTE(review): `Step` does not appear to be referenced in the visible code.
var Seq = require('seq')
, Step = require('step')
, events = require('events')
, request = require('request')
, saxLib = require('sax')
, html = require('htmlparser')
, html5 = require('html5')
, jsdom = require('jsdom')
// document that is downloaded once and handed to every test
, uri = 'http://twitter.com/statuses/user_timeline/18975861.rss'
// tag name (and jQuery selector) whose occurrences the tests count
, lookfor = 'item'
// repetition count the parse loops in testHtml, testSax and testJsdom run to
, n = 10
//------------------------------------------------ request xml and test all
// Download the document once, then run the four parser tests strictly one
// after another (each test continues the Seq chain by calling `this`).
request({uri:uri}, function(err, res, body) {
  // A failed download leaves `body` undefined; feeding that to the parsers
  // only produces confusing follow-up errors, so report it and stop here.
  if (err) {
    console.error('request for %s failed:', uri, err)
    return
  }
  Seq()
    .seq(function(){testHtml(body, this)})
    .seq(function(){testHtml5(body, this)})
    .seq(function(){testSax(body, this)})
    .seq(function(){testJsdom(body, this)})
    .seq(function(){console.log('all done')})
})
//------------------------------------------------ tautologistic's htmlparser
// Benchmark tautologistics' htmlparser.
//
// Parses `body` `n` times; after every parse the resulting DOM is walked and
// each tag whose name equals `lookfor` is counted. When the last pass is
// finished the elapsed time, the RSS growth and the total count are printed
// and `cb` is called without arguments.
//
// If the parser reports an error, it is logged and handed to `cb` as its
// first argument; no further passes are run.
function testHtml(body, cb) {
  var t = Date.now()
    , m = process.memoryUsage().rss
    , done = 0
    , items = 0
    , htmlParser

  // Count matching tags in a node list, descending into child lists.
  function countTags(nodes) {
    for (var i = 0, len = nodes.length; i < len; i++) {
      var node = nodes[i]
      if (node.type === 'tag' && node.name === lookfor) items++
      if (node.children && node.children.length) countTags(node.children)
    }
  }

  var htmlHandler = new html.DefaultHandler(function(err, dom) {
    // Do not walk a DOM that may be missing or incomplete.
    if (err) {
      console.error('htmlparser failed:', err)
      return cb(err)
    }
    countTags(dom)
    if (++done === n) {
      console.log
        ( 'htmlparser done in %s ms - memory: %s mb RSS - found %s items'
        , (Date.now())-t, (process.memoryUsage().rss-m)/1048576, items )
      cb()
    } else {
      // the next pass is started from inside the handler callback
      htmlParser.parseComplete(body)
    }
  })
  htmlParser = new html.Parser(htmlHandler)
  htmlParser.parseComplete(body)
}
//------------------------------------------------ aredridel's html5
// Benchmark aredridel's html5 parser.
//
// The parser is attached to an EventEmitter and `body` is pushed through it
// `n` times, each time as one 'data' event followed by 'end'. Afterwards the
// elapsed time and the RSS growth are printed and `cb` is called without
// arguments.
//
// Nothing in here counts matching elements yet, so `items` is always
// reported as 0 (hence the TODO in the output line).
function testHtml5(body, cb) {
  var parser = new html5.Parser()
    , em = new events.EventEmitter()
    , t = Date.now()
    , m = process.memoryUsage().rss
    , items = 0
  // i guess 'done' is not implemented yet, although it is in the docs
  parser.on('done', function() {
    console.log('html5-done')
  })
  parser.parse(em)
  // Use the shared repetition count `n` rather than a literal 10 so that
  // this test performs the same number of passes as the other tests.
  for (var pass = 0; pass < n; pass++) {
    em.emit('data', body)
    em.emit('end')
  }
  console.log
    ( 'html5 done in %s ms - memory: %s mb RSS - found %s items (TODO)'
    , (Date.now())-t, (process.memoryUsage().rss-m)/1048576, items )
  cb()
}
//------------------------------------------------ isaac's sax-parser
// Benchmark isaacs' sax parser.
//
// One parser instance (created with `saxLib.parser(true)`) is written to and
// closed repeatedly; every opening tag whose name matches `lookfor` is
// counted. The driving loop keeps going until the 'end' handler has fired
// `n` times; on the n-th 'end' the elapsed time, the RSS growth and the
// total count are printed and `cb` is called without arguments.
// Parser errors are only logged.
function testSax(body, cb) {
  var parser = saxLib.parser(true)
  var startedAt = Date.now()
  var rssBefore = process.memoryUsage().rss
  var passes = 0
  var hits = 0

  // print the result line for this benchmark
  function report() {
    console.log(
      'sax done in %s ms - memory: %s mb RSS - found %s items',
      Date.now() - startedAt,
      (process.memoryUsage().rss - rssBefore) / 1048576,
      hits
    )
  }

  parser.onopentag = function(tag) {
    if (tag.name == lookfor) hits += 1
  }

  parser.onend = function() {
    passes += 1
    if (passes != n) return
    report()
    cb()
  }

  parser.onerror = function(err) {console.log(err)}

  // `passes` only advances inside the 'end' handler above
  while (passes < n) {
    parser.write(body).close()
  }
}
//------------------------------------------------ tmpvar's jsdom + sizzle/jquery
// Benchmark tmpvar's jsdom combined with jQuery.
//
// A jsdom window is created and jQuery is loaded into it via
// `jsdom.jQueryify`. Once that has completed, `body` is assigned as the
// HTML of the <body> element `n` times and, after each assignment, the
// number of elements matching the `lookfor` selector is added to the total.
// Timing and memory readings start only after jQuery has been loaded.
// Finally the result line is printed and `cb` is called without arguments.
function testJsdom(body, cb) {
  var win = jsdom.jsdom().createWindow()

  // runs once jQuery is available on the window
  function runBenchmark() {
    var startedAt = Date.now()
    var rssBefore = process.memoryUsage().rss
    var hits = 0

    for (var pass = 1; pass <= n; pass++) {
      win.$('body').html(body)
      hits += win.$(lookfor).length
    }

    console.log(
      'jsdom/jquery done in %s ms - memory: %s mb RSS - found %s items',
      Date.now() - startedAt,
      (process.memoryUsage().rss - rssBefore) / 1048576,
      hits
    )
    cb()
  }

  jsdom.jQueryify(win, 'http://code.jquery.com/jquery.min.js', runBenchmark)
}
@tmpvar

This comment has been minimized.

Copy link

commented Jan 30, 2011

which version of jsdom are you using? I would try HEAD

@guybrush

This comment has been minimized.

Copy link
Owner Author

commented Jan 30, 2011

with HEAD (6019785d) i get

sax done in 117 ms - memory: 1.96484375 mb RSS
htmlparser done in 47 ms - memory: 0.5703125 mb RSS
jsdom/jquery done in 924 ms - memory: 5.82421875 mb RSS   

good job! :D

anyway this test/code is very incomplete and may not be worth anything - just wanted to make a snapshot

@tmpvar

This comment has been minimized.

Copy link

commented Jan 30, 2011

cool, looks like I managed to chop off 1/3 of your execution time! I can't wait to spend some real time optimizing.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.