iconv html stream decoder (reads charset from <meta> tag, uses it to decode document to utf8)
// Buffers a stream of HTML until it sees a charset <meta> tag (or an opening <?xml tag with an encoding),
// then creates an iconv-lite decoder for that charset and sends all data (the buffer and any future data) through it, emitting node.js-friendly utf8.
// If it cannot find a charset by the time the </head> tag is reached, it gives up and decodes each chunk as utf8, hoping that it's in a format that node.js can read.
// Based on iconv-lite's decodeStream.
// TODO: clean this up, add some tests, and stick it on npm
// == Decoder stream ======================================================= | |
/**
 * Transform stream that turns a byte stream of HTML (or XML) into utf8 text.
 *
 * Instances start in "buffering" mode (`isBuffering === true`) with an empty
 * byte buffer and a utf8 decoder as the fallback converter.
 *
 * @constructor
 * @param {Object} [options] - Stream options forwarded to `Transform`. The
 *   `encoding` (output encoding) is always forced to 'utf8'; the caller's
 *   object is copied, never modified.
 */
function HTMLDecodeStream(options) {
  // Work on a copy so the caller's options object is left untouched.
  var streamOptions = Object.assign({}, options, { encoding: 'utf8' });

  // Bytes collected so far while no charset has been chosen.
  this.buff = Buffer.alloc(0);
  this.isBuffering = true;

  // Encoding assumed for the *input* until something better is detected.
  this.inputEncoding = 'utf8';
  // This is the *output* encoding.
  this.encoding = streamOptions.encoding;

  // The converter is always fed Buffers and must produce text, so it has to
  // be a decoder (an encoder expects strings as input).
  this.conv = iconv.getDecoder(this.inputEncoding);

  Transform.call(this, streamOptions);
}

HTMLDecodeStream.prototype = Object.create(Transform.prototype, {
  constructor: {
    value: HTMLDecodeStream
  }
});
/**
 * Transform hook: routes each incoming chunk either to the buffering/sniffing
 * step or straight to the converter, depending on `isBuffering`.
 *
 * @param {Buffer} chunk - Raw bytes; anything that is not a Buffer is rejected.
 * @param {string} encoding - Passed through unchanged to the chosen handler.
 * @param {Function} done - Completion callback; receives an Error when the
 *   chunk is not a Buffer.
 */
HTMLDecodeStream.prototype._transform = function(chunk, encoding, done) {
  if (!Buffer.isBuffer(chunk)) {
    return done(new Error("delayed decoding stream needs buffers as its input."));
  }

  // Still sniffing for a charset -> keep buffering; otherwise convert directly.
  var handler = this.isBuffering ? this.bufferAndTest : this.stream;
  handler.call(this, chunk, encoding, done);
};
/**
 * Runs one chunk through the current converter (`this.conv`) and pushes any
 * non-empty result downstream using the output encoding.
 *
 * @param {Buffer} chunk - Bytes to convert.
 * @param {string} encoding - Unused here; kept for signature parity with the
 *   other chunk handlers.
 * @param {Function} done - Called with no arguments on success, or with the
 *   caught exception on failure.
 */
HTMLDecodeStream.prototype.stream = function(chunk, encoding, done) {
  var converted;
  try {
    converted = this.conv.write(chunk);
    // The converter may return nothing for a chunk; only forward real output.
    if (converted && converted.length) {
      this.push(converted, this.encoding);
    }
    done();
  } catch (err) {
    done(err);
  }
};
/**
 * Appends `chunk` to the pending buffer and scans everything buffered so far
 * for a charset declaration.
 *
 * - If a `<meta ... charset=...>` tag or an `<?xml ... encoding="..."?>`
 *   declaration is found, streaming starts with that charset.
 * - If `</head>` is reached without one, streaming starts with 'utf8'.
 * - Otherwise the chunk stays buffered and `done` is called so that the next
 *   chunk can be delivered.
 *
 * @param {Buffer} chunk - Newly received bytes.
 * @param {string} encoding - Passed through to `startStreaming`.
 * @param {Function} done - Completion callback; invoked exactly once, either
 *   here or via `startStreaming`.
 */
HTMLDecodeStream.prototype.bufferAndTest = function(chunk, encoding, done) {
  // HTML tag and attribute names are case-insensitive, and whitespace is
  // allowed around "="; a trailing "/" or ";" is not part of the charset name.
  var metaCharset = /<meta\s[^>]*charset\s*=\s*['"]?([^\s'">\/;]+)/i;
  // XML declarations may quote the encoding with either quote character.
  var xmlEncoding = /<\?xml[^>]+encoding\s*=\s*["']([^"'>]+)["']/;
  var endOfHead = /<\/head>/i;

  this.buff = Buffer.concat([this.buff, chunk]);
  var str = this.buff.toString();

  // extract the charset from a meta tag or the opening <?xml tag
  var charsetMatch = str.match(metaCharset) || str.match(xmlEncoding);

  if (charsetMatch) {
    this.startStreaming(charsetMatch[1], encoding, done);
  } else if (endOfHead.test(str)) {
    // go with the safest guess for the charset
    this.startStreaming('utf8', encoding, done);
  } else {
    // Nothing conclusive yet: acknowledge the chunk, otherwise the stream
    // never receives another one and stalls.
    done();
  }
};
/**
 * Leaves buffering mode: selects the converter for `charset`, announces the
 * encoding in use via a 'charset' event, and forwards everything buffered so
 * far through the converter.
 *
 * @param {string} charset - Charset name detected in the document. If iconv
 *   does not recognize it, the existing converter and `inputEncoding` are kept
 *   and a message is written to stderr.
 * @param {string} encoding - Passed through to `stream`.
 * @param {Function} done - Completion callback, invoked by `stream`.
 */
HTMLDecodeStream.prototype.startStreaming = function(charset, encoding, done) {
  // setup the decoder
  if (iconv.encodingExists(charset)) {
    this.inputEncoding = charset;
    this.conv = iconv.getDecoder(this.inputEncoding);
  } else {
    // Report the charset that was rejected, not the fallback being used.
    console.error("unrecognized charset %s, decoding as utf8", charset);
  }

  this.emit('charset', this.inputEncoding);
  this.isBuffering = false;

  // Detach the buffer *before* forwarding it, so that nothing triggered by
  // `done` (and in particular _flush) can see it and send the data twice.
  var buffered = this.buff;
  this.buff = null;

  // decode and forward our existing buffer
  this.stream(buffered, encoding, done);
};
/**
 * Flush hook: converts whatever is still sitting in the pending buffer (if
 * any), then finalizes the converter and pushes its remaining output.
 *
 * @param {Function} done - Called with no arguments on success, or with the
 *   caught exception on failure.
 */
HTMLDecodeStream.prototype._flush = function(done) {
  var self = this;

  // Push converter output downstream, skipping empty/absent results.
  function forward(output) {
    if (output && output.length) {
      self.push(output, self.encoding);
    }
  }

  try {
    if (this.buff) {
      // Input ended while bytes were still buffered.
      forward(this.conv.write(this.buff));
      this.buff = null;
    }
    forward(this.conv.end());
    done();
  } catch (err) {
    done(err);
  }
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment