Skip to content

Instantly share code, notes, and snippets.

@nfriedly
Created July 1, 2015 20:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nfriedly/c4c41c0b053a1106ebe1 to your computer and use it in GitHub Desktop.
Save nfriedly/c4c41c0b053a1106ebe1 to your computer and use it in GitHub Desktop.
iconv html stream decoder (reads charset from <meta> tag, uses it to decode document to utf8)
// buffers a stream of html until it sees a charset meta tag (or an opening <?xml tag with an encoding)
// then it creates an iconv-lite decoder for the charset and sends all data (the buffer and any future data) through it, emitting node.js-friendly utf8
// if it cannot find a charset by the time the </head> tag is reached, it gives up and just calls .toString() on each chunk hoping that it's in a format that node.js can read
// based on iconv-lite's decodeStream
// todo: clean this up, add some tests, and stick it on npm
// == Decoder stream =======================================================
/**
 * Transform stream that sniffs the character set of an HTML/XML byte stream
 * and re-emits the document as utf8 strings.
 *
 * Incoming chunks are buffered until either a charset declaration
 * (<meta ... charset=...> or <?xml ... encoding="..."?>) or the closing
 * </head> tag is seen. From then on every chunk, starting with the buffered
 * ones, is passed through an iconv decoder for the detected charset.
 * A 'charset' event carrying the encoding actually used is emitted once, at
 * the moment streaming starts.
 *
 * @param {Object} [options] - stream options forwarded to Transform; the
 *     `encoding` option is always forced to 'utf8' (the *output* encoding).
 */
function HTMLDecodeStream(options) {
    // Work on a copy so the caller's options object is never mutated.
    var streamOptions = Object.assign({}, options);
    streamOptions.encoding = 'utf8';
    Transform.call(this, streamOptions);

    this.buff = Buffer.alloc(0);
    this.isBuffering = true;
    this.inputEncoding = 'utf8';
    this.encoding = 'utf8'; // this is the *output* encoding
    // Default converter, used when no usable charset is ever detected. It must
    // be a decoder (Buffer in, string out) because it is fed raw Buffers.
    this.conv = iconv.getDecoder(this.inputEncoding);
}

HTMLDecodeStream.prototype = Object.create(Transform.prototype, {
    constructor: {
        value: HTMLDecodeStream
    }
});

/**
 * Transform hook: routes each chunk to the sniffing phase or the decoding
 * phase depending on whether a charset decision has been made yet.
 *
 * @param {Buffer} chunk - raw bytes of the document
 * @param {string} encoding - encoding label supplied by the stream machinery
 * @param {Function} done - callback; receives an Error if chunk is not a Buffer
 */
HTMLDecodeStream.prototype._transform = function(chunk, encoding, done) {
    if (!Buffer.isBuffer(chunk)) {
        return done(new Error("delayed decoding stream needs buffers as its input."));
    }
    if (this.isBuffering) {
        this.bufferAndTest(chunk, encoding, done);
    } else {
        this.stream(chunk, encoding, done);
    }
};

/**
 * Decodes one chunk with the current converter and pushes the result.
 *
 * @param {Buffer} chunk - raw bytes to decode
 * @param {string} encoding - unused; kept for signature symmetry
 * @param {Function} done - callback; receives any error thrown while decoding
 */
HTMLDecodeStream.prototype.stream = function(chunk, encoding, done) {
    try {
        var res = this.conv.write(chunk);
        if (res && res.length) this.push(res, this.encoding);
    } catch (e) {
        return done(e);
    }
    // Called outside the try so an exception raised by the callback itself
    // cannot cause done() to be invoked a second time.
    done();
};

/**
 * Sniffing phase: appends the chunk to the pending buffer and looks for a
 * charset declaration or the end of the document head.
 *
 * @param {Buffer} chunk - raw bytes to append
 * @param {string} encoding - passed through to startStreaming
 * @param {Function} done - callback; always invoked exactly once
 */
HTMLDecodeStream.prototype.bufferAndTest = function(chunk, encoding, done) {
    this.buff = Buffer.concat([this.buff, chunk]);
    var str = this.buff.toString();
    // Extract the charset from a meta tag or the opening <?xml tag. HTML tag
    // and attribute names are case-insensitive and whitespace may surround
    // '='; the XML declaration may use either quote style.
    var charsetMatch = str.match(/<meta\s[^>]*charset\s*=\s*['"]?([^\s'">]+)/i) ||
        str.match(/<\?xml[^>]+encoding\s*=\s*["']([^"'>]+)["']/);
    if (charsetMatch) {
        this.startStreaming(charsetMatch[1], encoding, done);
    } else if (/<\/head>/i.test(str)) {
        // go with the safest guess for the charset
        this.startStreaming('utf8', encoding, done);
    } else {
        // Nothing conclusive yet: keep buffering, but acknowledge the chunk,
        // otherwise the stream never delivers the next one.
        done();
    }
};

/**
 * Ends the sniffing phase: selects the decoder, announces the charset and
 * flushes everything buffered so far through the decoder.
 *
 * @param {string} charset - charset name found in the document (or 'utf8')
 * @param {string} encoding - passed through to stream()
 * @param {Function} done - callback forwarded to stream()
 */
HTMLDecodeStream.prototype.startStreaming = function(charset, encoding, done) {
    // setup the decoder
    if (iconv.encodingExists(charset)) {
        this.inputEncoding = charset;
        this.conv = iconv.getDecoder(this.inputEncoding);
    } else {
        // Report the name that was rejected, not the fallback we keep using.
        console.error("unrecognized charset %s, decoding as utf8", charset);
    }
    this.emit('charset', this.inputEncoding);
    this.isBuffering = false;
    // Detach the buffer first so _flush can never send the same data twice.
    var pending = this.buff;
    this.buff = null;
    // decode and forward our existing buffer
    this.stream(pending, encoding, done);
};

/**
 * Flush hook: decodes anything still buffered (the document ended before a
 * charset decision was made) and drains the converter's internal state.
 *
 * @param {Function} done - callback; receives any error thrown while decoding
 */
HTMLDecodeStream.prototype._flush = function(done) {
    try {
        var res;
        if (this.buff) {
            res = this.conv.write(this.buff);
            if (res && res.length) this.push(res, this.encoding);
            this.buff = null;
        }
        res = this.conv.end();
        if (res && res.length) this.push(res, this.encoding);
    } catch (e) {
        return done(e);
    }
    done();
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment