nfriedly/html-iconv-stream.js

## html-iconv-stream.js
// buffers a stream of html until it sees a charset meta tag (or an opening <?xml tag with an encoding)
// then it creates an iconv-lite decoder for the charset and sends all data (the buffer and any future data) through it, emitting node.js-friendly utf8
// if it cannot find a charset by the time the </head> tag is reached, it gives up and just calls .toString() on each chunk hoping that it's in a format that node.js can read
// based on iconv-lite's decodeStream

// todo: clean this up, add some tests, and stick it on npm
// == Decoder stream =======================================================
/**
 * Transform stream that buffers incoming HTML/XML bytes until a charset
 * declaration (or the end of <head>) is seen, then decodes everything through
 * an iconv-lite decoder and emits utf8 text.
 *
 * @param {Object} [options] Options forwarded to the Transform constructor.
 *     The `encoding` (output encoding) is always forced to 'utf8'. The object
 *     passed in is copied, never modified.
 */
function HTMLDecodeStream(options) {
    // Copy rather than mutate, so the caller's options object stays untouched.
    var streamOptions = Object.assign({}, options, { encoding: 'utf8' });

    this.buff = Buffer.alloc(0); // `new Buffer(...)` is deprecated
    this.isBuffering = true;
    this.inputEncoding = 'utf8';
    this.encoding = streamOptions.encoding; // this is the *output* encoding
    // Default converter, used when the input ends (or the charset is unknown)
    // before a better one is installed. This stream turns bytes into text, so
    // it has to be a decoder, not an encoder.
    this.conv = iconv.getDecoder(this.inputEncoding);
    Transform.call(this, streamOptions);
}

// Inherit from Transform, then restore `constructor` as a non-enumerable,
// read-only property that points back at HTMLDecodeStream.
HTMLDecodeStream.prototype = Object.create(Transform.prototype);
Object.defineProperty(HTMLDecodeStream.prototype, 'constructor', {
    value: HTMLDecodeStream
});

/**
 * Transform hook: rejects non-Buffer input, otherwise hands the chunk to
 * `bufferAndTest` while still buffering, or to `stream` once streaming.
 *
 * @param {Buffer} chunk incoming data; anything else is reported as an error
 * @param {string} encoding encoding label passed along with the chunk
 * @param {Function} done completion callback, `done(err)` on failure
 */
HTMLDecodeStream.prototype._transform = function(chunk, encoding, done) {
    if (!Buffer.isBuffer(chunk)) {
        return done(new Error("delayed decoding stream needs buffers as its input."));
    }

    // Pick the handler that matches the current mode.
    var handler = this.isBuffering ? this.bufferAndTest : this.stream;
    handler.call(this, chunk, encoding, done);
};

/**
 * Runs one chunk through the current converter and pushes any non-empty
 * result downstream using the output encoding.
 *
 * @param {Buffer} chunk data to convert
 * @param {string} encoding unused; kept for signature parity with _transform
 * @param {Function} done called with no arguments on success, or with the
 *     error thrown while converting/pushing
 */
HTMLDecodeStream.prototype.stream = function(chunk, encoding, done) {
    try {
        var converted = this.conv.write(chunk);
        var hasOutput = Boolean(converted) && Boolean(converted.length);
        if (hasOutput) {
            this.push(converted, this.encoding);
        }
        done();
    } catch (err) {
        done(err);
    }
};

/**
 * Appends the chunk to the internal buffer and looks for a charset
 * declaration in everything buffered so far.
 *
 * - charset found (HTML meta tag or <?xml ... encoding=...?>): start streaming
 *   with that charset.
 * - no charset but </head> reached: give up and start streaming as utf8.
 * - otherwise: keep buffering and acknowledge the chunk.
 *
 * @param {Buffer} chunk newly received data
 * @param {string} encoding encoding label passed through to startStreaming
 * @param {Function} done completion callback for this chunk
 */
HTMLDecodeStream.prototype.bufferAndTest = function(chunk, encoding, done) {
    this.buff = Buffer.concat([this.buff, chunk]);
    var str = this.buff.toString();
    // HTML tag and attribute names are case-insensitive, hence the `i` flag.
    // The XML declaration may quote the encoding with either quote character.
    var charsetMatch = str.match(/<meta\s[^>]*charset\s*=\s*['"]?([^\s'">]+)/i) ||
        str.match(/<\?xml[^>]+encoding=['"]([^'">]+)['"]/);
    if (charsetMatch) {
        this.startStreaming(charsetMatch[1], encoding, done);
    } else if (/<\/head>/i.test(str)) {
        // go with the safest guess for the charset
        this.startStreaming('utf8', encoding, done);
    } else {
        // Nothing conclusive yet. The callback must still be invoked, otherwise
        // the Transform never hands over the next chunk and the stream stalls.
        done();
    }
};

/**
 * Leaves buffering mode: installs a decoder for `charset` (or for utf8 when
 * iconv does not know the charset), emits a 'charset' event with the encoding
 * actually used, and forwards everything buffered so far through the decoder.
 *
 * @param {string} charset charset name extracted from the document
 * @param {string} encoding encoding label passed through to stream()
 * @param {Function} done completion callback, invoked by stream()
 */
HTMLDecodeStream.prototype.startStreaming = function(charset, encoding, done) {
    // setup the decoder
    if (iconv.encodingExists(charset)) {
        this.inputEncoding = charset;
    } else {
        // Report the charset that was rejected, not the previous encoding.
        console.error("unrecognized charset %s, decoding as utf8", charset);
        this.inputEncoding = 'utf8';
    }
    // Always install a decoder here so the fallback path really decodes utf8.
    this.conv = iconv.getDecoder(this.inputEncoding);
    this.emit('charset', this.inputEncoding);
    this.isBuffering = false;
    // Detach the buffer before forwarding it, so nothing that runs from
    // `done` (including _flush) can see it and send the data twice.
    var pending = this.buff;
    this.buff = null;
    // decode and forward our existing buffer
    this.stream(pending, encoding, done);
};

/**
 * Flush hook: converts whatever is still held in the buffer (input ended
 * before streaming started), then finalizes the converter and pushes any
 * trailing output.
 *
 * @param {Function} done called with no arguments on success, or with the
 *     error thrown along the way
 */
HTMLDecodeStream.prototype._flush = function(done) {
    var self = this;

    // Push only non-empty results downstream.
    function forward(output) {
        if (output && output.length) self.push(output, self.encoding);
    }

    try {
        if (this.buff) {
            forward(this.conv.write(this.buff));
            this.buff = null;
        }
        forward(this.conv.end());
        done();
    } catch (err) {
        done(err);
    }
};
	// buffers a stream of html until it sees a charset meta tag (or an opening <?xml tag with an encoding)
	// then it creates an iconv-lite decoder for the charset and sends all data (the buffer and any future data) through it, emitting node.js-friendly utf8
	// if it cannot find a charset by the time the </head> tag is reached, it gives up and just calls .toString() on each chunk hoping that it's in a format that node.js can read
	// based on iconv-lite's decodeStream

	// todo: clean this up, add some tests, and stick it on npm
	// == Decoder stream =======================================================
	/**
	 * Transform stream that buffers incoming HTML/XML bytes until a charset
	 * declaration (or the end of <head>) is seen, then decodes everything through
	 * an iconv-lite decoder and emits utf8 text.
	 *
	 * @param {Object} [options] Options forwarded to the Transform constructor.
	 *     The `encoding` (output encoding) is always forced to 'utf8'. The object
	 *     passed in is copied, never modified.
	 */
	function HTMLDecodeStream(options) {
		// Copy rather than mutate, so the caller's options object stays untouched.
		var streamOptions = Object.assign({}, options, { encoding: 'utf8' });

		this.buff = Buffer.alloc(0); // `new Buffer(...)` is deprecated
		this.isBuffering = true;
		this.inputEncoding = 'utf8';
		this.encoding = streamOptions.encoding; // this is the output encoding
		// Default converter, used when the input ends (or the charset is unknown)
		// before a better one is installed. This stream turns bytes into text, so
		// it has to be a decoder, not an encoder.
		this.conv = iconv.getDecoder(this.inputEncoding);
		Transform.call(this, streamOptions);
	}

	// Inherit from Transform, then restore `constructor` as a non-enumerable,
	// read-only property that points back at HTMLDecodeStream.
	HTMLDecodeStream.prototype = Object.create(Transform.prototype);
	Object.defineProperty(HTMLDecodeStream.prototype, 'constructor', {
		value: HTMLDecodeStream
	});

	/**
	 * Transform hook: rejects non-Buffer input, otherwise hands the chunk to
	 * `bufferAndTest` while still buffering, or to `stream` once streaming.
	 *
	 * @param {Buffer} chunk incoming data; anything else is reported as an error
	 * @param {string} encoding encoding label passed along with the chunk
	 * @param {Function} done completion callback, `done(err)` on failure
	 */
	HTMLDecodeStream.prototype._transform = function(chunk, encoding, done) {
		if (!Buffer.isBuffer(chunk)) {
			return done(new Error("delayed decoding stream needs buffers as its input."));
		}

		// Pick the handler that matches the current mode.
		var handler = this.isBuffering ? this.bufferAndTest : this.stream;
		handler.call(this, chunk, encoding, done);
	};

	/**
	 * Runs one chunk through the current converter and pushes any non-empty
	 * result downstream using the output encoding.
	 *
	 * @param {Buffer} chunk data to convert
	 * @param {string} encoding unused; kept for signature parity with _transform
	 * @param {Function} done called with no arguments on success, or with the
	 *     error thrown while converting/pushing
	 */
	HTMLDecodeStream.prototype.stream = function(chunk, encoding, done) {
		try {
			var converted = this.conv.write(chunk);
			var hasOutput = Boolean(converted) && Boolean(converted.length);
			if (hasOutput) {
				this.push(converted, this.encoding);
			}
			done();
		} catch (err) {
			done(err);
		}
	};

	/**
	 * Appends the chunk to the internal buffer and looks for a charset
	 * declaration in everything buffered so far.
	 *
	 * - charset found (HTML meta tag or <?xml ... encoding=...?>): start streaming
	 *   with that charset.
	 * - no charset but </head> reached: give up and start streaming as utf8.
	 * - otherwise: keep buffering and acknowledge the chunk.
	 *
	 * @param {Buffer} chunk newly received data
	 * @param {string} encoding encoding label passed through to startStreaming
	 * @param {Function} done completion callback for this chunk
	 */
	HTMLDecodeStream.prototype.bufferAndTest = function(chunk, encoding, done) {
		this.buff = Buffer.concat([this.buff, chunk]);
		var str = this.buff.toString();
		// HTML tag and attribute names are case-insensitive, hence the `i` flag.
		// The XML declaration may quote the encoding with either quote character.
		var charsetMatch = str.match(/<meta\s[^>]*charset\s*=\s*['"]?([^\s'">]+)/i) ||
			str.match(/<\?xml[^>]+encoding=['"]([^'">]+)['"]/);
		if (charsetMatch) {
			this.startStreaming(charsetMatch[1], encoding, done);
		} else if (/<\/head>/i.test(str)) {
			// go with the safest guess for the charset
			this.startStreaming('utf8', encoding, done);
		} else {
			// Nothing conclusive yet. The callback must still be invoked, otherwise
			// the Transform never hands over the next chunk and the stream stalls.
			done();
		}
	};

	/**
	 * Leaves buffering mode: installs a decoder for `charset` (or for utf8 when
	 * iconv does not know the charset), emits a 'charset' event with the encoding
	 * actually used, and forwards everything buffered so far through the decoder.
	 *
	 * @param {string} charset charset name extracted from the document
	 * @param {string} encoding encoding label passed through to stream()
	 * @param {Function} done completion callback, invoked by stream()
	 */
	HTMLDecodeStream.prototype.startStreaming = function(charset, encoding, done) {
		// setup the decoder
		if (iconv.encodingExists(charset)) {
			this.inputEncoding = charset;
		} else {
			// Report the charset that was rejected, not the previous encoding.
			console.error("unrecognized charset %s, decoding as utf8", charset);
			this.inputEncoding = 'utf8';
		}
		// Always install a decoder here so the fallback path really decodes utf8.
		this.conv = iconv.getDecoder(this.inputEncoding);
		this.emit('charset', this.inputEncoding);
		this.isBuffering = false;
		// Detach the buffer before forwarding it, so nothing that runs from
		// `done` (including _flush) can see it and send the data twice.
		var pending = this.buff;
		this.buff = null;
		// decode and forward our existing buffer
		this.stream(pending, encoding, done);
	};

	/**
	 * Flush hook: converts whatever is still held in the buffer (input ended
	 * before streaming started), then finalizes the converter and pushes any
	 * trailing output.
	 *
	 * @param {Function} done called with no arguments on success, or with the
	 *     error thrown along the way
	 */
	HTMLDecodeStream.prototype._flush = function(done) {
		var self = this;

		// Push only non-empty results downstream.
		function forward(output) {
			if (output && output.length) self.push(output, self.encoding);
		}

		try {
			if (this.buff) {
				forward(this.conv.write(this.buff));
				this.buff = null;
			}
			forward(this.conv.end());
			done();
		} catch (err) {
			done(err);
		}
	};