qwerasd205/utf8decode.ts Secret

## utf8decode.ts
// This code is released for free, by the author, for use by anyone for any purpose,
// modified in any way, in whole, or in part, with or without this notice included.
//
// Please note:
//
// THIS CODE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
// INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHOR OF THIS CODE
// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
// CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE CODE
// OR THE USE OR OTHER DEALINGS IN THE CODE.


// This function is based on Bjoern Hoehrmann's DFA UTF-8 decoder.
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
//
// Inserting a character into the buffer and flushing it if necessary
// could be a function, but I've decided to inline it in all places
// where it occurs, as it yields a performance benefit.
// I also inlined a data table in two places, as that, too, yielded
// a small performance increase.
// Please excuse the ugliness ^-^

/**
 * Decodes UTF-8 bytes into a JavaScript (UTF-16) string.
 *
 * Each byte is mapped to a character class, and a transition table indexed by
 * `state + class` yields the next decoder state: 0 means a code point is
 * complete, 12 means the byte is rejected, any other value means the decoder
 * is in the middle of a multi-byte sequence.
 *
 * Unless `fatal` is set, malformed input is replaced with U+FFFD: one for each
 * byte that cannot start a sequence and one for each sequence that is cut
 * short. A byte that cuts a sequence short is then decoded again as the
 * potential start of a new sequence, so e.g. `E0 80 41` decodes to
 * U+FFFD U+FFFD "A".
 *
 * @param utf8data - The bytes to decode.
 * @param fatal - When true, throw instead of inserting U+FFFD.
 * @param ignoreBOM - When true, a leading `EF BB BF` is skipped; when false it
 *   is decoded like any other sequence and appears in the result as U+FEFF.
 *   NOTE(review): this looks like the opposite sense of `TextDecoder`'s
 *   `ignoreBOM` option; left unchanged for backward compatibility - confirm.
 * @returns The decoded string.
 * @throws TypeError if `fatal` is true and the data contains an invalid byte
 *   or ends in the middle of a sequence.
 */
function decode (utf8data: Uint8Array, fatal: boolean = false, ignoreBOM: boolean = false): string {
    let outString = "";

    // UTF-16 code units are staged here and appended to the string one chunk
    // at a time instead of one character at a time.
    const outBufferLength = Math.min(1024, utf8data.length);
    const outBuffer = new Uint16Array(outBufferLength);
    let outIndex = 0;

    let state = 0;
    let codepoint = 0;
    let type: number;
    let nextState: number;
    let byte: number;

    let i = (ignoreBOM && utf8data[0] === 0xEF && utf8data[1] === 0xBB && utf8data[2] === 0xBF) ? 3 : 0;

    for (; i < utf8data.length; ++i) {
        byte = utf8data[i];

        // Character class of the current byte.
        type = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
                1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
                7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
                8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
               10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8][byte];

        // State the decoder would move to if this byte were consumed.
        nextState = [0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
                    12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
                    12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
                    12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
                    12,36,12,12,12,12,12,12,12,12,12,12][state + type];

        // Encoding error handling: the byte is not allowed in the current state.
        if (nextState === 12) {
            if (fatal) throw new TypeError(`Decoder error. Invalid byte in sequence at position ${i} in data.`);
            outBuffer[outIndex++] = 0xFFFD; // Replacement character
            if (outIndex === outBufferLength) {
                outString += String.fromCharCode(...outBuffer);
                outIndex = 0;
            }
            if (state !== 0) {
                // The replacement stands for the sequence that was cut short.
                // Step back so the loop's ++i visits this byte again, this
                // time as the possible start of a new sequence.
                state = 0;
                --i;
            }
            continue;
        }

        // A lead byte contributes its payload bits (selected by its class);
        // a continuation byte appends its low six bits.
        codepoint = (state !== 0)
            ? (byte & 0x3f) | (codepoint << 6)
            : (0xff >> type) & byte;
        state = nextState;

        if (state !== 0) continue;

        // Add codepoint to buffer (as charcodes for utf-16), and flush buffer to string if needed.
        if (codepoint > 0xFFFF) {
            // High surrogate: 0xD7C0 + (cp >> 10) equals 0xD800 + ((cp - 0x10000) >> 10).
            outBuffer[outIndex++] = (0xD7C0 + (codepoint >> 10));
            if (outIndex === outBufferLength) {
                outString += String.fromCharCode(...outBuffer);
                outIndex = 0;
            }
            // Low surrogate.
            outBuffer[outIndex++] = (0xDC00 | (codepoint & 0x3FF));
            if (outIndex === outBufferLength) {
                outString += String.fromCharCode(...outBuffer);
                outIndex = 0;
            }
        } else {
            outBuffer[outIndex++] = codepoint;
            if (outIndex === outBufferLength) {
                outString += String.fromCharCode(...outBuffer);
                outIndex = 0;
            }
        }
    }

    // Add a replacement character if the data ended in the middle of a sequence.
    // The buffer is flushed whenever it fills up, so there is room for it.
    if (state !== 0) {
        if (fatal) throw new TypeError(`Decoder error. Unexpected end of data.`);
        outBuffer[outIndex++] = 0xFFFD; // Replacement character
    }

    // Final flush of buffer
    outString += String.fromCharCode(...outBuffer.subarray(0, outIndex));

    return outString;
}
	// This code is released for free, by the author, for use by anyone for any purpose,
	// modified in any way, in whole, or in part, with or without this notice included.
	//
	// Please note:
	//
	// THIS CODE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
	// INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
	// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHOR OF THIS CODE
	// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
	// CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE CODE
	// OR THE USE OR OTHER DEALINGS IN THE CODE.


	// This function is based on Bjoern Hoehrmann's DFA UTF-8 decoder.
	// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
	//
	// Inserting a character into the buffer and flushing it if necessary
	// could be a function, but I've decided to inline it in all places
	// where it occurs, as it yields a performance benefit.
	// I also inlined a data table in two places, as that, too, yielded
	// a small performance increase.
	// Please excuse the ugliness ^-^

	/**
	 * Decodes UTF-8 bytes into a JavaScript (UTF-16) string.
	 *
	 * Each byte is mapped to a character class, and a transition table indexed by
	 * `state + class` yields the next decoder state: 0 means a code point is
	 * complete, 12 means the byte is rejected, any other value means the decoder
	 * is in the middle of a multi-byte sequence.
	 *
	 * Unless `fatal` is set, malformed input is replaced with U+FFFD: one for each
	 * byte that cannot start a sequence and one for each sequence that is cut
	 * short. A byte that cuts a sequence short is then decoded again as the
	 * potential start of a new sequence, so e.g. `E0 80 41` decodes to
	 * U+FFFD U+FFFD "A".
	 *
	 * @param utf8data - The bytes to decode.
	 * @param fatal - When true, throw instead of inserting U+FFFD.
	 * @param ignoreBOM - When true, a leading `EF BB BF` is skipped; when false it
	 *   is decoded like any other sequence and appears in the result as U+FEFF.
	 *   NOTE(review): this looks like the opposite sense of `TextDecoder`'s
	 *   `ignoreBOM` option; left unchanged for backward compatibility - confirm.
	 * @returns The decoded string.
	 * @throws TypeError if `fatal` is true and the data contains an invalid byte
	 *   or ends in the middle of a sequence.
	 */
	function decode (utf8data: Uint8Array, fatal: boolean = false, ignoreBOM: boolean = false): string {
		let outString = "";

		// UTF-16 code units are staged here and appended to the string one chunk
		// at a time instead of one character at a time.
		const outBufferLength = Math.min(1024, utf8data.length);
		const outBuffer = new Uint16Array(outBufferLength);
		let outIndex = 0;

		let state = 0;
		let codepoint = 0;
		let type: number;
		let nextState: number;
		let byte: number;

		let i = (ignoreBOM && utf8data[0] === 0xEF && utf8data[1] === 0xBB && utf8data[2] === 0xBF) ? 3 : 0;

		for (; i < utf8data.length; ++i) {
			byte = utf8data[i];

			// Character class of the current byte.
			type = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
				1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
				7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
				8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
				10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8][byte];

			// State the decoder would move to if this byte were consumed.
			nextState = [0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
				12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
				12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
				12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
				12,36,12,12,12,12,12,12,12,12,12,12][state + type];

			// Encoding error handling: the byte is not allowed in the current state.
			if (nextState === 12) {
				if (fatal) throw new TypeError(`Decoder error. Invalid byte in sequence at position ${i} in data.`);
				outBuffer[outIndex++] = 0xFFFD; // Replacement character
				if (outIndex === outBufferLength) {
					outString += String.fromCharCode(...outBuffer);
					outIndex = 0;
				}
				if (state !== 0) {
					// The replacement stands for the sequence that was cut short.
					// Step back so the loop's ++i visits this byte again, this
					// time as the possible start of a new sequence.
					state = 0;
					--i;
				}
				continue;
			}

			// A lead byte contributes its payload bits (selected by its class);
			// a continuation byte appends its low six bits.
			codepoint = (state !== 0)
				? (byte & 0x3f) | (codepoint << 6)
				: (0xff >> type) & byte;
			state = nextState;

			if (state !== 0) continue;

			// Add codepoint to buffer (as charcodes for utf-16), and flush buffer to string if needed.
			if (codepoint > 0xFFFF) {
				// High surrogate: 0xD7C0 + (cp >> 10) equals 0xD800 + ((cp - 0x10000) >> 10).
				outBuffer[outIndex++] = (0xD7C0 + (codepoint >> 10));
				if (outIndex === outBufferLength) {
					outString += String.fromCharCode(...outBuffer);
					outIndex = 0;
				}
				// Low surrogate.
				outBuffer[outIndex++] = (0xDC00 | (codepoint & 0x3FF));
				if (outIndex === outBufferLength) {
					outString += String.fromCharCode(...outBuffer);
					outIndex = 0;
				}
			} else {
				outBuffer[outIndex++] = codepoint;
				if (outIndex === outBufferLength) {
					outString += String.fromCharCode(...outBuffer);
					outIndex = 0;
				}
			}
		}

		// Add a replacement character if the data ended in the middle of a sequence.
		// The buffer is flushed whenever it fills up, so there is room for it.
		if (state !== 0) {
			if (fatal) throw new TypeError(`Decoder error. Unexpected end of data.`);
			outBuffer[outIndex++] = 0xFFFD; // Replacement character
		}

		// Final flush of buffer
		outString += String.fromCharCode(...outBuffer.subarray(0, outIndex));

		return outString;
	}