Skip to content

Instantly share code, notes, and snippets.

@qwerasd205
Last active February 3, 2022 00:40
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save qwerasd205/95843f00fc3898856c46b4e5da3bc2f2 to your computer and use it in GitHub Desktop.
Save qwerasd205/95843f00fc3898856c46b4e5da3bc2f2 to your computer and use it in GitHub Desktop.
My highly optimized JS utf-8 decoder function
// This code is released for free, by the author, for use by anyone for any purpose,
// modified in any way, in whole, or in part, with or without this notice included.
//
// Please note:
//
// THIS CODE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
// INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHOR OF THIS CODE
// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
// CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE CODE
// OR THE USE OR OTHER DEALINGS IN THE CODE.
// This function is based on Bjoern Hoehrmann's DFA UTF-8 decoder.
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
//
// The two DFA tables live in module-level typed arrays so they are built once,
// and decoded UTF-16 code units are staged in a small buffer that is converted
// to a string in chunks, because per-character string concatenation is slow.

// DFA state meaning "a complete code point has just been decoded".
const UTF8_ACCEPT = 0;
// DFA state meaning "the bytes seen cannot be part of valid UTF-8".
const UTF8_REJECT = 12;
// U+FFFD, emitted in place of malformed input when `fatal` is false.
const REPLACEMENT_CHAR = 0xFFFD;
// Upper bound on how many UTF-16 code units are staged before being flushed.
const FLUSH_CHUNK = 1024;

// Maps every byte value (0x00-0xFF) to its DFA character class (0-11).
const UTF8_BYTE_CLASS = new Uint8Array([
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,  11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
]);

// Next DFA state, indexed by `state + byteClass`. States are multiples of 12,
// so each row of 12 entries below is the transition row of one state.
const UTF8_TRANSITION = new Uint8Array([
     0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, // state  0 (accept / start)
    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // state 12 (reject)
    12,  0, 12, 12, 12, 12, 12,  0, 12,  0, 12, 12, // state 24
    12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12, // state 36
    12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, // state 48
    12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, // state 60
    12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // state 72
    12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // state 84
    12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // state 96
]);

/**
 * Decodes UTF-8 bytes into a JavaScript (UTF-16) string.
 *
 * Malformed input is handled the way the WHATWG UTF-8 decoder (TextDecoder) does:
 * a sequence that is cut short or continued by a byte that is not allowed is
 * replaced by one U+FFFD, and the offending byte is then decoded again as the
 * possible start of a new sequence.
 *
 * @param utf8data  The bytes to decode.
 * @param fatal     When true, malformed input throws instead of producing U+FFFD.
 * @param ignoreBOM When true and the data starts with the bytes EF BB BF, those
 *                  three bytes are skipped. When false they are decoded like any
 *                  other character (U+FEFF). NOTE: this is the reverse of the
 *                  meaning of TextDecoder's `ignoreBOM` option; it is kept as is
 *                  so existing callers see no change.
 * @returns The decoded string.
 * @throws {TypeError} If `fatal` is true and the data is not valid UTF-8.
 */
function decode (utf8data: Uint8Array, fatal: boolean = false, ignoreBOM: boolean = false): string {
    const length = utf8data.length;
    // One iteration writes at most two code units (a surrogate pair, or two
    // replacement characters) and one more may be written after the loop, so two
    // slots of slack on top of the flush threshold make a single bounds check
    // per iteration sufficient.
    const outBuffer = new Uint16Array(Math.min(FLUSH_CHUNK, length) + 2);
    const flushThreshold = outBuffer.length - 2;
    let outString = "";
    let outIndex = 0;
    let state = UTF8_ACCEPT;
    let codepoint = 0;

    let i = (ignoreBOM && utf8data[0] === 0xEF && utf8data[1] === 0xBB && utf8data[2] === 0xBF) ? 3 : 0;
    for (; i < length; ++i) {
        if (outIndex >= flushThreshold) {
            outString += String.fromCharCode(...outBuffer.subarray(0, outIndex));
            outIndex = 0;
        }

        const byte = utf8data[i];
        const byteClass = UTF8_BYTE_CLASS[byte];
        let next = UTF8_TRANSITION[state + byteClass];

        if (next === UTF8_REJECT && state !== UTF8_ACCEPT) {
            // The pending multi-byte sequence cannot be continued by this byte.
            // Replace the truncated sequence, then look at this byte again as
            // if it were the first byte of a new sequence.
            if (fatal) throw new TypeError(`Decoder error. Invalid byte in sequence at position ${i} in data.`);
            outBuffer[outIndex++] = REPLACEMENT_CHAR;
            state = UTF8_ACCEPT;
            next = UTF8_TRANSITION[byteClass];
        }

        if (next === UTF8_REJECT) {
            // This byte can never start a sequence (stray continuation byte,
            // 0xC0, 0xC1 or 0xF5-0xFF). `state` is UTF8_ACCEPT here, so decoding
            // simply resumes with the next byte.
            if (fatal) throw new TypeError(`Decoder error. Invalid byte in sequence at position ${i} in data.`);
            outBuffer[outIndex++] = REPLACEMENT_CHAR;
            continue;
        }

        // A lead byte contributes its payload bits (the class doubles as the
        // number of high bits to mask off); a continuation byte appends six bits.
        codepoint = (state !== UTF8_ACCEPT)
            ? (byte & 0x3F) | (codepoint << 6)
            : (0xFF >> byteClass) & byte;
        state = next;
        if (state !== UTF8_ACCEPT) continue;

        if (codepoint > 0xFFFF) {
            // Encode as a UTF-16 surrogate pair; 0xD7C0 is 0xD800 - (0x10000 >> 10).
            outBuffer[outIndex++] = 0xD7C0 + (codepoint >> 10);
            outBuffer[outIndex++] = 0xDC00 | (codepoint & 0x3FF);
        } else {
            outBuffer[outIndex++] = codepoint;
        }
    }

    // The data ended in the middle of a multi-byte sequence.
    if (state !== UTF8_ACCEPT) {
        if (fatal) throw new TypeError(`Decoder error. Unexpected end of data.`);
        outBuffer[outIndex++] = REPLACEMENT_CHAR;
    }

    // Final flush of whatever is still staged.
    outString += String.fromCharCode(...outBuffer.subarray(0, outIndex));
    return outString;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment