-
-
Save qwerasd205/95843f00fc3898856c46b4e5da3bc2f2 to your computer and use it in GitHub Desktop.
My highly optimized JS utf-8 decoder function
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// This code is released for free, by the author, for use by anyone for any purpose,
// modified in any way, in whole, or in part, with or without this notice included.
//
// Please note:
//
// THIS CODE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
// INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHOR OF THIS CODE
// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
// CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE CODE
// OR THE USE OR OTHER DEALINGS IN THE CODE.
// This function is based on Bjoern Hoehrmann's DFA UTF-8 decoder.
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
//
// Inserting a character into the buffer, and flushing the buffer if necessary,
// could be a function, but I've decided to inline it in all places
// where it occurs, as that yields a performance benefit.
// I also inlined a data table in two places, as that, too, yielded
// a small performance increase.
// Please excuse the ugliness ^-^
/**
 * Decodes a UTF-8 byte sequence into a JavaScript (UTF-16) string using a
 * table-driven DFA (after Bjoern Hoehrmann's UTF-8 decoder).
 *
 * Malformed input is handled in one of two ways:
 * - `fatal === false`: each rejected or interrupted sequence yields a single
 *   U+FFFD replacement character and decoding resumes with the next byte.
 * - `fatal === true`: a `TypeError` is thrown naming the offending byte position,
 *   or reporting an unexpected end of data for a truncated trailing sequence.
 *
 * @param utf8data  Bytes to decode.
 * @param fatal     Throw on malformed input instead of emitting U+FFFD.
 * @param ignoreBOM When `true`, a leading EF BB BF byte-order mark is skipped.
 *                  When `false`, it is decoded like any other character (U+FEFF).
 *                  Note this is the opposite of `TextDecoder`'s `ignoreBOM` option.
 * @returns The decoded string.
 * @throws {TypeError} In fatal mode, when the input is not well-formed UTF-8.
 */
function decode (utf8data: Uint8Array, fatal: boolean = false, ignoreBOM: boolean = false): string {
    // Maps every possible byte value to its DFA character class.
    const byteClass = new Uint8Array([
         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
         7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
         8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
        10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
    ]);
    // Transition table indexed by (state + class). State 0 accepts, state 12 rejects.
    const transitions = new Uint8Array([
         0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
        12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
        12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
        12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
        12,36,12,12,12,12,12,12,12,12,12,12,
    ]);
    const ACCEPT = 0;
    const REJECT = 12;
    const REPLACEMENT = 0xFFFD;

    let outString = "";

    // UTF-16 code units are staged in a fixed buffer so the string is extended in
    // large chunks rather than one character at a time.
    const outBufferLength = Math.min(1024, utf8data.length);
    const outBuffer = new Uint16Array(outBufferLength);
    let outIndex = 0;

    let state = ACCEPT;
    let codepoint = 0;

    let i = (ignoreBOM && utf8data[0] === 0xEF && utf8data[1] === 0xBB && utf8data[2] === 0xBF) ? 3 : 0;

    for (; i < utf8data.length; ++i) {
        const byte = utf8data[i];

        // Recover from a sequence rejected on the previous byte, or from a sequence
        // interrupted by a byte that is not a continuation byte (10xxxxxx).
        // The current byte is then decoded afresh from the accept state.
        if (state === REJECT || (state !== ACCEPT && (byte & 0xC0) !== 0x80)) {
            if (fatal) throw new TypeError(`Decoder error. Invalid byte in sequence at position ${i} in data.`);
            outBuffer[outIndex++] = REPLACEMENT;
            if (outIndex === outBufferLength) {
                outString += String.fromCharCode(...outBuffer);
                outIndex = 0;
            }
            state = ACCEPT;
        }

        const type = byteClass[byte];

        // A lead byte contributes its payload bits (mask depends on its class);
        // a continuation byte appends its low six bits.
        codepoint = (state !== ACCEPT)
            ? (byte & 0x3F) | (codepoint << 6)
            : (0xFF >> type) & byte;

        state = transitions[state + type];

        if (state !== ACCEPT) {
            // In fatal mode report the rejected byte right away, so the position is
            // exact and a bad final byte is not mistaken for truncated input.
            if (fatal && state === REJECT) {
                throw new TypeError(`Decoder error. Invalid byte in sequence at position ${i} in data.`);
            }
            continue;
        }

        // A full code point is available: emit it as one or two UTF-16 code units,
        // flushing the buffer whenever it fills up.
        if (codepoint > 0xFFFF) {
            // 0xD7C0 === 0xD800 - (0x10000 >> 10): high surrogate.
            outBuffer[outIndex++] = 0xD7C0 + (codepoint >> 10);
            if (outIndex === outBufferLength) {
                outString += String.fromCharCode(...outBuffer);
                outIndex = 0;
            }
            outBuffer[outIndex++] = 0xDC00 | (codepoint & 0x3FF);
            if (outIndex === outBufferLength) {
                outString += String.fromCharCode(...outBuffer);
                outIndex = 0;
            }
        } else {
            outBuffer[outIndex++] = codepoint;
            if (outIndex === outBufferLength) {
                outString += String.fromCharCode(...outBuffer);
                outIndex = 0;
            }
        }
    }

    // The input ended inside a sequence (or, in non-fatal mode, on a rejected byte).
    // The buffer always has a free slot here because it is flushed as soon as it fills.
    if (state !== ACCEPT) {
        if (fatal) throw new TypeError(`Decoder error. Unexpected end of data.`);
        outBuffer[outIndex++] = REPLACEMENT;
    }

    // Final flush of whatever is still staged.
    outString += String.fromCharCode(...outBuffer.subarray(0, outIndex));
    return outString;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment