Last active
November 1, 2019 13:03
-
-
Save creationix/4a4f1ee43dc90d57586ab25e1351a2c3 to your computer and use it in GitHub Desktop.
Simple conversions between utf8 Uint8Arrays and strings with surrogate pairs.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
◢□■□■□■□■□■□◣ | |
🔆 Emojis 📶 | |
◥□■□■□■□■□■□◤ | |
◖■□■□■□■□■□■□■□■□■□■□■□■◗ | |
😃 😄 🤠 😎 👿 💩 | |
🤡 👹 👺 👻 👽 👾 🤖 | |
🌉 🌐 😺 😾 🦊 😀 | |
◖■□■□■□■□■□■□■□■□■□■□■□■◗ | |
╓──────────────╖ | |
║🐝 Deseret 🐝 ║ | |
╒═════════════╩══════════════╩════════════════╕ | |
│ “𐐜 𐐰𐐼𐑂𐐰𐑌𐐻𐐮𐐾𐑆 𐐲𐑂 𐑄𐐮𐑅 𐐰𐑊𐑁𐐰𐐺𐐯𐐻 𐐶𐐮𐑊 𐑅𐐭𐑌 𐐺𐐨 │ | |
│ 𐑉𐐨𐐲𐑊𐐴𐑆𐐼, 𐐯𐑅𐐹𐐯𐑇𐐲𐑊𐑊𐐨 𐐺𐐴 𐑁𐐫𐑉𐐲𐑌𐐲𐑉𐑆… 𐐆𐐻 𐐶𐐮𐑊 𐐫𐑊𐑅𐐬 │ | |
│ 𐐺𐐨 𐑂𐐯𐑉𐐨 𐐰𐐼𐑂𐐲𐑌𐐻𐐩𐐾𐐲𐑅 𐐻𐐭 𐐵𐑉 𐐽𐐮𐑊𐐼𐑉𐐲𐑌. 𐐆𐐻 𐐶𐐮𐑊 𐐺𐐨 │ | |
│ 𐑄 𐑋𐐨𐑌𐑆 𐐲𐑂 𐐮𐑌𐐻𐑉𐐲𐐼𐐭𐑅𐐮𐑍 𐐷𐐭𐑌𐐮𐑁𐐬𐑉𐑋𐐮𐐻𐐨 𐐮𐑌 𐐵𐑉 │ | |
│ 𐐫𐑉𐑃𐐪𐑀𐑉𐐲𐑁𐐨, 𐐰𐑌𐐼 𐑄 𐐷𐐨𐑉𐑆 𐑄𐐰𐐻 𐐪𐑉 𐑌𐐵 𐑉𐐮𐐿𐐶𐐴𐐲𐑉𐐼 𐐻𐐭 │ | |
│ 𐑊𐐲𐑉𐑌 𐐻𐐭 𐑉𐐨𐐼 𐐰𐑌𐐼 𐑅𐐹𐐯𐑊 𐐿𐐰𐑌 𐐺𐐨 𐐼𐐮𐑂𐐬𐐻𐐮𐐼 𐐻𐐭 𐐲𐑄𐐲𐑉 │ | |
│ 𐑅𐐻𐐲𐐼𐐨𐑆.” │ | |
├─────────────┬───────────────────────────────┘ | |
│ —𐐒𐑉𐐮𐑀𐐲𐑋 𐐏𐐲𐑍 │ | |
╘═════════════╛ | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Convert a UTF-8 encoded Uint8Array into a string. Code points above
 * U+FFFF are emitted as surrogate pairs.
 *
 * Each multi-byte sequence is validated: the lead byte must announce a
 * 2-, 3- or 4-byte sequence, every following byte must be a continuation
 * byte (`10xxxxxx`), the sequence must not run past the end of the input,
 * and the decoded value must not be an overlong encoding. Encoded
 * surrogate code points (U+D800..U+DFFF) are passed through, not rejected.
 *
 * @param bin The UTF-8 bytes to decode.
 * @returns The decoded string; an empty input yields `""`.
 * @throws Error if a byte sequence is malformed, truncated or overlong.
 * @throws RangeError (from `String.fromCodePoint`) if a 4-byte sequence
 *   encodes a value above U+10FFFF.
 */
export function utf8Decode(bin: Uint8Array): string {
  // tslint:disable: no-bitwise
  const invalidMessage = "Invalid UTF-8 value found in decoding";
  const length = bin.length;
  let str = "";
  let i = 0;
  while (i < length) {
    const lead = bin[i++];
    if (lead < 0x80) {
      // Single-byte (ASCII) sequence.
      str += String.fromCodePoint(lead);
      continue;
    }

    // Work out how many continuation bytes follow, the payload bits carried
    // by the lead byte, and the smallest code point that legitimately needs
    // a sequence of this length (anything smaller is an overlong encoding).
    let continuationCount: number;
    let codePoint: number;
    let minCodePoint: number;
    if (lead >= 0xc0 && lead < 0xe0) {
      continuationCount = 1;
      codePoint = lead & 0x1f;
      minCodePoint = 0x80;
    } else if (lead >= 0xe0 && lead < 0xf0) {
      continuationCount = 2;
      codePoint = lead & 0xf;
      minCodePoint = 0x800;
    } else if (lead >= 0xf0 && lead < 0xf8) {
      continuationCount = 3;
      codePoint = lead & 0x7;
      minCodePoint = 0x10000;
    } else {
      // A stray continuation byte (0x80..0xbf) or an invalid lead (>= 0xf8).
      throw new Error(invalidMessage);
    }

    const end = i + continuationCount;
    if (end > length) {
      // The sequence is cut off by the end of the input.
      throw new Error(invalidMessage);
    }
    for (; i < end; i++) {
      const next = bin[i];
      if ((next & 0xc0) !== 0x80) {
        throw new Error(invalidMessage);
      }
      codePoint = (codePoint << 6) | (next & 0x3f);
    }
    if (codePoint < minCodePoint) {
      throw new Error(invalidMessage);
    }
    str += String.fromCodePoint(codePoint);
  }
  return str;
}
/**
 * Convert a string (which may contain surrogate pairs) into a UTF-8 encoded
 * Uint8Array.
 *
 * A valid surrogate pair is encoded as one 4-byte sequence. An unpaired
 * surrogate cannot be represented in well-formed UTF-8, so it is encoded as
 * U+FFFD REPLACEMENT CHARACTER (bytes `EF BF BD`).
 *
 * @param str The string to encode.
 * @returns A newly allocated array holding exactly the encoded bytes; an
 *   empty string yields an empty array.
 */
export function utf8Encode(str: string): Uint8Array {
  const length = str.length;

  // Pass 1: measure the output so the buffer can be allocated once.
  // An unpaired surrogate falls in the 3-byte range, the same size as the
  // U+FFFD that replaces it, so no special case is needed here.
  let sizeNeeded = 0;
  for (let i = 0; i < length; i++) {
    // `i < length`, so codePointAt never returns undefined here.
    const codePoint = str.codePointAt(i) as number;
    if (codePoint < 0x80) {
      sizeNeeded += 1;
    } else if (codePoint < 0x800) {
      sizeNeeded += 2;
    } else if (codePoint < 0x10000) {
      sizeNeeded += 3;
    } else {
      // The code point occupied two UTF-16 units; skip the low surrogate.
      i++;
      sizeNeeded += 4;
    }
  }

  // Pass 2: write the bytes.
  const buffer = new Uint8Array(sizeNeeded);
  let offset = 0;
  for (let i = 0; i < length; i++) {
    let codePoint = str.codePointAt(i) as number;
    if (codePoint >= 0xd800 && codePoint <= 0xdfff) {
      // codePointAt only yields a surrogate value when it is unpaired.
      codePoint = 0xfffd;
    }
    if (codePoint < 0x80) {
      buffer[offset++] = codePoint;
    } else if (codePoint < 0x800) {
      buffer[offset++] = 0xc0 | (codePoint >> 6);
      buffer[offset++] = 0x80 | (codePoint & 0x3f);
    } else if (codePoint < 0x10000) {
      buffer[offset++] = 0xe0 | (codePoint >> 12);
      buffer[offset++] = 0x80 | ((codePoint >> 6) & 0x3f);
      buffer[offset++] = 0x80 | (codePoint & 0x3f);
    } else {
      // Skip the low surrogate consumed as part of this code point.
      i++;
      buffer[offset++] = 0xf0 | (codePoint >> 18);
      buffer[offset++] = 0x80 | ((codePoint >> 12) & 0x3f);
      buffer[offset++] = 0x80 | ((codePoint >> 6) & 0x3f);
      buffer[offset++] = 0x80 | (codePoint & 0x3f);
    }
  }
  return buffer;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment