Skip to content

Instantly share code, notes, and snippets.

@creationix
Last active November 1, 2019 13:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save creationix/4a4f1ee43dc90d57586ab25e1351a2c3 to your computer and use it in GitHub Desktop.
Save creationix/4a4f1ee43dc90d57586ab25e1351a2c3 to your computer and use it in GitHub Desktop.
Simple conversions between utf8 Uint8Arrays and strings with surrogate pairs.
◢□■□■□■□■□■□◣
🔆 Emojis 📶
◥□■□■□■□■□■□◤
◖■□■□■□■□■□■□■□■□■□■□■□■◗
😃 😄 🤠 😎 👿 💩
🤡 👹 👺 👻 👽 👾 🤖
🌉 🌐 😺 😾 🦊 😀
◖■□■□■□■□■□■□■□■□■□■□■□■◗
╓──────────────╖
║🐝 Deseret 🐝 ║
╒═════════════╩══════════════╩════════════════╕
│ “𐐜 𐐰𐐼𐑂𐐰𐑌𐐻𐐮𐐾𐑆 𐐲𐑂 𐑄𐐮𐑅 𐐰𐑊𐑁𐐰𐐺𐐯𐐻 𐐶𐐮𐑊 𐑅𐐭𐑌 𐐺𐐨 │
│ 𐑉𐐨𐐲𐑊𐐴𐑆𐐼, 𐐯𐑅𐐹𐐯𐑇𐐲𐑊𐑊𐐨 𐐺𐐴 𐑁𐐫𐑉𐐲𐑌𐐲𐑉𐑆… 𐐆𐐻 𐐶𐐮𐑊 𐐫𐑊𐑅𐐬 │
│ 𐐺𐐨 𐑂𐐯𐑉𐐨 𐐰𐐼𐑂𐐲𐑌𐐻𐐩𐐾𐐲𐑅 𐐻𐐭 𐐵𐑉 𐐽𐐮𐑊𐐼𐑉𐐲𐑌. 𐐆𐐻 𐐶𐐮𐑊 𐐺𐐨 │
│ 𐑄 𐑋𐐨𐑌𐑆 𐐲𐑂 𐐮𐑌𐐻𐑉𐐲𐐼𐐭𐑅𐐮𐑍 𐐷𐐭𐑌𐐮𐑁𐐬𐑉𐑋𐐮𐐻𐐨 𐐮𐑌 𐐵𐑉 │
│ 𐐫𐑉𐑃𐐪𐑀𐑉𐐲𐑁𐐨, 𐐰𐑌𐐼 𐑄 𐐷𐐨𐑉𐑆 𐑄𐐰𐐻 𐐪𐑉 𐑌𐐵 𐑉𐐮𐐿𐐶𐐴𐐲𐑉𐐼 𐐻𐐭 │
│ 𐑊𐐲𐑉𐑌 𐐻𐐭 𐑉𐐨𐐼 𐐰𐑌𐐼 𐑅𐐹𐐯𐑊 𐐿𐐰𐑌 𐐺𐐨 𐐼𐐮𐑂𐐬𐐻𐐮𐐼 𐐻𐐭 𐐲𐑄𐐲𐑉 │
│ 𐑅𐐻𐐲𐐼𐐨𐑆.” │
├─────────────┬───────────────────────────────┘
│ —𐐒𐑉𐐮𐑀𐐲𐑋 𐐏𐐲𐑍 │
╘═════════════╛
/**
 * Convert a UTF-8 encoded Uint8Array into a JavaScript string. Code points
 * above U+FFFF come out as surrogate pairs.
 *
 * Each multi-byte sequence is checked for completeness and for well-formed
 * continuation bytes (`10xxxxxx`). Overlong encodings and encoded surrogate
 * code points are deliberately NOT rejected, so 3-byte sequences that encode
 * a lone surrogate still decode to that surrogate.
 *
 * @param bin - The UTF-8 bytes to decode.
 * @returns The decoded string.
 * @throws Error if a lead byte is invalid (0x80-0xBF or 0xF8-0xFF), if the
 *   input ends in the middle of a sequence, or if a continuation byte is
 *   malformed.
 * @throws RangeError (from `String.fromCodePoint`) if a 4-byte sequence
 *   encodes a value above U+10FFFF.
 */
export function utf8Decode(bin: Uint8Array): string {
  // tslint:disable: no-bitwise
  const invalidMessage = "Invalid UTF-8 value found in decoding";
  const length = bin.length;
  let str = "";
  let i = 0;
  while (i < length) {
    const byte = bin[i++];

    // Classify the lead byte: payload bits it carries and how many
    // continuation bytes must follow.
    let codePoint: number;
    let trailing: number;
    if (byte < 0x80) {
      codePoint = byte;
      trailing = 0;
    } else if (byte >= 0xc0 && byte < 0xe0) {
      codePoint = byte & 0x1f;
      trailing = 1;
    } else if (byte >= 0xe0 && byte < 0xf0) {
      codePoint = byte & 0x0f;
      trailing = 2;
    } else if (byte >= 0xf0 && byte < 0xf8) {
      codePoint = byte & 0x07;
      trailing = 3;
    } else {
      // Stray continuation byte (0x80-0xBF) or a byte that is never a lead.
      throw new Error(invalidMessage);
    }

    // Reject truncated input instead of reading past the end, where the
    // missing bytes would otherwise be treated as zero bits.
    if (i + trailing > length) {
      throw new Error(invalidMessage);
    }

    for (; trailing > 0; trailing--) {
      const next = bin[i++];
      // Every continuation byte must have the form 10xxxxxx.
      if ((next & 0xc0) !== 0x80) {
        throw new Error(invalidMessage);
      }
      codePoint = (codePoint << 6) | (next & 0x3f);
    }

    str += String.fromCodePoint(codePoint);
  }
  return str;
}
/**
 * Encode a JavaScript string as UTF-8 bytes.
 *
 * Works in two passes over the string: the first measures how many bytes
 * are required, the second writes them into an exactly-sized buffer.
 * A surrogate pair is read as one code point and emitted as a 4-byte
 * sequence; an unpaired surrogate is emitted as a 3-byte sequence.
 *
 * @param str - The string to encode.
 * @returns A newly allocated Uint8Array holding the encoded bytes.
 */
export function utf8Encode(str: string): Uint8Array {
  const unitCount = str.length;

  // Pass 1: total the encoded size.
  let byteCount = 0;
  let pos = 0;
  while (pos < unitCount) {
    const cp = str.codePointAt(pos) as number;
    if (cp >= 0x10000) {
      // Astral code point: occupies two UTF-16 units, four UTF-8 bytes.
      byteCount += 4;
      pos += 2;
      continue;
    }
    byteCount += cp < 0x80 ? 1 : cp < 0x800 ? 2 : 3;
    pos += 1;
  }

  // Pass 2: write the bytes.
  const out = new Uint8Array(byteCount);
  let at = 0;
  pos = 0;
  while (pos < unitCount) {
    const cp = str.codePointAt(pos) as number;
    // Advance by the number of UTF-16 units this code point consumed.
    pos += cp >= 0x10000 ? 2 : 1;

    if (cp < 0x80) {
      out[at++] = cp;
      continue;
    }
    if (cp < 0x800) {
      out[at++] = 0xc0 | (cp >> 6);
      out[at++] = 0x80 | (cp & 0x3f);
      continue;
    }
    if (cp < 0x10000) {
      out[at++] = 0xe0 | (cp >> 12);
      out[at++] = 0x80 | ((cp >> 6) & 0x3f);
      out[at++] = 0x80 | (cp & 0x3f);
      continue;
    }
    out[at++] = 0xf0 | (cp >> 18);
    out[at++] = 0x80 | ((cp >> 12) & 0x3f);
    out[at++] = 0x80 | ((cp >> 6) & 0x3f);
    out[at++] = 0x80 | (cp & 0x3f);
  }
  return out;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment