Skip to content

Instantly share code, notes, and snippets.

@bathos
Last active October 18, 2017 03:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bathos/be27895c4bd8a51ed1bf3c580cd9ca6a to your computer and use it in GitHub Desktop.
Save bathos/be27895c4bd8a51ed1bf3c580cd9ca6a to your computer and use it in GitHub Desktop.
quik-utf8-to-cps-sync.js
// Deliberately permissive (but fast) conversion from utf8 buffer to codepoint
// array. Invalid multibyte codepoint sequences become replacement character and
// orphaned surrogates are expressly permitted.
//
// The speed is almost identical to the built-in TextDecoder; however, it yields
// an array of codepoints, rather than a string. When compared as ways to get
// codepoints (e.g. by following TextDecoder.end with map), this runs fourteen
// times faster (not because TextDecoder is slow, but because x.from(str, map)
// is — not sure what accounts for such a big difference, though it only reaches
// those extremes on large files).
//
// I wrote a general lib for this for _streams_ (codpoint), but in this case a
// stream would be inefficient; we have the buffers we need to operate on in
// memory already. Oh well, nothing is generic when you want speed. (Also in this
// case I wanted FFFD replacement behavior rather than errors; codpoint is
// strict).
//
// V8 loves switch blocks. It likes them way more than it likes math, even
// bitwise.
// Reads one UTF-8 continuation byte. Bytes in 0x80–0xBF (bit pattern
// 0b10xxxxxx) yield their six payload bits (0–0x3F); any other value yields
// the sentinel 0x40, which is one past the largest possible payload and so can
// never be mistaken for real data.
const cont = byte => {
  // The integer guard keeps non-integer and non-numeric values on the
  // sentinel path instead of letting them be coerced into the range.
  const isContinuation =
    Number.isInteger(byte) && byte >= 0x80 && byte <= 0xBF;

  return isContinuation ? byte & 0b111111 : 0x40;
};
module.exports = buf => {
const target = new Uint32Array(buf.length);
let i = 0, j = 0, l = buf.length;
while (i < l) {
const byte = buf[i++];
switch (byte) {
case 0x00: case 0x01: case 0x02: case 0x03: case 0x04: case 0x05:
case 0x06: case 0x07: case 0x08: case 0x09: case 0x0A: case 0x0B:
case 0x0C: case 0x0D: case 0x0E: case 0x0F: case 0x10: case 0x11:
case 0x12: case 0x13: case 0x14: case 0x15: case 0x16: case 0x17:
case 0x18: case 0x19: case 0x1A: case 0x1B: case 0x1C: case 0x1D:
case 0x1E: case 0x1F: case 0x20: case 0x21: case 0x22: case 0x23:
case 0x24: case 0x25: case 0x26: case 0x27: case 0x28: case 0x29:
case 0x2A: case 0x2B: case 0x2C: case 0x2D: case 0x2E: case 0x2F:
case 0x30: case 0x31: case 0x32: case 0x33: case 0x34: case 0x35:
case 0x36: case 0x37: case 0x38: case 0x39: case 0x3A: case 0x3B:
case 0x3C: case 0x3D: case 0x3E: case 0x3F: case 0x40: case 0x41:
case 0x42: case 0x43: case 0x44: case 0x45: case 0x46: case 0x47:
case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C: case 0x4D:
case 0x4E: case 0x4F: case 0x50: case 0x51: case 0x52: case 0x53:
case 0x54: case 0x55: case 0x56: case 0x57: case 0x58: case 0x59:
case 0x5A: case 0x5B: case 0x5C: case 0x5D: case 0x5E: case 0x5F:
case 0x60: case 0x61: case 0x62: case 0x63: case 0x64: case 0x65:
case 0x66: case 0x67: case 0x68: case 0x69: case 0x6A: case 0x6B:
case 0x6C: case 0x6D: case 0x6E: case 0x6F: case 0x70: case 0x71:
case 0x72: case 0x73: case 0x74: case 0x75: case 0x76: case 0x77:
case 0x78: case 0x79: case 0x7A: case 0x7B: case 0x7C: case 0x7D:
case 0x7E: case 0x7F:
target[j++] = byte;
continue;
case 0xC2: case 0xC3: case 0xC4: case 0xC5: case 0xC6: case 0xC7:
case 0xC8: case 0xC9: case 0xCA: case 0xCB: case 0xCC: case 0xCD:
case 0xCE: case 0xCF: case 0xD0: case 0xD1: case 0xD2: case 0xD3:
case 0xD4: case 0xD5: case 0xD6: case 0xD7: case 0xD8: case 0xD9:
case 0xDA: case 0xDB: case 0xDC: case 0xDD: case 0xDE: case 0xDF:
if (i < l) {
const c1 = cont(buf[i]);
if (c1 === 0x40) break;
i++;
const cp = (byte & 0b11111) << 6 | c1;
if (cp < 0x80) break;
target[j++] = cp;
continue;
}
break;
case 0xE0: case 0xE1: case 0xE2: case 0xE3: case 0xE4: case 0xE5:
case 0xE6: case 0xE7: case 0xE8: case 0xE9: case 0xEA: case 0xEB:
case 0xEC: case 0xED: case 0xEE: case 0xEF:
if (i < l) {
const c1 = cont(buf[i]);
if (c1 === 0x40) break;
if (++i < l) {
const c2 = cont(buf[i]);
if (c2 === 0x40) break;
i++;
const cp = (byte & 0b1111) << 12 | c1 << 6 | c2;
if (cp < 0x800) break;
target[j++] = cp;
continue;
}
}
break;
case 0xF0: case 0xF1: case 0xF2: case 0xF3: case 0xF4:
if (i < l) {
const c1 = cont(buf[i]);
if (c1 === 0x40) break;
if (++i < l) {
const c2 = cont(buf[i]);
if (c2 === 0x40) break;
if (++i < l) {
const c3 = cont(buf[i]);
if (c3 === 0x40) break;
i++;
const cp = (byte & 0b111) << 18 | c1 << 12 | c2 << 6 | c3;
if (cp < 0x10000 || cp > 0x10FFFF) break;
target[j++] = cp;
continue;
}
}
}
break;
}
target[j++] = 0xFFFD;
}
return target.subarray(Number(target[0] === 0xFEFF), j);
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment