Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
JS Unicode String to Byte Array
/**
 * Encodes a JavaScript string as UTF-8.
 *
 * @param {string} str - The string to encode.
 * @returns {string[]} One single-character string per UTF-8 byte; the
 *     character's code (`charCodeAt(0)`) is the byte value (0x00 - 0xff).
 */
function stringToByteArray(str) {
  var b = [], i, unicode, low;
  for (i = 0; i < str.length; i++) {
    unicode = str.charCodeAt(i);
    // charCodeAt returns UTF-16 code units, which never exceed 0xffff.
    // A code point above 0xffff is stored as a high surrogate followed by a
    // low surrogate, so merge the pair into one code point here; otherwise
    // the 4-byte branch below could never be reached.
    if (unicode >= 0xd800 && unicode <= 0xdbff && i + 1 < str.length) {
      low = str.charCodeAt(i + 1);
      if (low >= 0xdc00 && low <= 0xdfff) {
        unicode = ((unicode - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
        i++; // the low surrogate has been consumed as part of this code point
      }
    }
    // An unpaired surrogate is left as-is and falls into the 3-byte branch.
    // 0x00000000 - 0x0000007f -> 0xxxxxxx
    if (unicode <= 0x7f) {
      b.push(String.fromCharCode(unicode));
    // 0x00000080 - 0x000007ff -> 110xxxxx 10xxxxxx
    } else if (unicode <= 0x7ff) {
      b.push(String.fromCharCode((unicode >> 6) | 0xc0));
      b.push(String.fromCharCode((unicode & 0x3f) | 0x80));
    // 0x00000800 - 0x0000ffff -> 1110xxxx 10xxxxxx 10xxxxxx
    } else if (unicode <= 0xffff) {
      b.push(String.fromCharCode((unicode >> 12) | 0xe0));
      b.push(String.fromCharCode(((unicode >> 6) & 0x3f) | 0x80));
      b.push(String.fromCharCode((unicode & 0x3f) | 0x80));
    // 0x00010000 - 0x0010ffff -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    } else {
      b.push(String.fromCharCode((unicode >> 18) | 0xf0));
      b.push(String.fromCharCode(((unicode >> 12) & 0x3f) | 0x80));
      b.push(String.fromCharCode(((unicode >> 6) & 0x3f) | 0x80));
      b.push(String.fromCharCode((unicode & 0x3f) | 0x80));
    }
  }
  return b;
}

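The last `else` branch is incorrect: it can never run, because `charCodeAt` returns UTF-16 code units, and characters with code points greater than 0xffff are stored as two code units (a surrogate pair) in JS, each of which is at most 0xffff. https://javascript.ru/String/charCodeAt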

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment