Skip to content

Instantly share code, notes, and snippets.

@jchook

jchook/unicode.js

Last active Jun 20, 2019
Embed
What would you like to do?
JavaScript UTF-8 Helpers
/**
 * Encode a string as UTF-8.
 *
 * The string is walked by code point, so a valid surrogate pair becomes a
 * single 4-byte sequence. An unpaired surrogate has no UTF-8 representation
 * and is encoded as U+FFFD (bytes EF BF BD), the same substitution that
 * TextEncoder makes.
 * @param {string} str
 * @return {Array} of bytes (integers 0-255) in encoding order
 */
export function strToUtf8Bytes(str) {
  const utf8 = [];
  // String iteration yields whole code points; a lone surrogate comes out as
  // a one-unit string whose code point is still in the surrogate range.
  for (const char of str) {
    let codePoint = char.codePointAt(0);
    if (codePoint >= 0xd800 && codePoint <= 0xdfff) {
      // Unpaired surrogate: substitute the replacement character.
      codePoint = 0xfffd;
    }
    if (codePoint < 0x80) {
      utf8.push(codePoint);
    } else if (codePoint < 0x800) {
      utf8.push(0xc0 | (codePoint >> 6), 0x80 | (codePoint & 0x3f));
    } else if (codePoint < 0x10000) {
      utf8.push(
        0xe0 | (codePoint >> 12),
        0x80 | ((codePoint >> 6) & 0x3f),
        0x80 | (codePoint & 0x3f),
      );
    } else {
      // Supplementary plane (U+10000-U+10FFFF): four bytes.
      utf8.push(
        0xf0 | (codePoint >> 18),
        0x80 | ((codePoint >> 12) & 0x3f),
        0x80 | ((codePoint >> 6) & 0x3f),
        0x80 | (codePoint & 0x3f),
      );
    }
  }
  return utf8;
}
/**
 * How many UTF-8 bytes does each UTF-16 code unit of the string account for?
 *
 * A valid surrogate pair encodes to 4 bytes, reported as 2 + 2 across its two
 * code units so the result always lines up index-for-index with the string.
 * An unpaired surrogate is counted as 3 bytes, the size of the U+FFFD
 * replacement character that UTF-8 encoders such as TextEncoder emit for it.
 * @param {string} str
 * @return {Array} of integers, with length == str.length
 */
export function utf8BytesPerChar(str) {
  const widths = [];
  for (let ii = 0; ii < str.length; ii++) {
    const charCode = str.charCodeAt(ii);
    if (charCode < 0x80) {
      widths.push(1);
    } else if (charCode < 0x800) {
      widths.push(2);
    } else if (charCode >= 0xd800 && charCode < 0xdc00) {
      // High surrogate: only a pair if a low surrogate follows.
      // charCodeAt past the end yields NaN, which fails both comparisons.
      const next = str.charCodeAt(ii + 1);
      if (next >= 0xdc00 && next < 0xe000) {
        widths.push(2, 2);
        ii++;
      } else {
        widths.push(3);
      }
    } else {
      // Ordinary BMP character, or a lone low surrogate (counted as U+FFFD).
      widths.push(3);
    }
  }
  return widths;
}
/**
 * Measure a string in UTF-8 bytes instead of UTF-16 code units (handy when
 * exchanging lengths with PHP).
 * @param {string} str
 * @return {number} sum of the per-code-unit widths from utf8BytesPerChar
 */
export function strlen(str) {
  let totalBytes = 0;
  for (const width of utf8BytesPerChar(str)) {
    totalBytes += width;
  }
  return totalBytes;
}
/**
 * Get a substring addressed by UTF-8 byte offsets, i.e. offsets expressed in
 * the same units as PHP's strlen() and substr().
 *
 * Offsets that land inside a multi-byte character are rounded down to the
 * start of that character, so a character (including a surrogate pair) is
 * never split: it is kept when `start` falls inside it and dropped when `end`
 * falls inside it. An unpaired surrogate is counted as 3 bytes, the width of
 * the U+FFFD replacement that UTF-8 encoders emit for it.
 * @param {string} str
 * @param {number} start byte offset of the first character to include;
 *   missing, NaN or negative values mean 0
 * @param {number} end byte offset at which to stop (exclusive); when omitted
 *   (undefined or null) the substring runs to the end of the string
 * @return {string} empty when `end` does not lie beyond `start`, or when
 *   `start` is at or past the byte length of the string
 */
export function substring(str, start, end) {
  const startByte = start > 0 ? start : 0;
  // Explicit nullish check: an `end` of 0 is a real offset, not "no end".
  const endByte = end == null ? Infinity : end;

  // UTF-16 indices of the last character boundaries at or before each offset.
  let realStart = 0;
  let realEnd = 0;
  let utf16Pos = 0;
  let utf8Pos = 0;

  // Iterating the string yields whole code points, so surrogate pairs advance
  // both positions as one unit.
  for (const char of str) {
    const codePoint = char.codePointAt(0);
    let width;
    if (codePoint < 0x80) {
      width = 1;
    } else if (codePoint < 0x800) {
      width = 2;
    } else if (codePoint < 0x10000) {
      width = 3;
    } else {
      width = 4;
    }
    utf16Pos += char.length;
    utf8Pos += width;

    // (utf16Pos, utf8Pos) is now the boundary just after `char`.
    if (utf8Pos > endByte) {
      break;
    }
    realEnd = utf16Pos;
    if (utf8Pos <= startByte) {
      realStart = utf16Pos;
    }
  }
  // realStart is only advanced alongside realEnd, so realStart <= realEnd.
  return str.substring(realStart, realEnd);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.