Skip to content

Instantly share code, notes, and snippets.

@alcalyn
Last active August 29, 2015 14:16
Show Gist options
  • Save alcalyn/659382820d1e2cf4cb53 to your computer and use it in GitHub Desktop.
Save alcalyn/659382820d1e2cf4cb53 to your computer and use it in GitHub Desktop.
Utility class for UTF-8 string processing: get the byte length of a string in UTF-8, convert a string to a UTF-8 byte array, and convert a UTF-8 byte array back to a string
/**
 * Utility object for UTF-8 string processing: byte length of a string,
 * string -> UTF-8 bytes, and UTF-8 bytes -> string.
 *
 * JavaScript strings are sequences of UTF-16 code units, so every routine
 * here first reassembles surrogate pairs into full code points. An unpaired
 * surrogate cannot be represented in UTF-8 and is encoded as U+FFFD
 * (REPLACEMENT CHARACTER), the same substitution TextEncoder performs.
 *
 * Written in ES5 style (var / function expressions) to match the original
 * file and keep `UTF8` a plain global binding.
 *
 * @type {{
 *   length: function(string): number,
 *   stringToArray: function(string): Uint8Array,
 *   arrayToString: function((Uint8Array|Array<number>)): string
 * }}
 */
var UTF8 = (function () {
    'use strict';

    /** Code point substituted for unpaired surrogates (U+FFFD). */
    var REPLACEMENT_CODE_POINT = 0xfffd;

    /**
     * Reads the Unicode code point that starts at UTF-16 index `index`.
     * A valid high+low surrogate pair yields a value above 0xFFFF (and
     * occupies two code units); an unpaired surrogate yields U+FFFD.
     *
     * @param {string} str
     * @param {number} index
     *
     * @returns {number}
     */
    function codePointAt(str, index) {
        var unit = str.charCodeAt(index);
        if (unit < 0xd800 || unit > 0xdfff) {
            return unit;
        }
        // Only a high surrogate (0xD800-0xDBFF) followed by a low surrogate
        // (0xDC00-0xDFFF) forms a valid pair.
        if (unit < 0xdc00 && index + 1 < str.length) {
            var low = str.charCodeAt(index + 1);
            if (low >= 0xdc00 && low <= 0xdfff) {
                return 0x10000 + ((unit & 0x3ff) << 10) + (low & 0x3ff);
            }
        }
        return REPLACEMENT_CODE_POINT;
    }

    /**
     * Number of bytes UTF-8 needs for a single code point.
     *
     * @param {number} codePoint
     *
     * @returns {number}
     */
    function byteCount(codePoint) {
        if (codePoint < 0x80) {
            return 1;
        }
        if (codePoint < 0x800) {
            return 2;
        }
        if (codePoint < 0x10000) {
            return 3;
        }
        return 4;
    }

    /**
     * Returns the length in bytes of a string once encoded as UTF-8.
     *
     * @param {string} str
     *
     * @returns {number}
     */
    function utf8Length(str) {
        var total = 0;
        var i = 0;
        while (i < str.length) {
            var codePoint = codePointAt(str, i);
            total += byteCount(codePoint);
            // Code points above 0xFFFF consumed a surrogate pair.
            i += codePoint > 0xffff ? 2 : 1;
        }
        return total;
    }

    return {
        /**
         * Returns the length in bytes of a string once encoded as UTF-8.
         * Characters outside the BMP (surrogate pairs) count as 4 bytes.
         *
         * @param {string} str
         *
         * @returns {number}
         */
        length: utf8Length,

        /**
         * Returns the UTF-8 encoding of a string as an array of bytes.
         * Unpaired surrogates are encoded as U+FFFD (bytes EF BF BD).
         *
         * @param {string} str
         *
         * @returns {Uint8Array}
         */
        stringToArray: function (str) {
            var utf8 = new Uint8Array(utf8Length(str));
            var idx = 0;
            var i = 0;
            while (i < str.length) {
                var codePoint = codePointAt(str, i);
                if (codePoint < 0x80) {
                    utf8[idx++] = codePoint;
                } else if (codePoint < 0x800) {
                    utf8[idx++] = 0xc0 | (codePoint >> 6);
                    utf8[idx++] = 0x80 | (codePoint & 0x3f);
                } else if (codePoint < 0x10000) {
                    utf8[idx++] = 0xe0 | (codePoint >> 12);
                    utf8[idx++] = 0x80 | ((codePoint >> 6) & 0x3f);
                    utf8[idx++] = 0x80 | (codePoint & 0x3f);
                } else {
                    utf8[idx++] = 0xf0 | (codePoint >> 18);
                    utf8[idx++] = 0x80 | ((codePoint >> 12) & 0x3f);
                    utf8[idx++] = 0x80 | ((codePoint >> 6) & 0x3f);
                    utf8[idx++] = 0x80 | (codePoint & 0x3f);
                }
                i += codePoint > 0xffff ? 2 : 1;
            }
            return utf8;
        },

        /**
         * Decodes a sequence of UTF-8 bytes into a string.
         *
         * Every byte is percent-encoded and the result is handed to
         * decodeURIComponent, which performs the UTF-8 decoding. Building
         * the string in a loop (rather than String.fromCharCode.apply)
         * keeps large inputs from exceeding the engine's argument limit.
         *
         * @param {!Uint8Array|Array<number>} bytes values are masked to 0-255
         *
         * @returns {string}
         *
         * @throws {URIError} if the bytes are not valid UTF-8
         */
        arrayToString: function (bytes) {
            var encoded = '';
            for (var i = 0; i < bytes.length; i++) {
                var value = bytes[i] & 0xff;
                encoded += (value < 0x10 ? '%0' : '%') + value.toString(16);
            }
            return decodeURIComponent(encoded);
        }
    };
}());
/*
 * Usage examples. The calls below discard their results; the trailing
 * comments show what each call returns.
 */
UTF8.length('¥ Jul'); // Returns 6: '¥' takes 2 bytes, the other 4 characters take 1 byte each
UTF8.stringToArray('¥ Jul'); // Returns a Uint8Array holding [194, 165, 32, 74, 117, 108]
// Byte by byte: 194 165 = '¥', 32 = ' ', 74 = 'J', 117 = 'u', 108 = 'l'
UTF8.arrayToString([194, 165, 32, 106, 117, 108]); // Returns '¥ jul' (106 is lowercase 'j')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment