Skip to content

Instantly share code, notes, and snippets.

@yckart
Last active December 28, 2022 23:56
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yckart/4951636 to your computer and use it in GitHub Desktop.
Save yckart/4951636 to your computer and use it in GitHub Desktop.
Encode/Decode UTF8 with javascript. more at: https://gist.github.com/avenauche/4585741
// http://kenany.me/blog/base64-js/
var utf8 = {
encode: function(str) { return unescape( encodeURIComponent(str) ); },
decode: function(str) { return decodeURIComponent( escape(str) ); }
};
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Utf8 class: encode / decode between multi-byte Unicode characters and UTF-8 multiple */
/* single-byte character encoding (c) Chris Veness 2002-2011 */
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* changelog
* 2013-02-14: changed namespace to object literal
* 2010-09-29: fixed decode order of operation to avoid falsely recognising decoded string as 3-byte
* utf-8 charaacter
*/
var UTF8 = {
/**
* Encode multi-byte Unicode string into utf-8 multiple single-byte characters
* (BMP / basic multilingual plane only)
*
* Chars in range U+0080 - U+07FF are encoded in 2 chars, U+0800 - U+FFFF in 3 chars
*
* @param {String} strUni Unicode string to be encoded as UTF-8
* @returns {String} encoded string
*/
encode: function (strUni) {
// use regular expressions & String.replace callback function for better efficiency
// than procedural approaches
var strUtf = strUni.replace(
/[\u0080-\u07ff]/g, // U+0080 - U+07FF => 2 bytes 110yyyyy, 10zzzzzz
function (c) {
var cc = c.charCodeAt(0);
return String.fromCharCode(0xc0 | cc >> 6, 0x80 | cc & 0x3f);
});
strUtf = strUtf.replace(
/[\u0800-\uffff]/g, // U+0800 - U+FFFF => 3 bytes 1110xxxx, 10yyyyyy, 10zzzzzz
function (c) {
var cc = c.charCodeAt(0);
return String.fromCharCode(0xe0 | cc >> 12, 0x80 | cc >> 6 & 0x3F, 0x80 | cc & 0x3f);
});
return strUtf;
},
/**
* Decode utf-8 encoded string back into multi-byte Unicode characters
*
* @param {String} strUtf UTF-8 string to be decoded back to Unicode
* @returns {String} decoded string
*/
decode: function (strUtf) {
// note: decode 3-byte chars first as decoded 2-byte strings could appear to be 3-byte char!
var strUni = strUtf.replace(
/[\u00e0-\u00ef][\u0080-\u00bf][\u0080-\u00bf]/g, // 3-byte chars
function (c) { // (note parentheses for precence)
var cc = ((c.charCodeAt(0) & 0x0f) << 12) | ((c.charCodeAt(1) & 0x3f) << 6) | (c.charCodeAt(2) & 0x3f);
return String.fromCharCode(cc);
});
strUni = strUni.replace(
/[\u00c0-\u00df][\u0080-\u00bf]/g, // 2-byte chars
function (c) { // (note parentheses for precence)
var cc = (c.charCodeAt(0) & 0x1f) << 6 | c.charCodeAt(1) & 0x3f;
return String.fromCharCode(cc);
});
return strUni;
}
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment