Instantly share code, notes, and snippets.

Embed
What would you like to do?
Utf8 string encode/decode using regular expressions
/**
 * Encodes a JavaScript (UTF-16) string as UTF-8, returning a "byte string" in which every
 * character has a code in the range 0x00 - 0xFF and represents one UTF-8 byte.
 *
 * U+0080 - U+07FF are encoded in 2 chars, U+0800 - U+FFFF in 3 chars, and code points above
 * U+FFFF (stored by JavaScript as a surrogate pair) in 4 chars. ASCII is left untouched.
 * An unpaired surrogate is encoded on its own as a 3-char sequence rather than rejected.
 *
 * Can be achieved in JavaScript by unescape(encodeURIComponent(str)) for well-formed input,
 * but this approach may be useful in other languages.
 *
 * @param {string} unicodeString - Unicode string to be encoded as UTF-8.
 * @returns {string} UTF8-encoded string.
 * @throws {TypeError} If unicodeString is not a string.
 */
function utf8Encode(unicodeString) {
    if (typeof unicodeString !== 'string') throw new TypeError('parameter ‘unicodeString’ is not a string');

    // A single pass is used so that already-encoded output is never re-scanned. The surrogate-pair
    // alternative must come first so that a pair is consumed as one code point.
    return unicodeString.replace(
        /[\ud800-\udbff][\udc00-\udfff]|[\u0080-\uffff]/g,
        (match) => {
            // For a surrogate pair this is the full code point; otherwise the single code unit.
            const cp = match.codePointAt(0);
            if (cp < 0x800) {
                // 2 bytes: 110yyyyy, 10zzzzzz
                return String.fromCharCode(0xc0 | (cp >> 6), 0x80 | (cp & 0x3f));
            }
            if (cp < 0x10000) {
                // 3 bytes: 1110xxxx, 10yyyyyy, 10zzzzzz
                return String.fromCharCode(0xe0 | (cp >> 12), 0x80 | ((cp >> 6) & 0x3f), 0x80 | (cp & 0x3f));
            }
            // 4 bytes: 11110www, 10xxxxxx, 10yyyyyy, 10zzzzzz
            return String.fromCharCode(
                0xf0 | (cp >> 18),
                0x80 | ((cp >> 12) & 0x3f),
                0x80 | ((cp >> 6) & 0x3f),
                0x80 | (cp & 0x3f)
            );
        }
    );
}
/**
 * Decodes a UTF-8 "byte string" (one character per byte, codes 0x00 - 0xFF) back into a
 * JavaScript Unicode string.
 *
 * 2-, 3- and 4-byte sequences are recognised. Anything that does not form a complete sequence is
 * left unchanged, as is a 4-byte sequence whose value would exceed U+10FFFF. Sequences are not
 * otherwise validated (overlong forms and encoded surrogates are decoded as-is, so a surrogate
 * pair encoded as two 3-byte sequences still decodes to the original character).
 *
 * Can be achieved in JavaScript by decodeURIComponent(escape(str)) for well-formed input,
 * but this approach may be useful in other languages.
 *
 * @param {string} utf8String - UTF-8 string to be decoded back to Unicode.
 * @returns {string} Decoded Unicode string.
 * @throws {TypeError} If utf8String is not a string.
 */
function utf8Decode(utf8String) {
    if (typeof utf8String !== 'string') throw new TypeError('parameter ‘utf8String’ is not a string');

    // A single pass (longest sequences tried first) ensures decoded characters are never
    // re-interpreted as the start of another sequence.
    return utf8String.replace(
        /[\u00f0-\u00f4][\u0080-\u00bf]{3}|[\u00e0-\u00ef][\u0080-\u00bf]{2}|[\u00c0-\u00df][\u0080-\u00bf]/g,
        (seq) => {
            // Payload bits in the lead byte: 5 for 2-byte, 4 for 3-byte, 3 for 4-byte sequences,
            // i.e. masks 0x1f, 0x0f and 0x07 respectively.
            const leadMask = 0xff >> (seq.length + 1);
            let cp = seq.charCodeAt(0) & leadMask;
            // Each continuation byte contributes its low 6 bits.
            for (let i = 1; i < seq.length; i++) {
                cp = (cp << 6) | (seq.charCodeAt(i) & 0x3f);
            }
            // Beyond the Unicode range: not decodable, so leave the bytes as they were.
            if (cp > 0x10ffff) return seq;
            return String.fromCodePoint(cp);
        }
    );
}
@natenrb9

This comment has been minimized.

Show comment
Hide comment
@natenrb9

natenrb9 Jan 14, 2016

THANK YOU!! Was looking for a JS solution to decoding UTF-8 special characters for a couple hours...!

natenrb9 commented Jan 14, 2016

THANK YOU!! Was looking for a JS solution to decoding UTF-8 special characters for a couple hours...!

@MarcelloDiSimone

This comment has been minimized.

Show comment
Hide comment
@MarcelloDiSimone

MarcelloDiSimone May 24, 2016

Very nice work, but you should "typecast" the parameters to be strings (I used it to decode URL parameter values, and when a value happened to be an id or similar it was interpreted as an integer and therefore `replace` failed with an unknown function error). You can also chain the replace calls... you can see it in my fork:

https://gist.github.com/MarcelloDiSimone/933a13c6a5b6458ce29d972644bb5892

MarcelloDiSimone commented May 24, 2016

Very nice work, but you should "typecast" the parameters to be strings (I used it to decode URL parameter values, and when a value happened to be an id or similar it was interpreted as an integer and therefore `replace` failed with an unknown function error). You can also chain the replace calls... you can see it in my fork:

https://gist.github.com/MarcelloDiSimone/933a13c6a5b6458ce29d972644bb5892

@chrisveness

This comment has been minimized.

Show comment
Hide comment
@chrisveness

chrisveness Sep 21, 2016

@MarcelloDiSimone, good idea, though probably better to type-check than silently coerce (users can explicitly coerce if they wish). Thx.

Owner

chrisveness commented Sep 21, 2016

@MarcelloDiSimone, good idea, though probably better to type-check than silently coerce (users can explicitly coerce if they wish). Thx.

@pombredanne

This comment has been minimized.

Show comment
Hide comment
@pombredanne

pombredanne Dec 16, 2016

Hi: what would be the license for this code?

pombredanne commented Dec 16, 2016

Hi: what would be the license for this code?

@DhamoR

This comment has been minimized.

Show comment
Hide comment
@DhamoR

DhamoR Jun 6, 2018

Hi, this is a brilliant piece of code. I have a string containing the octal representation of a UTF-8 string, like "\320\223...", that I want to decode to a Unicode string. How do I do that?

DhamoR commented Jun 6, 2018

Hi, this is a brilliant piece of code. I have a string containing the octal representation of a UTF-8 string, like "\320\223...", that I want to decode to a Unicode string. How do I do that?

@TSlivede

This comment has been minimized.

Show comment
Hide comment
@TSlivede

TSlivede Jul 5, 2018

Hi, nice work, but you don't handle Codepoints above 'U+FFFF' correctly, see https://github.com/TSlivede/utf8-regex-encode-decode-js/

@DhamoR If your String only contains ASCII and octal UTF-8 sequences you can do this:

str="\\320\\223...";
Utf8Decode(str.replace(/\\[0-9][0-9][0-9]/g,function(s){return String.fromCharCode(parseInt(s.substr(1),8));}))

If your string can already contain UTF-16 chars you need to first encode those into UTF-8:

str="\\320\\223"+String.fromCodePoint(0x1F60E)+"...";
Utf8Decode(Utf8Encode(str).replace(/\\[0-9][0-9][0-9]/g,function(s){return String.fromCharCode(parseInt(s.substr(1),8));}))

TSlivede commented Jul 5, 2018

Hi, nice work, but you don't handle Codepoints above 'U+FFFF' correctly, see https://github.com/TSlivede/utf8-regex-encode-decode-js/

@DhamoR If your String only contains ASCII and octal UTF-8 sequences you can do this:

str="\\320\\223...";
Utf8Decode(str.replace(/\\[0-9][0-9][0-9]/g,function(s){return String.fromCharCode(parseInt(s.substr(1),8));}))

If your string can already contain UTF-16 chars you need to first encode those into UTF-8:

str="\\320\\223"+String.fromCodePoint(0x1F60E)+"...";
Utf8Decode(Utf8Encode(str).replace(/\\[0-9][0-9][0-9]/g,function(s){return String.fromCharCode(parseInt(s.substr(1),8));}))
@erikaperugachi

This comment has been minimized.

Show comment
Hide comment
@erikaperugachi

erikaperugachi Sep 13, 2018

this does not support emoji :(

erikaperugachi commented Sep 13, 2018

this does not support emoji :(

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment