Skip to content

Instantly share code, notes, and snippets.

@hugowetterberg
Created November 30, 2015 07:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save hugowetterberg/6b6275feb028bdc26bb6 to your computer and use it in GitHub Desktop.
Save hugowetterberg/6b6275feb028bdc26bb6 to your computer and use it in GitHub Desktop.
A test of utf8-encoding in javascript
// https://github.com/beatgammit/base64-js
var lookup = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'
;(function (exports) {
'use strict'
var Arr = (typeof Uint8Array !== 'undefined')
? Uint8Array
: Array
var PLUS = '+'.charCodeAt(0)
var SLASH = '/'.charCodeAt(0)
var NUMBER = '0'.charCodeAt(0)
var LOWER = 'a'.charCodeAt(0)
var UPPER = 'A'.charCodeAt(0)
var PLUS_URL_SAFE = '-'.charCodeAt(0)
var SLASH_URL_SAFE = '_'.charCodeAt(0)
function decode (elt) {
var code = elt.charCodeAt(0)
if (code === PLUS || code === PLUS_URL_SAFE) return 62 // '+'
if (code === SLASH || code === SLASH_URL_SAFE) return 63 // '/'
if (code < NUMBER) return -1 // no match
if (code < NUMBER + 10) return code - NUMBER + 26 + 26
if (code < UPPER + 26) return code - UPPER
if (code < LOWER + 26) return code - LOWER + 26
}
function b64ToByteArray (b64) {
var i, j, l, tmp, placeHolders, arr
if (b64.length % 4 > 0) {
throw new Error('Invalid string. Length must be a multiple of 4')
}
// the number of equal signs (place holders)
// if there are two placeholders, than the two characters before it
// represent one byte
// if there is only one, then the three characters before it represent 2 bytes
// this is just a cheap hack to not do indexOf twice
var len = b64.length
placeHolders = b64.charAt(len - 2) === '=' ? 2 : b64.charAt(len - 1) === '=' ? 1 : 0
// base64 is 4/3 + up to two characters of the original data
arr = new Arr(b64.length * 3 / 4 - placeHolders)
// if there are placeholders, only get up to the last complete 4 chars
l = placeHolders > 0 ? b64.length - 4 : b64.length
var L = 0
function push (v) {
arr[L++] = v
}
for (i = 0, j = 0; i < l; i += 4, j += 3) {
tmp = (decode(b64.charAt(i)) << 18) | (decode(b64.charAt(i + 1)) << 12) | (decode(b64.charAt(i + 2)) << 6) | decode(b64.charAt(i + 3))
push((tmp & 0xFF0000) >> 16)
push((tmp & 0xFF00) >> 8)
push(tmp & 0xFF)
}
if (placeHolders === 2) {
tmp = (decode(b64.charAt(i)) << 2) | (decode(b64.charAt(i + 1)) >> 4)
push(tmp & 0xFF)
} else if (placeHolders === 1) {
tmp = (decode(b64.charAt(i)) << 10) | (decode(b64.charAt(i + 1)) << 4) | (decode(b64.charAt(i + 2)) >> 2)
push((tmp >> 8) & 0xFF)
push(tmp & 0xFF)
}
return arr
}
function uint8ToBase64 (uint8) {
var i
var extraBytes = uint8.length % 3 // if we have 1 byte left, pad 2 bytes
var output = ''
var temp, length
function encode (num) {
return lookup.charAt(num)
}
function tripletToBase64 (num) {
return encode(num >> 18 & 0x3F) + encode(num >> 12 & 0x3F) + encode(num >> 6 & 0x3F) + encode(num & 0x3F)
}
// go through the array every three bytes, we'll deal with trailing stuff later
for (i = 0, length = uint8.length - extraBytes; i < length; i += 3) {
temp = (uint8[i] << 16) + (uint8[i + 1] << 8) + (uint8[i + 2])
output += tripletToBase64(temp)
}
// pad the end with zeros, but make sure to not forget the extra bytes
switch (extraBytes) {
case 1:
temp = uint8[uint8.length - 1]
output += encode(temp >> 2)
output += encode((temp << 4) & 0x3F)
output += '=='
break
case 2:
temp = (uint8[uint8.length - 2] << 8) + (uint8[uint8.length - 1])
output += encode(temp >> 10)
output += encode((temp >> 4) & 0x3F)
output += encode((temp << 2) & 0x3F)
output += '='
break
default:
break
}
return output
}
exports.toByteArray = b64ToByteArray
exports.fromByteArray = uint8ToBase64
}(typeof exports === 'undefined' ? (this.base64js = {}) : exports))
// Copied from http://xahlee.info/js/js_unicode_code_point.html
// returns a char's Unicode codepoint, of the char at index idx of string str
// 2013-07-16 from https://developer.mozilla.org/en-US/docs/JavaScript/Reference/Global_Objects/String/charCodeAt
function fixedCharCodeAt (str, idx) {
// ex. fixedCharCodeAt ('\uD800\uDC00', 0); // 65536
// ex. fixedCharCodeAt ('\uD800\uDC00', 1); // 65536
idx = idx || 0;
var code = str.charCodeAt(idx);
var hi, low;
if (0xD800 <= code && code <= 0xDBFF) { // High surrogate (could change last hex to 0xDB7F to treat high private surrogates as single characters)
hi = code;
low = str.charCodeAt(idx+1);
if (isNaN(low)) {
throw 'High surrogate not followed by low surrogate in fixedCharCodeAt()';
}
return ((hi - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000;
}
if (0xDC00 <= code && code <= 0xDFFF) { // Low surrogate
// We return false to allow loops to skip this iteration since should have already handled high surrogate above in the previous iteration
return false;
/*hi = str.charCodeAt(idx-1);
low = code;
return ((hi - 0xD800) * 0x400) + (low - 0xDC00) + 0x10000;*/
}
return code;
}
// Google closure, string to byte modified to handle code points up to U+7FFFFFFF (full utf8)
// https://github.com/google/closure-library/blob/28d9db61f5dc639c010be74e4d61682121d2dbd7/closure/goog/crypt/crypt.js#L110
/**
* Converts a JS string to a UTF-8 "byte" array.
* @param {string} str 16-bit unicode string.
* @return {!Array<number>} UTF-8 byte array.
*/
var stringToUtf8ByteArray = function(str) {
// TODO(user): Use native implementations if/when available
var out = [], p = 0;
for (var i = 0; i < str.length; i++) {
var c = fixedCharCodeAt(str, i);
if (c === false) continue;
if (c < 128) {
out[p++] = c;
} else if (c < 2048) {
out[p++] = (c >> 6) | 192;
out[p++] = (c & 63) | 128;
} else if (c < 65536) {
out[p++] = (c >> 12) | 224;
out[p++] = ((c >> 6) & 63) | 128;
out[p++] = (c & 63) | 128;
} else if (c < 2097152) {
out[p++] = (c >> 18) | 240;
out[p++] = ((c >> 12) & 63) | 128;
out[p++] = ((c >> 6) & 63) | 128;
out[p++] = (c & 63) | 128;
} else if (c < 67108864) {
out[p++] = (c >> 24) | 248;
out[p++] = ((c >> 18) & 63) | 128;
out[p++] = ((c >> 12) & 63) | 128;
out[p++] = ((c >> 6) & 63) | 128;
out[p++] = (c & 63) | 128;
} else if (c < 2147483648) {
out[p++] = (c >> 30) | 252;
out[p++] = ((c >> 24) & 63) | 128;
out[p++] = ((c >> 18) & 63) | 128;
out[p++] = ((c >> 12) & 63) | 128;
out[p++] = ((c >> 6) & 63) | 128;
out[p++] = (c & 63) | 128;
}
}
return out;
};
// Using the js-implementation
var utfyPass = "Huöut væ 💥💖 Iñtërnâtiônàlizætiøn";
var data = stringToUtf8ByteArray(utfyPass);
var base = exports.fromByteArray(data);
// Using native buffera
var buf = new Buffer(utfyPass, 'utf8');
console.log(base);
if (buf.toString('base64') === base) {
console.log('Yay, we passed comparison with native base64-encoded utf8 buffer!')
console.log(new Buffer(base, 'base64').toString('utf8'));
} else {
console.log("Noo! Fail!")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment