Skip to content

Instantly share code, notes, and snippets.

@imjosh
Created January 28, 2019 07:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save imjosh/666ae47c700307210d1a873f4e61aff6 to your computer and use it in GitHub Desktop.
Save imjosh/666ae47c700307210d1a873f4e61aff6 to your computer and use it in GitHub Desktop.
Messing around with hex and base36 character encodings
/* Messing around with character encodings */
/* notes:
http://www.i18nguy.com/unicode/supplementary-test.html
https://stackoverflow.com/questions/6063148/java-unicode-where-to-find-example-n-byte-unicode-characters
https://stackoverflow.com/a/37954501/2034089
https://unicode-table.com/
*/
// Adapted from https://stackoverflow.com/a/21648161/2034089
/**
 * Encode every UTF-16 code unit of the string as a fixed-width, zero-padded
 * lowercase hexadecimal group.
 *
 * NOTE: despite its name, `bytes` is the number of hex DIGITS written per
 * code unit, not a byte count: 2 digits cover 0-255, 3 cover 0-4095 and
 * 4 cover every possible code unit (0-65535). Characters outside the BMP are
 * two code units (a surrogate pair) and therefore produce two groups.
 *
 * @param {number} [bytes=4] Hex digits per code unit; an integer from 2 to 4.
 * @returns {string} The concatenated groups, `bytes` characters per code unit.
 * @throws {Error} If `bytes` is not an integer in 2..4, or if a code unit does
 *   not fit in `bytes` hex digits.
 */
String.prototype.encodeHex = function (bytes = 4) {
  // Reject fractional widths too: they used to slip past the range checks and
  // produce truncated (lossy) output.
  if (!Number.isInteger(bytes) || bytes < 2 || bytes > 4) {
    throw new Error('Invalid bytes parameter');
  }
  const pad = '0'.repeat(bytes);
  // Largest value representable in `bytes` hex digits (255, 4095 or 65535).
  const maxCharCode = Math.pow(16, bytes) - 1;
  let result = '';
  for (let i = 0; i < this.length; i++) {
    const charCode = this.charCodeAt(i);
    if (charCode > maxCharCode) {
      // The message names the wrapper that matches the requested width.
      throw new Error(`encode_${bytes}byte_UTF8_Hex does not support charCode > ${maxCharCode}`);
    }
    result += (pad + charCode.toString(16)).slice(-bytes);
  }
  return result;
};
/**
 * Decode a string of fixed-width hexadecimal groups back into text.
 *
 * The input is cut into groups of up to `bytes` characters (the last group
 * may be shorter) and each group is parsed as a hex number and turned into
 * one UTF-16 code unit. No validation is done on the group contents.
 *
 * @param {number} [bytes=4] Hex digits per group; must be between 1 and 4.
 * @returns {string} The decoded text ('' for empty input).
 * @throws {Error} If `bytes` is below 1 or above 4.
 */
String.prototype.decodeHex = function (bytes = 4) {
  if (bytes > 4 || bytes < 1) {
    throw new Error('Invalid bytes parameter');
  }
  const groupPattern = new RegExp(`.{1,${bytes}}`, 'g');
  // match() yields null when there is nothing to split.
  const groups = this.match(groupPattern) || [];
  return groups
    .map((group) => String.fromCharCode(parseInt(group, 16)))
    .join('');
};
/**
 * Encode the string using base 36.
 *
 * Two modes, selected by `bytes`:
 *  - `bytes === 0` ("az09" mode): the whole string, which may only contain
 *    lowercase a-z and 0-9, is read as ONE base-36 number and returned as its
 *    decimal representation. Leading '0' characters carry no numeric value and
 *    are therefore not preserved by this mode.
 *  - `bytes` 2..4: every UTF-16 code unit is written as a zero-padded base-36
 *    group of `bytes` DIGITS (despite the parameter name, not bytes). 2 digits
 *    cover 0-1295, 3 cover 0-46655, 4 cover every possible code unit.
 *
 * @param {number} [bytes=4] 0 for az09 mode, or digits per code unit (2..4).
 * @returns {string} The encoded text ('' for empty input).
 * @throws {Error} If `bytes` is invalid, the input contains characters the
 *   mode cannot represent, or (az09 mode) the value exceeds
 *   Number.MAX_SAFE_INTEGER and could not be encoded without loss.
 */
String.prototype.encode36 = function (bytes = 4) {
  if (!Number.isInteger(bytes) || (bytes !== 0 && (bytes < 2 || bytes > 4))) {
    throw new Error('Invalid bytes parameter');
  }
  if (bytes === 0) {
    if (/[^a-z0-9]/.test(this)) {
      throw new Error('encode_az09_Base36 does not support characters other than lowercase a-z and digits 0-9');
    }
    // parseInt('') is NaN; keep empty input empty instead of emitting 'NaN'.
    if (this.length === 0) {
      return '';
    }
    const value = Number.parseInt(String(this), 36);
    // Beyond this point doubles lose integer precision (and eventually print
    // in exponent notation), so the result could never be decoded correctly.
    if (value > Number.MAX_SAFE_INTEGER) {
      throw new Error('encode_az09_Base36 does not support values above Number.MAX_SAFE_INTEGER');
    }
    return value.toString();
  }
  const pad = '0'.repeat(bytes);
  // Largest value representable in `bytes` base-36 digits (1295, 46655, ...).
  const maxCharCode = Math.pow(36, bytes) - 1;
  let result = '';
  for (let i = 0; i < this.length; i++) {
    const charCode = this.charCodeAt(i);
    if (charCode > maxCharCode) {
      throw new Error(`encode_${bytes}byte_UTF8_Base36 does not support charCode > ${maxCharCode}`);
    }
    result += (pad + charCode.toString(36)).slice(-bytes);
  }
  return result;
};
/**
 * Decode text produced by a base-36 encoding.
 *
 * Two modes, selected by `bytes`:
 *  - `bytes === 0` ("az09" mode): the string must be a decimal number; it is
 *    converted to its base-36 representation (lowercase a-z and 0-9).
 *  - `bytes` 2..4: the string is cut into groups of up to `bytes` characters
 *    (the last group may be shorter); each group is parsed as a base-36 number
 *    and turned into one UTF-16 code unit. Group contents are not validated.
 *
 * @param {number} [bytes=4] 0 for az09 mode, or base-36 digits per group (2..4).
 * @returns {string} The decoded text ('' for empty input).
 * @throws {Error} If `bytes` is invalid, or (az09 mode) the input is not a
 *   plain decimal number or exceeds Number.MAX_SAFE_INTEGER.
 */
String.prototype.decode36 = function (bytes = 4) {
  if (!Number.isInteger(bytes) || (bytes !== 0 && (bytes < 2 || bytes > 4))) {
    throw new Error('Invalid bytes parameter');
  }
  if (bytes === 0) {
    const digits = String(this);
    // Keep empty input empty instead of returning the text 'NaN'.
    if (digits === '') {
      return '';
    }
    // Without this check a radix-less parseInt would read '0x10' as hex and
    // turn garbage into the string 'NaN'.
    if (!/^[0-9]+$/.test(digits)) {
      throw new Error('decode_az09_Base36 expects a string of decimal digits');
    }
    const value = Number.parseInt(digits, 10);
    // Larger values cannot be represented exactly, so decoding would be wrong.
    if (value > Number.MAX_SAFE_INTEGER) {
      throw new Error('decode_az09_Base36 does not support values above Number.MAX_SAFE_INTEGER');
    }
    return value.toString(36);
  }
  const re = new RegExp(`.{1,${bytes}}`, 'g');
  // match() yields null when there is nothing to split.
  const groups = this.match(re) || [];
  return groups.reduce((acc, group) => {
    return acc + String.fromCharCode(Number.parseInt(group, 36));
  }, '');
};
// Named convenience wrappers: each one forwards to encodeHex / decodeHex /
// encode36 / decode36 on the same string with a fixed width argument.
// Columns: [wrapper name, method it delegates to, width passed along].
for (const [wrapperName, delegate, width] of [
  ['encode_az09_Base36', 'encode36', 0],
  ['decode_az09_Hex', 'decodeHex', 1],
  ['encode_2byte_UTF8_Hex', 'encodeHex', 2],
  ['decode_2byte_UTF8_Hex', 'decodeHex', 2],
  ['encode_3byte_UTF8_Hex', 'encodeHex', 3],
  ['decode_3byte_UTF8_Hex', 'decodeHex', 3],
  ['encode_4byte_UTF8_Hex', 'encodeHex', 4],
  ['decode_4byte_UTF8_Hex', 'decodeHex', 4],
  ['decode_az09_Base36', 'decode36', 0],
  ['encode_2byte_UTF8_Base36', 'encode36', 2],
  ['decode_2byte_UTF8_Base36', 'decode36', 2],
  ['encode_3byte_UTF8_Base36', 'encode36', 3],
  ['decode_3byte_UTF8_Base36', 'decode36', 3],
  ['encode_4byte_UTF8_Base36', 'encode36', 4],
  ['decode_4byte_UTF8_Base36', 'decode36', 4],
]) {
  // A regular function (not an arrow) so `this` is the string being processed.
  String.prototype[wrapperName] = function () {
    return this[delegate](width);
  };
}
// fixme
// Sample inputs for the round-trip tests, one per character class. The keys
// are looked up by name from the `tests` expectation table.
var testStrings = {
  // Lowercase letters and digits only.
  az09Str: 'az0',
  // Adds an uppercase letter.
  azAZ09Str: 'aZ0',
  // Adds ASCII punctuation.
  utf8_one_byteStr: 'aZ!',
  // Adds a character whose code is above 127 but below 256.
  utf8_two_byteStr: 'aZ¶',
  // Code 4095 is the largest value that fits in three hex digits, so the
  // 3-digit hex encoder accepts it; with 4096 (or '‱') it would throw:
  // utf8_three_byteStr: 'aZ‱',
  // utf8_three_byteStr: 'aZ' + String.fromCharCode(4096),
  utf8_three_byteStr: `aZ${String.fromCharCode(4095)}`,
  // Ends with a character outside the BMP (two UTF-16 code units).
  utf8_four_byteStr: 'aZ𠴕',
};
// Expectation table: tests[testStringKey][encodingName] is 'match' when the
// encode/decode round trip must reproduce the input, or 'throw' when the
// encoder must reject it.
var tests = (function buildExpectations() {
  // Every encoding under test, in the order the results are reported.
  const encodings = [
    'az09_Base36',
    '2byte_UTF8_Hex',
    '2byte_UTF8_Base36',
    '3byte_UTF8_Hex',
    '3byte_UTF8_Base36',
    '4byte_UTF8_Hex',
    '4byte_UTF8_Base36',
  ];
  // For each test string, the encodings expected to throw; all the others
  // are expected to round-trip.
  const rejectedBy = {
    az09Str: [],
    azAZ09Str: ['az09_Base36'],
    utf8_one_byteStr: ['az09_Base36'],
    utf8_two_byteStr: ['az09_Base36'],
    utf8_three_byteStr: ['az09_Base36', '2byte_UTF8_Hex', '2byte_UTF8_Base36'],
    utf8_four_byteStr: [
      'az09_Base36',
      '2byte_UTF8_Hex',
      '2byte_UTF8_Base36',
      '3byte_UTF8_Hex',
      '3byte_UTF8_Base36',
    ],
  };
  const table = {};
  for (const testStringKey of Object.keys(rejectedBy)) {
    const row = {};
    for (const fnName of encodings) {
      row[fnName] = rejectedBy[testStringKey].includes(fnName) ? 'throw' : 'match';
    }
    table[testStringKey] = row;
  }
  return table;
})();
// Run every (test string, encoding) pair from `tests`, record one result row
// per pair and print all rows as a table.
var testResults = [];
for (const testStringKey of Object.keys(tests)) {
  const expectations = tests[testStringKey];
  for (const fnName of Object.keys(expectations)) {
    const should = expectations[fnName];
    const row = { test: `Test ${testStringKey} on ${fnName}` };
    try {
      const outcome = testEncoding(fnName, testStrings[testStringKey]);
      row.match = outcome.matches;
      row.chars = outcome.length;
      if (should === 'match') {
        row.passFail = outcome.matches ? 'PASS' : 'FAIL! - should match';
      } else if (should === 'throw') {
        // Reaching this point means nothing was thrown.
        row.passFail = 'FAIL! - should throw';
      }
    } catch (error) {
      // The "match" column doubles as the place where the error text is shown.
      row.match = `${error.message}`;
      row.passFail = should === 'throw' ? 'PASS' : `FAIL! - should ${should}`;
    }
    testResults.push(row);
  }
}
console.table(testResults);
/**
 * Round-trip `myStr` through the `encode_<encoding>` / `decode_<encoding>`
 * string methods.
 *
 * @param {string} encoding Method-name suffix, e.g. '2byte_UTF8_Hex'.
 * @param {string} myStr Text to encode and decode again.
 * @returns {{length: number, matches: boolean}} Length of the encoded form and
 *   whether decoding it reproduced `myStr` exactly.
 */
function testEncoding(encoding, myStr) {
  const encoderName = `encode_${encoding}`;
  const decoderName = `decode_${encoding}`;
  const encoded = myStr[encoderName]();
  const roundTripped = encoded[decoderName]();
  return {
    length: encoded.length,
    matches: roundTripped === myStr,
  };
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment