Skip to content

Instantly share code, notes, and snippets.

@drewwiens
Last active July 16, 2021 15:40
Show Gist options
  • Save drewwiens/99e94c7f0eb599807a8c91ad6688b270 to your computer and use it in GitHub Desktop.
Save drewwiens/99e94c7f0eb599807a8c91ad6688b270 to your computer and use it in GitHub Desktop.
JavaScript String Compression of Filenames in a URL
// JavaScript String Compression of Filenames in a URL
//
// Problem: Reduce length of URL caused by storing a list of long filenames in URL query parameter.
//
// Solution: Switch between a 5-bit alphabet of letters and 4-bit alphabet of numbers. Use a special
// character to switch between lowercase and uppercase ("shift"). Represent the resulting bit
// string as a URL-safe base62 encoded string.
//
// Results: From my test set of ~100 strings from real filenames on a project:
// avg compression ratio: 0.87
// max compression ratio: 0.97
// min compression ratio: 0.80
//
// Key points:
// 1. Letters and numbers inside the filenames tend to be grouped together.
// 2. The only symbol chars are ".", "-", " ", and "_".
// 3. Unicode and UTF-8 chars are passed via a special escape character.
// 4. The decode function is TODO.
//
const _ = require("lodash");
const baseX = require("base-x");
/**
* Convert a number to a bit string e.g. "10101...", padding 0's at the front to
* make the resulting bit string length equal to len.
*
* @param {number} num Number to convert into a bit string.
* @param {number} len Desired length of resulting bit string.
*/
function toBitString(num, len) {
const bitString = num.toString(2);
let padByte = len - bitString.length;
return "0".repeat(padByte) + bitString;
}
const letters = _.mapValues(
{
switchToNums: 0,
shift: 1,
".": 2,
"-": 3,
_: 4,
a: 5,
b: 6,
c: 7,
d: 8,
e: 9,
f: 10,
g: 11,
h: 12,
i: 13,
j: 14,
k: 15,
l: 16,
m: 17,
n: 18,
o: 19,
p: 20,
q: 21,
r: 22,
s: 23,
t: 24,
u: 25,
v: 26,
w: 27,
x: 28,
y: 29,
z: 30,
" ": 31,
},
(k) => toBitString(k, 5) // Use 5 bits per "letter"
);
const numbers = _.mapValues(
{
0: 0,
1: 1,
2: 2,
3: 3,
4: 4,
5: 5,
6: 6,
7: 7,
8: 8,
9: 9,
".": 10,
"-": 11,
_: 12,
switchToLetters: 13,
" ": 14,
escapeNextChar: 15, // Allow Unicode & UTF-8 as a fall-back
},
(k) => toBitString(k, 4) // Use 4 bits per "number"
);
const onlyLetters = new Set([
"a",
"b",
"c",
"d",
"e",
"f",
"g",
"h",
"i",
"j",
"k",
"l",
"m",
"n",
"o",
"p",
"q",
"r",
"s",
"t",
"u",
"v",
"w",
"x",
"y",
"z",
]);
const letterChars = new Set(_.keys(letters).filter((l) => l.length === 1));
const numberChars = new Set(_.keys(numbers).filter((n) => n.length === 1));
// The alphabet for base62 as per the docs: https://www.npmjs.com/package/base-x
const base62 = baseX(
"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
);
/**
* Convert a bit string e.g. "10101..." into base62
*
* @param {string} bitString
* @returns base62
*/
function bitStringToBase62(bitString) {
let padByte = 8 - (bitString.length % 8),
encoded = [];
bitString = "0".repeat(padByte) + bitString;
for (let i = bitString.length - 1; i >= 0; i -= 8) {
encoded.unshift(parseInt(bitString.substr(i, 8), 2));
}
const byteArray = new Uint8Array(encoded);
return base62.encode(byteArray);
}
/**
* Convert `input` using custom alphabet and encoding scheme.
*
* @param {string} input
* @returns base62 of the encoded string
*/
function encode(input) {
let output = "";
// Record initial casing:
const firstLetter = input
.split("")
.find((c) => onlyLetters.has(c.toLowerCase()));
let uppercase = !!firstLetter && firstLetter === firstLetter.toUpperCase();
output += uppercase ? "1" : "0";
// Record initial alphabet:
let letter = letterChars.has(input[0].toLowerCase());
output += letter ? "1" : "0";
for (let char of input) {
const charLower = char.toLowerCase();
// Check if char needs to be escaped to Unicode or UTF-8:
if (!letterChars.has(charLower) && !numberChars.has(char)) {
if (letter) {
output += letters.switchToNums;
output += numbers.escapeNextChar;
const charCode = char.charCodeAt(0);
const isUnicode = charCode > 255;
output += isUnicode ? "1" : "0"; // Indicate Unicode (1) or UTF-8 (0)
output += toBitString(char.charCodeAt(0), isUnicode ? 16 : 8);
}
}
// Check if need to switch char set:
if (letter && !letterChars.has(charLower)) {
output += letters.switchToNums;
letter = false;
} else if (!letter && !numberChars.has(char)) {
output += numbers.switchToLetters;
letter = true;
}
// Check if need to switch casing:
if (letter) {
if (!uppercase && char !== char.toLowerCase()) {
output += letters.shift;
uppercase = true;
} else if (uppercase && char !== char.toUpperCase()) {
output += letters.shift;
uppercase = false;
}
}
// Encode the character:
output += (letter ? letters : numbers)[charLower];
}
// Return the encoded string in URL-safe base62:
return bitStringToBase62(output);
}
const testStrs = [ /* Your test strings here */ ]
let avg = 0,
max = 0,
min = Infinity;
for (let testStr of testStrs) {
const encoded = encode(testStr);
console.warn(`"${testStr}"\t\t`, testStr.length);
console.warn(`"${encoded}"\t\t`, encoded.length);
const comprRatio = encoded.length / testStr.length;
max = Math.max(max, comprRatio);
min = Math.min(min, comprRatio);
avg += comprRatio / testStrs.length;
}
console.warn("=================");
console.warn("avg compression ratio:", avg);
console.warn("max compression ratio:", max);
console.warn("min compression ratio:", min);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment