Skip to content

Instantly share code, notes, and snippets.

@JobLeonard
Last active February 16, 2023 18:46
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save JobLeonard/4bbc5cd5952f25560d59abb2d056f2a0 to your computer and use it in GitHub Desktop.
Save JobLeonard/4bbc5cd5952f25560d59abb2d056f2a0 to your computer and use it in GitHub Desktop.
Uint8Array to LZ-string and back
const LZuint8Array = (function () {
/*
basic ranges of printable UTF16 values (as found in LZ-string):
[32, 127), [160, 55296), [63744, 65536)
We also have filter out string characters like:
" (34)
' (39)
` (44)
(Forward tick is safe: ´ (96))
So:
32, 33, [35, 39), [40, 44), [45, 127), [160, 55296), [63744, 65536)
*/
// integer to unicode:
function itou(i) {
i += 32;
if (i > 33 && i < 39) {
i++;
} else if (i > 38 && i < 44) {
i += 2;
} else if (i > 43 && i < 127) {
i += 3;
} else if (i > 126 && i < 55258) {
i += 37; // === 160 - 128 + 3
} else if (i > 55295) {
i += 8485; // === 63744 - 55296 + 37
}
return String.fromCharCode(i);
}
function utoi(i) {
return i - (i > 63743 ? 8517 :
i > 159 ? 69 :
i > 46 && i < 130 ? 35 :
i > 40 && i < 46 ? 34 :
i > 34 && i < 40 ? 33 :
32);
}
var _node = function(val) { return {v: val, d: {} }; }
return {
itou,
utoi,
compress: function (input) {
if (input === null) return '';
let i = 0,
j = 0,
value = 0,
dictionary = { d: {} },
freshNode = true,
c = 0,
node = _node(2), // first node will always be initialised like this.
nextNode,
enlargeIn = 2,
dictSize = 3,
numBits = 2,
data = [],
data_val = 0,
data_position = 0;
if (input.length) {
// Write length of the input as the first four unicode characters,
// Or 45 bits. 1<<45 bytes is about 35 terabytes, so more than enough.
value = input.length;
data.push(itou(value / 40000000 & 0x7FFF));
data.push(itou((value >>> 15) & 0x7FFF));
data.push(itou(value & 0x7FFF));
// If there is an array, the first byte is guaranteed to
// be new, so we write it to output stream, and add it to the
// dictionary. For the same reason we can initialize freshNode
// as true, and new_node, node and dictSize as if
// it was already added to the dictionary (see above).
c = input[0];
// === Write first byte token to output ==
// insert new byte token into bitstream
for (i = 0; i < numBits; i++) {
// Value for "new token" is 0
data_val <<= 1;
if (++data_position === 15) {
data_position = 0;
data.push(itou(data_val));
data_val = 0;
}
}
// insert byt bits into bitstream
for (i = 0; i < 8; i++) {
// shifting has precedence over bitmasking
data_val = c >> i & 1 | data_val << 1;
if (++data_position === 15) {
data_position = 0;
data.push(itou(data_val));
data_val = 0;
}
}
// Add charCode to the dictionary.
dictionary[c] = node;
for (j = 1; j < input.length; j++) {
c = input[j];
// does the new charCode match an existing prefix?
nextNode = node.d[c];
if (nextNode) {
// continue with next prefix
node = nextNode;
} else {
// Prefix+charCode does not exist in trie yet.
// We write the prefix to the bitstream, and add
// the new charCode to the dictionary if it's new
// Then we set `node` to the root node matching
// the charCode.
if (freshNode) {
// Prefix is a freshly added character token,
// which was already written to the bitstream
freshNode = false;
} else {
// write out the current prefix token
value = node.v;
for (i = 0; i < numBits; i++) {
// shifting has precedence over bitmasking
data_val = value >> i & 1 | data_val << 1;
if (++data_position === 15) {
data_position = 0;
data.push(itou(data_val));
data_val = 0;
}
}
}
// Is the byte a new byte
// that needs to be stored at the root?
if (dictionary[c] === undefined) {
// increase token bitlength if necessary
if (--enlargeIn === 0) {
enlargeIn = 1 << numBits++;
}
// insert new byte token
for (i = 0; i < numBits; i++) {
data_val <<= 1;
if (++data_position === 15) {
data_position = 0;
data.push(itou(data_val));
data_val = 0;
}
}
for (i = 0; i < 8; i++) {
data_val = c >> i & 1 | data_val << 1;
if (++data_position === 15) {
data_position = 0;
data.push(itou(data_val));
data_val = 0;
}
}
dictionary[c] = _node(dictSize++);
// Note of that we already wrote
// the charCode token to the bitstream
freshNode = true;
}
// add node representing prefix + new charCode to trie
node.d[c] = _node(dictSize++);
// increase token bitlength if necessary
if (--enlargeIn === 0) {
enlargeIn = 1 << numBits++;
}
// set node to first charCode of new prefix
node = dictionary[c];
}
}
// === Write last prefix to output ===
if (freshNode) {
// character token already written to output
freshNode = false;
} else {
// write out the prefix token
value = node.v;
for (i = 0; i < numBits; i++) {
// shifting has precedence over bitmasking
data_val = value >> i & 1 | data_val << 1;
if (++data_position === 15) {
data_position = 0;
data.push(itou(data_val));
data_val = 0;
}
}
}
// Is c a new character?
if (dictionary[c] === undefined) {
// increase token bitlength if necessary
if (--enlargeIn === 0) {
enlargeIn = 1 << numBits++;
}
for (i = 0; i < numBits; i++) {
data_val <<= 1;
if (++data_position === 15) {
data_position = 0;
data.push(itou(data_val));
data_val = 0;
}
}
for (i = 0; i < 8; i++) {
data_val = c >> i & 1 | data_val << 1;
if (++data_position === 15) {
data_position = 0;
data.push(itou(data_val));
data_val = 0;
}
}
}
// increase token bitlength if necessary
if (--enlargeIn === 0) {
enlargeIn = 1 << numBits++;
}
}
// Mark the end of the stream
for (i = 0; i < numBits; i++) {
// shifting has precedence over bitmasking
data_val = 1 >> i | data_val << 1;
if (++data_position === 15) {
data_position = 0;
data.push(itou(data_val));
data_val = 0;
}
}
// Flush the last char
data_val <<= 15 - data_position;
data.push(itou(data_val));
data.push(' ');
return data.join('');
},
decompress: function (compressed) {
if (compressed === null || compressed.length < 4) return null;
let length = compressed.length,
getNextValue = function (index) { return utoi(compressed.charCodeAt(index)); };
let dictionary = [0, 1],
enlargeIn = 1,
dictSize = 3,
numBits = 2,
bytes = null,
bytes_concat = null,
result = new Uint8Array(
getNextValue(0) * 0x40000000 +
(getNextValue(1) << 15) +
getNextValue(2)),
result_index = 0,
bits = 0,
maxPower = 2,
power = 0,
data_val = getNextValue(3),
data_position = 15,
data_index = 4;
// Get first token, guaranteed to be either
// a new byte token or end of stream token.
while (power < maxPower) {
// shifting has precedence over bitmasking
bits += (data_val >> --data_position & 1) << power++;
if (data_position === 0) {
data_position = 15;
data_val = getNextValue(data_index++);
}
}
if (bits === 1) {
return null;
}
// else, get byte value
bits = power = 0;
while (power < 8) {
// shifting has precedence over bitmasking
bits += (data_val >> --data_position & 1) << power++;
if (data_position === 0) {
data_position = 15;
data_val = getNextValue(data_index++);
}
}
bytes = [bits];
dictionary[2] = bytes;
result[result_index++] = bits;
// read rest of string
while (data_index <= length) {
// read out next token
maxPower = numBits;
bits = power = 0;
while (power < maxPower) {
// shifting has precedence over bitmasking
bits += (data_val >> --data_position & 1) << power++;
if (data_position === 0) {
data_position = 15;
data_val = getNextValue(data_index++);
}
}
// 0 implies new byte
if (!bits) {
bits = power = 0;
while (power < 8) {
// shifting has precedence over bitmasking
bits += (data_val >> --data_position & 1) << power++;
if (data_position === 0) {
data_position = 15;
data_val = getNextValue(data_index++);
}
}
dictionary[dictSize] = [bits];
bits = dictSize++;
if (--enlargeIn === 0) {
enlargeIn = 1 << numBits++;
}
} else if (bits === 1) {
// end of stream token
return result;
}
if (bits > dictionary.length) {
return null;
}
bytes_concat = bits < dictionary.length ? dictionary[bits] : bytes.concat(bytes[0]);
for (let i = 0; i < bytes_concat.length; i++) {
result[result_index++] = bytes_concat[i];
}
dictionary[dictSize++] = bytes.concat(bytes_concat[0]);
bytes = bytes_concat;
if (--enlargeIn === 0) {
enlargeIn = 1 << numBits++;
}
}
return null;
},
};
})();
function testCompression(LZ) {
console.log('Testing utoi/itou functions');
let utoiMismatches = [];
for (let i = 0; i < 1 << 15; i++) {
let j = LZ.utoi(LZ.itou(i).charCodeAt(0));
if (i !== j) {
utoiMismatches.push({itou: i, utio: j});
}
}
if (utoiMismatches.length) {
console.log("Errors in itou/utoi conversion detected:", utoiMismatches);
} else {
console.log('No errors in itou/utoi conversion detected');
}
let input = new Uint16Array(1 << 15);
for (let i = 0; i < input.length; i++) {
input[i] = i>>4;
}
let inputUint8 = new Uint8Array(input.buffer);
let compressed = LZ.compress(inputUint8);
let decompressed = new Uint16Array(LZ.decompress(compressed).buffer);
let mismatches = [];
for (let i = 0; i < input.length; i++) {
if (input[i] !== decompressed[i]) {
mismatches.push({index: i, input: input[i], decompressed: decompressed[i]});
}
}
console.log({
compressed,
mismatches,
length: compressed.length,
inputLength: input.length,
});
}
@okikio
Copy link

okikio commented Feb 16, 2023

Supposedly ChatGPT 4, is much better at code optimizations

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment