Last active
February 16, 2023 18:46
-
-
Save JobLeonard/4bbc5cd5952f25560d59abb2d056f2a0 to your computer and use it in GitHub Desktop.
Uint8Array to LZ-string and back
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Compress a Uint8Array into a compact UTF-16 string and back.
 *
 * LZW-style trie compressor (derived from lz-string) specialised for byte
 * input. The bitstream packs 15 bits per output character; itou() remaps
 * each 15-bit value onto printable UTF-16 code points.
 *
 * Note: decompress(compress(new Uint8Array(0))) returns null, not an empty
 * array — callers should special-case empty input if they need symmetry.
 */
const LZuint8Array = (function () {
  /*
  Output characters are drawn from the printable UTF-16 ranges used by
  LZ-string: [32, 127), [160, 55296), [63744, 65536).
  A few code points are additionally skipped so the output is safer to
  embed in string literals. As implemented, the skipped printable codes
  are: " (34), ( (40) and . (46), so the emitted ranges are:
    32, 33, [35, 40), [41, 46), [47, 130), [164, ...)
  */
  // integer in [0, 1 << 15) -> one output character
  function itou(i) {
    i += 32;
    if (i > 33 && i < 39) {
      i++;
    } else if (i > 38 && i < 44) {
      i += 2;
    } else if (i > 43 && i < 127) {
      i += 3;
    } else if (i > 126 && i < 55258) {
      i += 37; // === 160 - 128 + 3
    } else if (i > 55295) {
      i += 8485; // === 63744 - 55296 + 37
    }
    return String.fromCharCode(i);
  }
  // output character code -> original integer (exact inverse of itou)
  function utoi(i) {
    return i - (i > 63743 ? 8517 :
      i > 159 ? 69 :
      i > 46 && i < 130 ? 35 :
      i > 40 && i < 46 ? 34 :
      i > 34 && i < 40 ? 33 :
      32);
  }
  // trie node: v = token value, d = children keyed by next byte
  const _node = function (val) { return { v: val, d: {} }; };
  return {
    itou,
    utoi,
    /**
     * Compress a byte array to a string.
     * @param {Uint8Array|null} input bytes to compress
     * @returns {string} compressed text; '' when input is null
     */
    compress: function (input) {
      if (input === null) return '';
      let i = 0,
        j = 0,
        value = 0,
        dictionary = { d: {} },
        freshNode = true,
        c = 0,
        node = _node(2), // first node will always be initialised like this.
        nextNode,
        enlargeIn = 2,
        dictSize = 3,
        numBits = 2,
        data = [],
        data_val = 0,
        data_position = 0;
      if (input.length) {
        // Write length of the input as the first three unicode characters,
        // or 45 bits. 1<<45 bytes is about 35 terabytes, so more than enough.
        value = input.length;
        // BUGFIX: was `value / 40000000` (decimal), which does not match the
        // `* 0x40000000` used by decompress — lengths >= 4e7 bytes would
        // round-trip to a wrong size. 0x40000000 === 1 << 30.
        data.push(itou(value / 0x40000000 & 0x7FFF));
        data.push(itou((value >>> 15) & 0x7FFF));
        data.push(itou(value & 0x7FFF));
        // If there is an array, the first byte is guaranteed to
        // be new, so we write it to output stream, and add it to the
        // dictionary. For the same reason we can initialize freshNode
        // as true, and new_node, node and dictSize as if
        // it was already added to the dictionary (see above).
        c = input[0];
        // === Write first byte token to output ===
        // insert new byte token into bitstream
        for (i = 0; i < numBits; i++) {
          // Value for "new token" is 0
          data_val <<= 1;
          if (++data_position === 15) {
            data_position = 0;
            data.push(itou(data_val));
            data_val = 0;
          }
        }
        // insert byte bits into bitstream
        for (i = 0; i < 8; i++) {
          // shifting has precedence over bitmasking
          data_val = c >> i & 1 | data_val << 1;
          if (++data_position === 15) {
            data_position = 0;
            data.push(itou(data_val));
            data_val = 0;
          }
        }
        // Add charCode to the dictionary.
        dictionary[c] = node;
        for (j = 1; j < input.length; j++) {
          c = input[j];
          // does the new charCode match an existing prefix?
          nextNode = node.d[c];
          if (nextNode) {
            // continue with next prefix
            node = nextNode;
          } else {
            // Prefix+charCode does not exist in trie yet.
            // We write the prefix to the bitstream, and add
            // the new charCode to the dictionary if it's new
            // Then we set `node` to the root node matching
            // the charCode.
            if (freshNode) {
              // Prefix is a freshly added character token,
              // which was already written to the bitstream
              freshNode = false;
            } else {
              // write out the current prefix token
              value = node.v;
              for (i = 0; i < numBits; i++) {
                // shifting has precedence over bitmasking
                data_val = value >> i & 1 | data_val << 1;
                if (++data_position === 15) {
                  data_position = 0;
                  data.push(itou(data_val));
                  data_val = 0;
                }
              }
            }
            // Is the byte a new byte
            // that needs to be stored at the root?
            if (dictionary[c] === undefined) {
              // increase token bitlength if necessary
              if (--enlargeIn === 0) {
                enlargeIn = 1 << numBits++;
              }
              // insert new byte token
              for (i = 0; i < numBits; i++) {
                data_val <<= 1;
                if (++data_position === 15) {
                  data_position = 0;
                  data.push(itou(data_val));
                  data_val = 0;
                }
              }
              for (i = 0; i < 8; i++) {
                data_val = c >> i & 1 | data_val << 1;
                if (++data_position === 15) {
                  data_position = 0;
                  data.push(itou(data_val));
                  data_val = 0;
                }
              }
              dictionary[c] = _node(dictSize++);
              // Note of that we already wrote
              // the charCode token to the bitstream
              freshNode = true;
            }
            // add node representing prefix + new charCode to trie
            node.d[c] = _node(dictSize++);
            // increase token bitlength if necessary
            if (--enlargeIn === 0) {
              enlargeIn = 1 << numBits++;
            }
            // set node to first charCode of new prefix
            node = dictionary[c];
          }
        }
        // === Write last prefix to output ===
        if (freshNode) {
          // character token already written to output
          freshNode = false;
        } else {
          // write out the prefix token
          value = node.v;
          for (i = 0; i < numBits; i++) {
            // shifting has precedence over bitmasking
            data_val = value >> i & 1 | data_val << 1;
            if (++data_position === 15) {
              data_position = 0;
              data.push(itou(data_val));
              data_val = 0;
            }
          }
        }
        // Is c a new character?
        if (dictionary[c] === undefined) {
          // increase token bitlength if necessary
          if (--enlargeIn === 0) {
            enlargeIn = 1 << numBits++;
          }
          for (i = 0; i < numBits; i++) {
            data_val <<= 1;
            if (++data_position === 15) {
              data_position = 0;
              data.push(itou(data_val));
              data_val = 0;
            }
          }
          for (i = 0; i < 8; i++) {
            data_val = c >> i & 1 | data_val << 1;
            if (++data_position === 15) {
              data_position = 0;
              data.push(itou(data_val));
              data_val = 0;
            }
          }
        }
        // increase token bitlength if necessary
        if (--enlargeIn === 0) {
          enlargeIn = 1 << numBits++;
        }
      }
      // Mark the end of the stream (token value 1, LSB-first)
      for (i = 0; i < numBits; i++) {
        // shifting has precedence over bitmasking
        data_val = 1 >> i | data_val << 1;
        if (++data_position === 15) {
          data_position = 0;
          data.push(itou(data_val));
          data_val = 0;
        }
      }
      // Flush the last char
      data_val <<= 15 - data_position;
      data.push(itou(data_val));
      data.push(' ');
      return data.join('');
    },
    /**
     * Decompress a string produced by compress().
     * @param {string|null} compressed
     * @returns {Uint8Array|null} original bytes, or null on null/short/corrupt input
     */
    decompress: function (compressed) {
      if (compressed === null || compressed.length < 4) return null;
      let length = compressed.length,
        getNextValue = function (index) { return utoi(compressed.charCodeAt(index)); };
      let dictionary = [0, 1],
        enlargeIn = 1,
        dictSize = 3,
        numBits = 2,
        bytes = null,
        bytes_concat = null,
        // Reconstruct output size from the three-character length header.
        result = new Uint8Array(
          getNextValue(0) * 0x40000000 +
          (getNextValue(1) << 15) +
          getNextValue(2)),
        result_index = 0,
        bits = 0,
        maxPower = 2,
        power = 0,
        data_val = getNextValue(3),
        data_position = 15,
        data_index = 4;
      // Get first token, guaranteed to be either
      // a new byte token or end of stream token.
      while (power < maxPower) {
        // shifting has precedence over bitmasking
        bits += (data_val >> --data_position & 1) << power++;
        if (data_position === 0) {
          data_position = 15;
          data_val = getNextValue(data_index++);
        }
      }
      if (bits === 1) {
        return null;
      }
      // else, get byte value
      bits = power = 0;
      while (power < 8) {
        // shifting has precedence over bitmasking
        bits += (data_val >> --data_position & 1) << power++;
        if (data_position === 0) {
          data_position = 15;
          data_val = getNextValue(data_index++);
        }
      }
      bytes = [bits];
      dictionary[2] = bytes;
      result[result_index++] = bits;
      // read rest of string
      while (data_index <= length) {
        // read out next token
        maxPower = numBits;
        bits = power = 0;
        while (power < maxPower) {
          // shifting has precedence over bitmasking
          bits += (data_val >> --data_position & 1) << power++;
          if (data_position === 0) {
            data_position = 15;
            data_val = getNextValue(data_index++);
          }
        }
        // 0 implies new byte
        if (!bits) {
          bits = power = 0;
          while (power < 8) {
            // shifting has precedence over bitmasking
            bits += (data_val >> --data_position & 1) << power++;
            if (data_position === 0) {
              data_position = 15;
              data_val = getNextValue(data_index++);
            }
          }
          dictionary[dictSize] = [bits];
          bits = dictSize++;
          if (--enlargeIn === 0) {
            enlargeIn = 1 << numBits++;
          }
        } else if (bits === 1) {
          // end of stream token
          return result;
        }
        if (bits > dictionary.length) {
          return null;
        }
        // Standard LZW corner case: token one past the dictionary means
        // "previous sequence plus its own first byte".
        bytes_concat = bits < dictionary.length ? dictionary[bits] : bytes.concat(bytes[0]);
        for (let i = 0; i < bytes_concat.length; i++) {
          result[result_index++] = bytes_concat[i];
        }
        dictionary[dictSize++] = bytes.concat(bytes_concat[0]);
        bytes = bytes_concat;
        if (--enlargeIn === 0) {
          enlargeIn = 1 << numBits++;
        }
      }
      // ran out of input without seeing the end-of-stream token
      return null;
    },
  };
})();
/**
 * Self-test for an LZuint8Array-style codec.
 *
 * Checks that itou/utoi are exact inverses over the full 15-bit range,
 * then round-trips 64KiB of patterned data (each uint16 value repeated
 * 16 times, to exercise the dictionary) through compress/decompress.
 * Results are reported via console.log; nothing is returned.
 *
 * @param {{itou: Function, utoi: Function, compress: Function, decompress: Function}} LZ
 *        codec under test
 */
function testCompression(LZ) {
  console.log('Testing utoi/itou functions');
  const utoiMismatches = [];
  for (let i = 0; i < 1 << 15; i++) {
    const j = LZ.utoi(LZ.itou(i).charCodeAt(0));
    if (i !== j) {
      // BUGFIX: the round-tripped value used to be mislabelled `utio`
      utoiMismatches.push({ itou: i, utoi: j });
    }
  }
  if (utoiMismatches.length) {
    console.log("Errors in itou/utoi conversion detected:", utoiMismatches);
  } else {
    console.log('No errors in itou/utoi conversion detected');
  }
  const input = new Uint16Array(1 << 15);
  for (let i = 0; i < input.length; i++) {
    input[i] = i >> 4;
  }
  const inputUint8 = new Uint8Array(input.buffer);
  const compressed = LZ.compress(inputUint8);
  const decompressed = new Uint16Array(LZ.decompress(compressed).buffer);
  const mismatches = [];
  for (let i = 0; i < input.length; i++) {
    if (input[i] !== decompressed[i]) {
      mismatches.push({ index: i, input: input[i], decompressed: decompressed[i] });
    }
  }
  console.log({
    compressed,
    mismatches,
    length: compressed.length,
    inputLength: input.length,
  });
}
It was an experiment more than anything. I started realizing half-way through that it actually wasn't performant or reliable, and the optimizations it suggested were unreliable
I also think some of the optimizations it suggests are plain old, and no longer relevant
Yes, job security guaranteed, you're welcome, LMAO 🤣
Supposedly ChatGPT 4, is much better at code optimizations
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
TL;DR don't trust ChatGPT.
edit: and if you find this from my Mastodon post, don't trust my claims either because they might be misinformed and/or outdated too. Benchmark and test your code!
Ok, so since you did not provide any context I'm going to assume this was not posted as a joke. If it was, too late.
I just wasted half an hour going through this and everything this thing proposed is wrong in one way or the other.
... except that this is not necessarily true when using small integers as keys, because that will result in sparse array-like behavior that is typically faster than a map (yes I benchmarked this edit: when I wrote the code, and in this specific case sparse array-like indexing on objects beat the Map edit: when I wrote the code)
Ok I grant that on modern browsers using indexing is faster, but the explanation as to why is completely wrong. We are always appending data so it's always resizing. The reason
arr[arr.length] = val
is faster is because JS engines have implemented a special optimization for this pattern because too many real-world codebases abused this in the past. That's it. Here is the original snippet:
As you can see its value is not "effectively always 0 or 1", I have no idea where that comes from, and the rewrite introduces an off-by-one error: it resets every 16 increments instead of every 15 increments. We're not using the 16th bit on purpose here.
I have already benchmarked this on multiple browsers and it is slower in this context. Or maybe it changed again. Because guess what: whether
switch
or if/else
is faster changes every couple of years as JS engines change optimization patterns. So the snippet doesn't even relate to the original code any more. On the compression side I'm using small single chars. They vary too much to use memoization on
itou
(funny how that isn't a technique ChatGPT proposes, btw). Sure, I could store the integer values in a large Uint16Array
and convert them to chars in the end. But then I'd either have to append them to a string (slower) or put them in an array and then call .join('')
on that array (also slower because it's literally what we do now except with a Uint16Array
step in between). Plus we don't know ahead of time how big this buffer has to be, so we either allocate too much or have to resize, which is slow or wastes memory. On the decompression side of things the current approach of using plain arrays is faster because allocating typed arrays is absurdly slow and has major memory overhead (something like 100-200 bytes vs 20 for a plain array object), whereas a small integer array is basically as fast as a typed array in most code these days.
Wow, nice Java snippet you got there ChatGPT. Also we are already doing this where it is faster so what are you talking about
NO. For reasons I just explained this is slower, what the hell?
There is no
parseInt
anywhere in the code, nor any regexes, and your snippets are labeled scss
and rust
so what the heck are you smoking ChatGPT?! But hey, thanks for confirming I don't have to worry about my job security just yet.