Last active
July 22, 2024 01:28
-
-
Save mildsunrise/c24045a3eb4ec3e15b69af2a837b8392 to your computer and use it in GitHub Desktop.
simplified implementation of the (cursed) compression scheme used by the popular lz-string library
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Iterator | |
def read_int_le(bits: Iterator[int], size: int) -> int: | |
''' consume a fixed-size LE (LSB-first) integer from 'bits' ''' | |
result = 0 | |
for i in range(size): | |
result |= next(bits) << i | |
return result | |
# unlike the original implementation, this one verifies that all the | |
# input is consumed. for the original behavior of accepting any input | |
# even if it has trailing junk, remove the assert for padding below. | |
def decode_packets(data: bytes): | |
''' decode the packets from a bitstream given in BE (MSB-first) form ''' | |
bits = ( (x >> i) & 1 for x in data for i in reversed(range(8)) ) | |
max_opcode = 2 | |
while (opcode := read_int_le(bits, max_opcode.bit_length())) != 2: | |
if opcode < 2: | |
yield "literal", read_int_le(bits, [8, 16][opcode]) | |
max_opcode += 2 | |
else: | |
yield "dictionary index", opcode - 3 | |
max_opcode += 1 | |
padding = list(bits) | |
assert len(padding) < 16 and not any(padding) | |
def decompress(data: bytes): | |
''' decompress a buffer of bytes into a stream of UTF-16 code units ''' | |
dictionary = [] | |
last_entry = None | |
for kind, value in decode_packets(data): | |
if kind == "literal": | |
entry = [value] | |
dictionary.append(entry) | |
elif value == len(dictionary): | |
entry = last_entry + entry[:1] | |
else: | |
entry = dictionary[value] | |
yield from entry | |
if last_entry: | |
dictionary.append(last_entry + entry[:1]) | |
last_entry = entry | |
import base64 | |
def decompress_uricomponent(data: str) -> str: | |
''' emulates decompressFromEncodedURIComponent ''' | |
assert all(x.isalnum() or x in '+-' for x in data) | |
# the original implementation (old version) contains a bug that causes incomplete | |
# Base64 to be emitted, so correct for that or b64decode will fail: | |
data += 'A' * ((-len(data)) % 4) | |
# it doesn't use the standard urlsafe alphabet, and the custom alphabet | |
# still keeps the (URL-sensitive) '+', which defeats the purpose :) | |
data_str = base64.b64decode(data, '+-') | |
# FIXME: does this work with UTF-16 surrogate pairs? | |
return ''.join(map(chr, decompress(data_str))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment