Skip to content

Instantly share code, notes, and snippets.

@mildsunrise
Last active July 22, 2024 01:28
Show Gist options
  • Save mildsunrise/c24045a3eb4ec3e15b69af2a837b8392 to your computer and use it in GitHub Desktop.
Save mildsunrise/c24045a3eb4ec3e15b69af2a837b8392 to your computer and use it in GitHub Desktop.
simplified implementation of the (cursed) compression scheme used by the popular lz-string library
from typing import Iterator
def read_int_le(bits: Iterator[int], size: int) -> int:
''' consume a fixed-size LE (LSB-first) integer from 'bits' '''
result = 0
for i in range(size):
result |= next(bits) << i
return result
# unlike the original implementation, this one verifies that all the
# input is consumed. for the original behavior of accepting any input
# even if it has trailing junk, remove the assert for padding below.
def decode_packets(data: bytes):
''' decode the packets from a bitstream given in BE (MSB-first) form '''
bits = ( (x >> i) & 1 for x in data for i in reversed(range(8)) )
max_opcode = 2
while (opcode := read_int_le(bits, max_opcode.bit_length())) != 2:
if opcode < 2:
yield "literal", read_int_le(bits, [8, 16][opcode])
max_opcode += 2
else:
yield "dictionary index", opcode - 3
max_opcode += 1
padding = list(bits)
assert len(padding) < 16 and not any(padding)
def decompress(data: bytes):
''' decompress a buffer of bytes into a stream of UTF-16 code units '''
dictionary = []
last_entry = None
for kind, value in decode_packets(data):
if kind == "literal":
entry = [value]
dictionary.append(entry)
elif value == len(dictionary):
entry = last_entry + entry[:1]
else:
entry = dictionary[value]
yield from entry
if last_entry:
dictionary.append(last_entry + entry[:1])
last_entry = entry
import base64
def decompress_uricomponent(data: str) -> str:
''' emulates decompressFromEncodedURIComponent '''
assert all(x.isalnum() or x in '+-' for x in data)
# the original implementation (old version) contains a bug that causes incomplete
# Base64 to be emitted, so correct for that or b64decode will fail:
data += 'A' * ((-len(data)) % 4)
# it doesn't use the standard urlsafe alphabet, and the custom alphabet
# still keeps the (URL-sensitive) '+', which defeats the purpose :)
data_str = base64.b64decode(data, '+-')
# FIXME: does this work with UTF-16 surrogate pairs?
return ''.join(map(chr, decompress(data_str)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment