mildsunrise/lz_string.py

## lz_string.py
from typing import Iterator

def read_int_le(bits: Iterator[int], size: int) -> int:
	''' consume a fixed-size LE (LSB-first) integer from 'bits' '''
	result = 0
	for i in range(size):
		result |= next(bits) << i
	return result

# unlike the original implementation, this one verifies that all the
# input is consumed. for the original behavior of accepting any input
# even if it has trailing junk, remove the assert for padding below.

def decode_packets(data: bytes):
	''' decode the packets from a bitstream given in BE (MSB-first) form '''
	bits = ( (x >> i) & 1 for x in data for i in reversed(range(8)) )
	max_opcode = 2
	while (opcode := read_int_le(bits, max_opcode.bit_length())) != 2:
		if opcode < 2:
			yield "literal", read_int_le(bits, [8, 16][opcode])
			max_opcode += 2
		else:
			yield "dictionary index", opcode - 3
			max_opcode += 1
	padding = list(bits)
	assert len(padding) < 16 and not any(padding)

def decompress(data: bytes):
	''' decompress a buffer of bytes into a stream of UTF-16 code units '''
	dictionary = []
	last_entry = None
	for kind, value in decode_packets(data):
		if kind == "literal":
			entry = [value]
			dictionary.append(entry)
		elif value == len(dictionary):
			entry = last_entry + entry[:1]
		else:
			entry = dictionary[value]
		yield from entry
		if last_entry:
			dictionary.append(last_entry + entry[:1])
		last_entry = entry

import base64

def decompress_uricomponent(data: str) -> str:
	''' emulates decompressFromEncodedURIComponent '''
	assert all(x.isalnum() or x in '+-' for x in data)
	# the original implementation (old version) contains a bug that causes incomplete
	# Base64 to be emitted, so correct for that or b64decode will fail:
	data += 'A' * ((-len(data)) % 4)
	# it doesn't use the standard urlsafe alphabet, and the custom alphabet
	# still keeps the (URL-sensitive) '+', which defeats the purpose :)
	data_str = base64.b64decode(data, '+-')
	# FIXME: does this work with UTF-16 surrogate pairs?
	return ''.join(map(chr, decompress(data_str)))
	from typing import Iterator

	def read_int_le(bits: Iterator[int], size: int) -> int:
	''' consume a fixed-size LE (LSB-first) integer from 'bits' '''
	result = 0
	for i in range(size):
	result \|= next(bits) << i
	return result

	# unlike the original implementation, this one verifies that all the
	# input is consumed. for the original behavior of accepting any input
	# even if it has trailing junk, remove the assert for padding below.

	def decode_packets(data: bytes):
	''' decode the packets from a bitstream given in BE (MSB-first) form '''
	bits = ( (x >> i) & 1 for x in data for i in reversed(range(8)) )
	max_opcode = 2
	while (opcode := read_int_le(bits, max_opcode.bit_length())) != 2:
	if opcode < 2:
	yield "literal", read_int_le(bits, [8, 16][opcode])
	max_opcode += 2
	else:
	yield "dictionary index", opcode - 3
	max_opcode += 1
	padding = list(bits)
	assert len(padding) < 16 and not any(padding)

	def decompress(data: bytes):
	''' decompress a buffer of bytes into a stream of UTF-16 code units '''
	dictionary = []
	last_entry = None
	for kind, value in decode_packets(data):
	if kind == "literal":
	entry = [value]
	dictionary.append(entry)
	elif value == len(dictionary):
	entry = last_entry + entry[:1]
	else:
	entry = dictionary[value]
	yield from entry
	if last_entry:
	dictionary.append(last_entry + entry[:1])
	last_entry = entry

	import base64

	def decompress_uricomponent(data: str) -> str:
	''' emulates decompressFromEncodedURIComponent '''
	assert all(x.isalnum() or x in '+-' for x in data)
	# the original implementation (old version) contains a bug that causes incomplete
	# Base64 to be emitted, so correct for that or b64decode will fail:
	data += 'A' * ((-len(data)) % 4)
	# it doesn't use the standard urlsafe alphabet, and the custom alphabet
	# still keeps the (URL-sensitive) '+', which defeats the purpose :)
	data_str = base64.b64decode(data, '+-')
	# FIXME: does this work with UTF-16 surrogate pairs?
	return ''.join(map(chr, decompress(data_str)))