Created
January 5, 2018 00:48
Star
You must be signed in to star a gist
lzify
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 -- | |
# vim: set ts=8 sw=8 noet : | |
""" | |
lzify: pure LZSS pack/depack routines | |
by GreaseMonkey, 2018 - Public Domain | |
Usage: | |
python3 lzify.py infile.tar outfile.tar.lzified | |
The CLI test program cannot decompress at this stage. | |
This is only a proof of concept. | |
Window length: 4MB | |
Max LZSS block length: 32KB+2B | |
This does a pretty good job of packing large uncompressed archives. | |
If you want to squeeze out a bit more, add a Huffman layer to it. | |
Fun fact: | |
This is an extension of the compression format used in Gran Turismo 1. | |
(GT1 supports 258-byte blocks and a 32KB window.) | |
Due to the approach taken to extend the space, | |
this cannot decompress GT1 compressed data. | |
Format: | |
- Read 1 byte. This is your mask. | |
- For each bit in the mask, from least significant to most significant: | |
- If 0, read the next byte as a literal. | |
- If 1: | |
- Read the length, it will be 1 to 2 bytes in a packed form as so: | |
- 0xxxxxxx -> xxxxxxx | |
- 1xxxxxxx yyyyyyyy -> xxxxxxxyyyyyyyy | |
- Read the offset, it will be 1 to 3 bytes in a packed form as so: | |
- 0xxxxxxx -> xxxxxxx | |
- 1xxxxxxx 0yyyyyyy -> xxxxxxxyyyyyyy | |
- 1xxxxxxx 1yyyyyyy zzzzzzzz -> xxxxxxxyyyyyyyzzzzzzzz | |
- Copy length+3 bytes from offset-1 bytes to your output. | |
For reference, GT1 uses these formats for the length and offset: | |
- Length: | |
- xxxxxxxx -> xxxxxxxx | |
- Offset: | |
- 0xxxxxxx -> xxxxxxx | |
- 1xxxxxxx yyyyyyyy -> xxxxxxxyyyyyyyy | |
""" | |
MAX_RETURNS = 50 | |
DELTA_CODED = False | |
import sys, struct | |
indata = open(sys.argv[1], "rb").read() | |
def lzify_pack(indata): | |
repdict = {} | |
def add_to_repdict(indata, indata_len, inoffs): | |
tag = indata[inoffs:inoffs+3] | |
if tag not in repdict: | |
repdict[tag] = [] | |
repdict[tag].insert(0, inoffs) | |
while len(repdict[tag]) > MAX_RETURNS: | |
repdict[tag].pop(MAX_RETURNS) | |
def find_best_repeat(indata, indata_len, inoffs, tag): | |
best_blen = -1 | |
best_boffs = repdict[tag][-1] | |
best_cost = 1000 | |
base_inoffs = inoffs | |
for lastpos in repdict[tag]: | |
boffs = base_inoffs - lastpos | |
assert boffs > 0 | |
if boffs >= 0x400000: | |
continue | |
blen = 0 | |
inoffs = base_inoffs + 3 | |
while blen < 0x7FFF and inoffs < indata_len and indata[inoffs] == indata[inoffs-boffs]: | |
inoffs += 1 | |
blen += 1 | |
cost = 2 | |
if boffs >= 0x80: | |
cost += 1 | |
if boffs >= 0x4000: | |
cost += 1 | |
if blen >= 0x80: | |
cost += 1 | |
cost -= 3+blen | |
if cost < best_cost: | |
best_cost = cost | |
best_blen = blen | |
best_boffs = boffs | |
if best_cost >= 0: | |
return None, None, base_inoffs | |
for i in range(best_blen+3): | |
add_to_repdict(indata, indata_len, base_inoffs+i) | |
assert best_blen >= 0 | |
return best_blen, best_boffs-1, base_inoffs+3+best_blen | |
inoffs = 0 | |
indata_len = len(indata) | |
if DELTA_CODED: | |
print("Delta encoding...") | |
indata = bytes(((indata[i]-(indata[i-1] if i >= 1 else 0))&0xFF) for i in range(indata_len)) | |
print("Delta done!") | |
assert indata_len == len(indata) | |
outdata = bytearray() | |
bl1 = 0 | |
bl2 = 0 | |
bo1 = 0 | |
bo2 = 0 | |
bo3 = 0 | |
while True: | |
if inoffs >= indata_len: | |
break | |
maskoffs = len(outdata) | |
mask = 0x00 | |
outdata.append(mask) | |
for mbit in range(8): | |
if inoffs >= indata_len: | |
break | |
tag = indata[inoffs:inoffs+3] | |
if tag in repdict: | |
blen, boffs, inoffs, = find_best_repeat(indata, indata_len, inoffs, tag) | |
if blen is None: | |
outdata.append(indata[inoffs]) | |
add_to_repdict(indata, indata_len, inoffs) | |
inoffs += 1 | |
else: | |
mask |= (1<<mbit) | |
if blen < 0x80: | |
bl1 += 1 | |
outdata += (bytes([ | |
blen])) | |
else: | |
bl2 += 1 | |
outdata += (bytes([ | |
((blen>>8)&0x7F)|0x80, | |
blen&0xFF])) | |
if boffs < 0x80: | |
bo1 += 1 | |
outdata += (bytes([ | |
boffs])) | |
elif boffs < 0x4000: | |
bo2 += 1 | |
outdata += (bytes([ | |
(boffs>>7)|0x80, | |
boffs&0x7F])) | |
else: | |
bo3 += 1 | |
outdata += (bytes([ | |
(boffs>>15)|0x80, | |
((boffs>>8)&0x7F)|0x80, | |
boffs&0xFF])) | |
#if blen >= 10: | |
if False: | |
print(blen, boffs) | |
else: | |
outdata.append(indata[inoffs]) | |
add_to_repdict(indata, indata_len, inoffs) | |
inoffs += 1 | |
outdata[maskoffs] = mask | |
outdata[maskoffs] = mask | |
print("Block length histogram: %9d %9d" % (bl1, bl2,)) | |
print("Block offset histogram: %9d %9d %9d" % (bo1, bo2, bo3,)) | |
print("Repdict density: %7.4f%% (%d)" % ( | |
100.0*(len(repdict)/float(1<<24)), | |
len(repdict),)) | |
return outdata | |
def lzify_unpack(indata): | |
inoffs = 0 | |
indata_len = len(indata) | |
outdata = bytearray() | |
while True: | |
if inoffs >= indata_len: | |
break | |
mask = indata[inoffs] | |
inoffs += 1 | |
for mbit in range(8): | |
#print(hex(mask), mbit) | |
if (mask & (1<<mbit)) == 0: | |
if inoffs >= indata_len: | |
break | |
outdata.append(indata[inoffs]) | |
inoffs += 1 | |
else: | |
blen = indata[inoffs] | |
inoffs += 1 | |
if (blen & 0x80) != 0: | |
blen &= 0x7F | |
blen <<= 8 | |
blen |= indata[inoffs] | |
inoffs += 1 | |
boffs = indata[inoffs] | |
inoffs += 1 | |
if (boffs & 0x80) != 0: | |
boffs &= 0x7F | |
boffs <<= 7 | |
lowoffs = indata[inoffs] | |
inoffs += 1 | |
boffs |= lowoffs & 0x7F | |
if (lowoffs & 0x80) != 0: | |
boffs <<= 8 | |
boffs |= indata[inoffs] | |
inoffs += 1 | |
l = len(outdata)-boffs-1 | |
#print(l, len(outdata), boffs, blen) | |
assert l >= 0 | |
for i in range(blen+3): | |
outdata.append(outdata[l+i]) | |
if DELTA_CODED: | |
print("Delta decoding...") | |
for i in range(1, len(outdata), 1): | |
outdata[i] = (outdata[i] + outdata[i-1]) & 0xFF | |
print("Delta undone!") | |
return outdata | |
if __name__ == "__main__": | |
outdata = lzify_pack(indata) | |
print("outdata:", len(outdata)) | |
redata = lzify_unpack(outdata) | |
print("indata:", len(indata), len(redata)) | |
outfp = open(sys.argv[2], "wb") | |
outfp.write(outdata) | |
outfp.close() | |
assert len(indata) == len(redata) | |
for i in range(len(indata)): | |
if indata[i] != redata[i]: | |
print("MISMATCH @ %08X" % (i,)) | |
print(" ".join("%02X" % (indata[j],) for j in range(i-16,i+16,1))) | |
print(" ".join("%02X" % (redata[j],) for j in range(i-16,i+16,1))) | |
assert False |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment