Skip to content

Instantly share code, notes, and snippets.

@iamgreaser
Created January 5, 2018 00:48
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save iamgreaser/c7599de6480396564d22012205f1ad1d to your computer and use it in GitHub Desktop.
lzify
#!/usr/bin/env python3 --
# vim: set ts=8 sw=8 noet :
"""
lzify: pure LZSS pack/depack routines
by GreaseMonkey, 2018 - Public Domain
Usage:
python3 lzify.py infile.tar outfile.tar.lzified
The CLI test program cannot decompress at this stage.
This is only a proof of concept.
Window length: 4MB
Max LZSS block length: 32KB+2B
This does a pretty good job of packing large uncompressed archives.
If you want to squeeze out a bit more, add a Huffman layer to it.
Fun fact:
This is an extension of the compression format used in Gran Turismo 1.
(GT1 supports 258-byte blocks and a 32KB window.)
Due to the approach taken to extend the space,
this cannot decompress GT1 compressed data.
Format:
- Read 1 byte. This is your mask.
- For each bit in the mask, from least significant to most significant:
- If 0, read the next byte as a literal.
- If 1:
- Read the length, it will be 1 to 2 bytes in a packed form as so:
- 0xxxxxxx -> xxxxxxx
- 1xxxxxxx yyyyyyyy -> xxxxxxxyyyyyyyy
- Read the offset, it will be 1 to 3 bytes in a packed form as so:
- 0xxxxxxx -> xxxxxxx
- 1xxxxxxx 0yyyyyyy -> xxxxxxxyyyyyyy
- 1xxxxxxx 1yyyyyyy zzzzzzzz -> xxxxxxxyyyyyyyzzzzzzzz
- Copy length+3 bytes from offset-1 bytes to your output.
For reference, GT1 uses these formats for the length and offset:
- Length:
- xxxxxxxx -> xxxxxxxx
- Offset:
- 0xxxxxxx -> xxxxxxx
- 1xxxxxxx yyyyyyyy -> xxxxxxxyyyyyyyy
"""
MAX_RETURNS = 50
DELTA_CODED = False
import sys, struct
indata = open(sys.argv[1], "rb").read()
def lzify_pack(indata):
repdict = {}
def add_to_repdict(indata, indata_len, inoffs):
tag = indata[inoffs:inoffs+3]
if tag not in repdict:
repdict[tag] = []
repdict[tag].insert(0, inoffs)
while len(repdict[tag]) > MAX_RETURNS:
repdict[tag].pop(MAX_RETURNS)
def find_best_repeat(indata, indata_len, inoffs, tag):
best_blen = -1
best_boffs = repdict[tag][-1]
best_cost = 1000
base_inoffs = inoffs
for lastpos in repdict[tag]:
boffs = base_inoffs - lastpos
assert boffs > 0
if boffs >= 0x400000:
continue
blen = 0
inoffs = base_inoffs + 3
while blen < 0x7FFF and inoffs < indata_len and indata[inoffs] == indata[inoffs-boffs]:
inoffs += 1
blen += 1
cost = 2
if boffs >= 0x80:
cost += 1
if boffs >= 0x4000:
cost += 1
if blen >= 0x80:
cost += 1
cost -= 3+blen
if cost < best_cost:
best_cost = cost
best_blen = blen
best_boffs = boffs
if best_cost >= 0:
return None, None, base_inoffs
for i in range(best_blen+3):
add_to_repdict(indata, indata_len, base_inoffs+i)
assert best_blen >= 0
return best_blen, best_boffs-1, base_inoffs+3+best_blen
inoffs = 0
indata_len = len(indata)
if DELTA_CODED:
print("Delta encoding...")
indata = bytes(((indata[i]-(indata[i-1] if i >= 1 else 0))&0xFF) for i in range(indata_len))
print("Delta done!")
assert indata_len == len(indata)
outdata = bytearray()
bl1 = 0
bl2 = 0
bo1 = 0
bo2 = 0
bo3 = 0
while True:
if inoffs >= indata_len:
break
maskoffs = len(outdata)
mask = 0x00
outdata.append(mask)
for mbit in range(8):
if inoffs >= indata_len:
break
tag = indata[inoffs:inoffs+3]
if tag in repdict:
blen, boffs, inoffs, = find_best_repeat(indata, indata_len, inoffs, tag)
if blen is None:
outdata.append(indata[inoffs])
add_to_repdict(indata, indata_len, inoffs)
inoffs += 1
else:
mask |= (1<<mbit)
if blen < 0x80:
bl1 += 1
outdata += (bytes([
blen]))
else:
bl2 += 1
outdata += (bytes([
((blen>>8)&0x7F)|0x80,
blen&0xFF]))
if boffs < 0x80:
bo1 += 1
outdata += (bytes([
boffs]))
elif boffs < 0x4000:
bo2 += 1
outdata += (bytes([
(boffs>>7)|0x80,
boffs&0x7F]))
else:
bo3 += 1
outdata += (bytes([
(boffs>>15)|0x80,
((boffs>>8)&0x7F)|0x80,
boffs&0xFF]))
#if blen >= 10:
if False:
print(blen, boffs)
else:
outdata.append(indata[inoffs])
add_to_repdict(indata, indata_len, inoffs)
inoffs += 1
outdata[maskoffs] = mask
outdata[maskoffs] = mask
print("Block length histogram: %9d %9d" % (bl1, bl2,))
print("Block offset histogram: %9d %9d %9d" % (bo1, bo2, bo3,))
print("Repdict density: %7.4f%% (%d)" % (
100.0*(len(repdict)/float(1<<24)),
len(repdict),))
return outdata
def lzify_unpack(indata):
inoffs = 0
indata_len = len(indata)
outdata = bytearray()
while True:
if inoffs >= indata_len:
break
mask = indata[inoffs]
inoffs += 1
for mbit in range(8):
#print(hex(mask), mbit)
if (mask & (1<<mbit)) == 0:
if inoffs >= indata_len:
break
outdata.append(indata[inoffs])
inoffs += 1
else:
blen = indata[inoffs]
inoffs += 1
if (blen & 0x80) != 0:
blen &= 0x7F
blen <<= 8
blen |= indata[inoffs]
inoffs += 1
boffs = indata[inoffs]
inoffs += 1
if (boffs & 0x80) != 0:
boffs &= 0x7F
boffs <<= 7
lowoffs = indata[inoffs]
inoffs += 1
boffs |= lowoffs & 0x7F
if (lowoffs & 0x80) != 0:
boffs <<= 8
boffs |= indata[inoffs]
inoffs += 1
l = len(outdata)-boffs-1
#print(l, len(outdata), boffs, blen)
assert l >= 0
for i in range(blen+3):
outdata.append(outdata[l+i])
if DELTA_CODED:
print("Delta decoding...")
for i in range(1, len(outdata), 1):
outdata[i] = (outdata[i] + outdata[i-1]) & 0xFF
print("Delta undone!")
return outdata
if __name__ == "__main__":
outdata = lzify_pack(indata)
print("outdata:", len(outdata))
redata = lzify_unpack(outdata)
print("indata:", len(indata), len(redata))
outfp = open(sys.argv[2], "wb")
outfp.write(outdata)
outfp.close()
assert len(indata) == len(redata)
for i in range(len(indata)):
if indata[i] != redata[i]:
print("MISMATCH @ %08X" % (i,))
print(" ".join("%02X" % (indata[j],) for j in range(i-16,i+16,1)))
print(" ".join("%02X" % (redata[j],) for j in range(i-16,i+16,1)))
assert False
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment