Skip to content

Instantly share code, notes, and snippets.

@graeme-winter
Created April 3, 2024 05:37
Show Gist options
  • Save graeme-winter/e40534f137189b777700b2006d5eeab0 to your computer and use it in GitHub Desktop.
Save graeme-winter/e40534f137189b777700b2006d5eeab0 to your computer and use it in GitHub Desktop.
Manually unpack bitshuffle / lz4 data in Python
# Proof that this works: compute the sha1 hash of the manually decompressed
# and unshuffled data, then compare it with the sha1 hash of the same chunk
# as extracted by h5py
import binascii
import hashlib
import struct
import sys
import lz4.block
import bitshuffle
import hdf5plugin
import numpy
import h5py
import tqdm
# Raw location of every chunk, keyed by the chunk's offset in the dataset:
#   chunk_offset -> (byte offset within the file, stored size in bytes)
chunks = {}
# Reference hashes, keyed the same way:
#   chunk_offset -> hex sha1 of the chunk's data as read through h5py
hashes = {}
def bits(data, block_size=8192, dtype=numpy.uint16):
    """Decode a run of length-prefixed LZ4 records and hash the result.

    ``data`` is walked as a series of records, each made of a 4-byte
    big-endian length followed by that many bytes of LZ4 block data.  Every
    record is decompressed, viewed as ``dtype`` elements, bit-unshuffled and
    fed into one running SHA-1.

    Args:
        data: bytes-like buffer holding the concatenated records.
        block_size: upper bound, in bytes, on the decompressed size of one
            record; it is also handed to ``bitshuffle.bitunshuffle`` as its
            block size.  Defaults to 8192, the value that used to be
            hard-coded here.
        dtype: numpy dtype used to view the decompressed bytes before
            unshuffling.  Defaults to ``numpy.uint16``, as before.

    Returns:
        A ``(n, digest)`` tuple: the number of records found and the
        hexadecimal SHA-1 of all unshuffled bytes, in record order.

    Raises:
        ValueError: if a length prefix points past the end of ``data``.
        struct.error: if fewer than four bytes remain for a length prefix.
    """
    # NOTE(review): every byte of ``data`` is interpreted as part of a
    # length-prefixed record; trailing bytes stored in any other form would
    # be misread as a record header -- confirm against the bitshuffle format.
    total = len(data)
    offset = 0
    count = 0
    sha = hashlib.sha1()
    while offset < total:
        (compressed_size,) = struct.unpack_from(">I", data, offset)
        start = offset + 4
        end = start + compressed_size
        if end > total:
            # Slicing would silently hand lz4 a shortened payload; fail
            # with a clear message instead.
            raise ValueError(
                f"truncated record at offset {offset}: needs "
                f"{compressed_size} bytes, only {total - start} available"
            )
        block = numpy.frombuffer(
            lz4.block.decompress(data[start:end], uncompressed_size=block_size),
            dtype=dtype,
        )
        sha.update(bitshuffle.bitunshuffle(block, block_size).tobytes())
        offset = end
        count += 1
    return count, sha.hexdigest()
# First pass: use h5py to locate every chunk of the "data" dataset and to
# compute the reference sha1 of each chunk's decoded contents.
# The file is opened explicitly read-only: this script only inspects it, and
# the default mode is not the same across h5py releases.
with h5py.File(sys.argv[1], "r") as f:
    d = f["data"]
    n = d.id.get_num_chunks()
    for i in tqdm.tqdm(range(n)):
        chunk_info = d.id.get_chunk_info(i)
        offset = chunk_info.chunk_offset
        chunks[offset] = (chunk_info.byte_offset, chunk_info.size)
        # NOTE(review): this indexing assumes a 4-D dataset in which each
        # chunk covers exactly one index along the first two axes and the
        # full extent of the last two -- confirm for other files.
        frame = d[offset[0], offset[1], :, :]
        hashes[offset] = hashlib.sha1(frame.tobytes()).hexdigest()
# Second pass: read each chunk's raw bytes straight from the file, decode
# them by hand with bits() and compare against the h5py reference hash.
# The checks raise AssertionError explicitly rather than using ``assert`` so
# that they still run under ``python -O``.
with open(sys.argv[1], "rb") as f:
    for i in tqdm.tqdm(sorted(chunks)):
        byte_offset, stored_size = chunks[i]
        f.seek(byte_offset)
        # 12-byte header: big-endian uint64 followed by big-endian uint32.
        header = f.read(12)
        payload = f.read(stored_size - 12)
        s = struct.unpack(">QI", header)
        if s != (1052672, 8192):
            raise AssertionError(f"chunk {i}: unexpected header {s}")
        n_blocks, digest = bits(payload)
        # For the header values checked above the last block is partial,
        # hence the extra one on top of the whole blocks.
        expected_blocks = 1 + (s[0] // s[1])
        if n_blocks != expected_blocks:
            raise AssertionError(
                f"chunk {i}: found {n_blocks} blocks, expected {expected_blocks}"
            )
        if digest != hashes[i]:
            raise AssertionError(
                f"chunk {i}: sha1 {digest} does not match h5py's {hashes[i]}"
            )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment