import astropy.io.fits._tiled_compression as tiled
from astropy.io import fits
import numcodecs
import fsspec
import zarr
import json
import numpy as np


class MY_RICE(tiled.codecs.Rice1):
    # Wrap astropy's Rice1 so it accepts raw bytes and honours the
    # optional out= argument of the numcodecs Codec API
    def decode(self, buf, out=None):
        arr = super().decode(np.frombuffer(buf, dtype="uint8"))
        if out is not None:
            out[:] = arr
            return out
        return arr


numcodecs.register_codec(MY_RICE)

fn = "/Users/mdurant/Downloads/aia.lev1.171A_2017-09-06T13_59_33.35Z.image_lev1.fits"
f = open(fn, "rb")
hdul = fits.open(f)

# _header includes the compression/table info, as opposed to .header,
# which is rewritten to the final image specs
full_head = hdul[1]._header
n_rows = full_head["NAXIS2"]

# Pull the Rice parameters out of the ZNAMEn/ZVALn keyword pairs
for i in range(1, 10):
    if full_head[f"ZNAME{i}"] == "BLOCKSIZE":
        blocksize = full_head[f"ZVAL{i}"]
        break
for i in range(1, 10):
    if full_head[f"ZNAME{i}"] == "BYTEPIX":
        bytepix = full_head[f"ZVAL{i}"]
        break
tilesize = full_head["ZTILE1"] * full_head["ZTILE2"]

# Read the (size, offset) pairs that locate each compressed tile, then
# note where the compressed data itself begins
inf = hdul[1].fileinfo()
f.seek(inf["datLoc"])
table = np.frombuffer(
    f.read(n_rows * 4 * 2), dtype=[("size", ">i4"), ("offset", ">i4")], count=n_rows
)
data_start = f.tell()


def test_codec():
    # Decode each tile by hand and compare against astropy's own decompression
    codec = tiled.codecs.Rice1(blocksize=blocksize, bytepix=bytepix, tilesize=tilesize)
    f.seek(data_start)
    for j in range(n_rows):
        block0 = f.read(table["size"][j])
        out = codec.decode(np.frombuffer(block0, "uint8"))
        assert (hdul[1].data[j] == out).all()


# One (path, offset, size) reference per tile, kerchunk style
refs = {
    f"data/{i}.0": [fn, data_start + table["offset"][i], table["size"][i]]
    for i in range(len(table))
}
# Values hard-coded to match this particular file: a 4096x4096 int16 image,
# stored as one 4096-pixel row per tile
refs["data/.zarray"] = b"""
{
    "chunks": [1, 4096],
    "compressor": {"blocksize": 32, "bytepix": 2, "tilesize": 4096, "id": "FITS_RICE1"},
    "filters": [],
    "dtype": "<i2",
    "fill_value": 0,
    "order": "C",
    "shape": [4096, 4096],
    "zarr_format": 2
}
"""

# Copy across whatever header values JSON can represent
head = {}
for k in hdul[1].header:
    try:
        v = hdul[1].header[k]
        json.dumps(v)  # skip COMMENT/HISTORY cards and other non-JSON values
        head[k] = v
    except (TypeError, ValueError):
        pass
refs["data/.zattrs"] = json.dumps(head, separators=(",", ":"), indent=None).encode()
refs[".zgroup"] = b'{"zarr_format": 2}'

fs = fsspec.filesystem("reference", fo=refs)
g = zarr.open(fs.get_mapper())
(g.data[:] == hdul[1].data).all()
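
As a usage sketch (continuing with the same objects as the script above): once the reference filesystem is in place, zarr only fetches and Rice-decodes the tiles that a selection actually touches.

# Hypothetical usage, continuing from the script above
row = g.data[100]          # one chunk: a single 4096-pixel row is read and decoded
corner = g.data[:16, :16]  # touches only the first 16 row-tiles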
My reason was, to know the byte at which the table ends. Is there a way to do that by reading the table with astropy?
hdu._theap will give you the byte offset of the heap, which doesn't have to be the same as the end of the table.
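
For example, with the objects from the script above (note that _theap is a private attribute, so it may change between astropy versions):

# Absolute byte offset at which the heap (the compressed tile data) starts;
# per the comment above, this need not coincide with the end of the table
heap_start = inf["datLoc"] + hdul[1]._theap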
A good point. This only happens in the very last chunk? I think the codec should zero-pad after decompression to the given tilesize, since zarr expects to read whole chunks even at the edge.
It zero pads the output array and then zarr crops it down afterwards?
Yes, we would have to do this. A normal chunk is stored whole, with the unused part normally zeros (or another default fill value), but the actual values don't matter. This allows for expanding the array by just rewriting the metadata.
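
A rough sketch of what that padding could look like, assuming the codec keeps its tilesize parameter as an attribute (the class name here is illustrative, not the actual astropy implementation):

import numpy as np

class PaddedRice1(MY_RICE):
    # Illustrative only: pad a short edge tile up to tilesize so that
    # zarr always receives a whole chunk; zarr then crops off the excess
    def decode(self, buf, out=None):
        arr = super().decode(buf)
        if arr.size < self.tilesize:
            arr = np.pad(arr, (0, self.tilesize - arr.size))
        if out is not None:
            out[:] = arr
            return out
        return arr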
OK, that's interesting, and very different to the underlying way FITS stores it.
We need to make a list of desired changes to our codec implementations. I am going to lose them, scattered around the place.
OK; didn't know about that flag.