astropy-rice-play
import astropy.io.fits._tiled_compression as tiled
from astropy.io import fits
import numcodecs
import fsspec
import zarr
import json
import numpy as np
class MY_RICE(tiled.codecs.Rice1):
    # numcodecs hands the codec raw bytes; Rice1 expects a uint8 array
    def decode(self, buf, out=None):
        arr = super().decode(np.frombuffer(buf, dtype="uint8"))
        if out is not None:
            out[:] = arr
            return out
        return arr

numcodecs.register_codec(MY_RICE)
fn = "/Users/mdurant/Downloads/aia.lev1.171A_2017-09-06T13_59_33.35Z.image_lev1.fits"
f = open(fn, "rb")
hdul = fits.open(f)
# includes compression/table info, as opposed to .header which is rewritten to the final image specs
full_head = hdul[1]._header
n_rows = full_head["NAXIS2"]
for i in range(1, 10):
    if full_head[f"ZNAME{i}"] == "BLOCKSIZE":
        blocksize = full_head[f"ZVAL{i}"]
        break
for i in range(1, 10):
    if full_head[f"ZNAME{i}"] == "BYTEPIX":
        bytepix = full_head[f"ZVAL{i}"]
        break
tilesize = full_head["ZTILE1"] * full_head["ZTILE2"]
inf = hdul[1].fileinfo()
f.seek(inf["datLoc"])
table = np.frombuffer(f.read(n_rows*4*2), dtype=[("size", ">i4"), ("offset", ">i4")], count=n_rows)
data_start = f.tell()
def test_codec():
    codec = tiled.codecs.Rice1(blocksize=blocksize, bytepix=bytepix, tilesize=tilesize)
    f.seek(data_start)
    for j in range(n_rows):
        block0 = f.read(table["size"][j])
        out = codec.decode(np.frombuffer(block0, "uint8"))
        assert (hdul[1].data[j] == out).all()
refs = {f"data/{i}.0": [fn, data_start + table["offset"][i], table["size"][i]]
for i in range(len(table))}
refs["data/.zarray"] = b"""
{
"chunks": [1, 4096],
"compressor": {"blocksize": 32, "bytepix": 2, "tilesize": 4096, "id": "FITS_RICE1"},
"filters": [],
"dtype": "<i2",
"fill_value": 0,
"order": "C",
"shape": [4096, 4096],
"zarr_format": 2
}
"""
head = {}
for k in hdul[1].header:
    try:
        # skip cards that can't be read cleanly
        head[k] = hdul[1].header[k]
    except Exception:
        pass
# zarr v2 reads attributes from ".zattrs" (the original ".zattr" would be ignored)
refs["data/.zattrs"] = json.dumps(head, separators=(',', ':'), indent=None).encode()
refs[".zgroup"] = b'{"zarr_format": 2}'
fs = fsspec.filesystem("reference", fo=refs)
g = zarr.open(fs.get_mapper())
assert (g.data[:] == hdul[1].data).all()
martindurant (Author) commented:
> Using fits.open(disable_image_compression=True) is probably the best idea; it stops you having to access ._header and makes it easier to get at the offsets table.

OK; I didn't know about that flag.
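For context, a minimal sketch of that suggestion, reusing fn from the gist above (the printed column names are typical examples, not guaranteed):

# With compression disabled, HDU 1 stays a plain BinTableHDU, so the
# compression header and the tile-descriptor columns are reachable
# without touching private attributes.
with fits.open(fn, disable_image_compression=True) as raw:
    tab = raw[1]
    print(tab.header["ZCMPTYPE"])   # e.g. RICE_1
    print(tab.columns.names)        # e.g. ['COMPRESSED_DATA', ...]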

> You almost certainly want to use astropy to read the offset table (not the heap), because it can have other columns (e.g. some tiles can fall back to a gzip codec if the quantization fails).

My reason was to know the byte offset at which the table ends. Is there a way to find that when reading the table with astropy?

> The AIA data always has an integer number of tile_size pixels, which isn't true for all files, and the codec (currently) needs to know the actual tile size for that tile rather than the target (maximum) tile size.

A good point. Does this only happen in the very last chunk? I think the codec should zero-pad after decompression to the given tilesize, since zarr expects to read whole chunks even at the edge.
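One rough sketch of that padding idea, building on the MY_RICE wrapper above. It assumes the edge tile's true pixel count is known when the codec is built (actual_tilesize and the codec_id are illustrative, not astropy or numcodecs API):

class PaddedRice(tiled.codecs.Rice1):
    codec_id = "FITS_RICE1_PADDED"  # hypothetical id, for this sketch only

    def __init__(self, blocksize, bytepix, tilesize, actual_tilesize=None):
        # decode with the tile's true size, then pad up to the nominal size
        self.nominal = tilesize
        super().__init__(blocksize=blocksize, bytepix=bytepix,
                         tilesize=actual_tilesize or tilesize)

    def decode(self, buf, out=None):
        arr = super().decode(np.frombuffer(buf, dtype="uint8"))
        if arr.size < self.nominal:
            pad = np.zeros(self.nominal - arr.size, dtype=arr.dtype)
            arr = np.concatenate([arr, pad])
        if out is not None:
            out[:] = arr
            return out
        return arr

Note that zarr v2 stores a single compressor config per array, so a per-tile actual_tilesize doesn't slot in directly; the sketch only illustrates the padding step itself.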

Cadair commented Mar 21, 2023:

> Is there a way to do that by reading the table with astropy?

hdu._theap will give you the byte offset of the heap, which doesn't have to be the same as the end of the table.
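If so, a sketch of how the gist's hand-seeking could use that instead, assuming ._theap is (like the FITS THEAP keyword) an offset relative to the start of the HDU's data area:

inf = hdul[1].fileinfo()
heap_start = inf["datLoc"] + hdul[1]._theap
# tile offsets in the descriptor table are relative to the heap, so the
# references would be built as heap_start + table["offset"][i]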

> I think the codec should zero-pad after decompression to the given tilesize, since zarr expects to read whole chunks even at the edge.

It zero-pads the output array and then zarr crops it down afterwards?

martindurant (Author) commented:

> It zero-pads the output array and then zarr crops it down afterwards?

Yes, we would have to do this. A normal chunk is stored whole, with the unused part normally zeros (or another default fill value), but the actual values don't matter. This allows the array to be expanded just by rewriting the metadata.
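A tiny standalone illustration of that behaviour with the zarr v2 API (nothing FITS-specific):

import numpy as np
import zarr

z = zarr.zeros((10,), chunks=(4,), dtype="i2")
z[:] = np.arange(10)
# the last chunk has room for 4 values but only 2 fall inside the array;
# reads are cropped to the array's shape, so the padding never surfaces
print(z[8:])  # -> [8 9]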

Cadair commented Mar 21, 2023:

Ok that's interesting, and very different to the underlying way FITS stores it.

We need to make a list of desired changes to our codec implementations, otherwise I am going to lose track of them, scattered around the place.
