Skip to content

Instantly share code, notes, and snippets.

@martindurant
Created October 31, 2022 18:26
Show Gist options
  • Save martindurant/d8fde3992326bde766af695b9fd60c18 to your computer and use it in GitHub Desktop.
Save martindurant/d8fde3992326bde766af695b9fd60c18 to your computer and use it in GitHub Desktop.
Single file kerchunking
import kerchunk.hdf
import kerchunk.combine
import fsspec
import tarfile
import xarray as xr
# Storage options for fsspec, keyed by protocol name. The "s3" entry asks for
# anonymous access and points the client at a custom endpoint URL.
ro = {
    "s3": {
        "anon": True,
        "client_kwargs": {
            "endpoint_url": "https://object-store.cloud.muni.cz",
        },
    },
}
# Get offsets: for every member listed in the remote tar archive, record the
# byte position at which that member's data begins inside the archive.
offsets = {}
with fsspec.open("s3://testfred/gridS.tar", **ro["s3"]) as remote_tar:
    tar = tarfile.TarFile(fileobj=remote_tar)
    for member in tar.getmembers():
        offsets[member.name] = member.offset_data
# Open the archive members matching *.nc and translate each one into a
# kerchunk reference set, stored under the member's path.
# NOTE(review): this reads the archive from a local path, while the offsets
# and final references use the S3 copy -- presumably the same file; confirm.
ofs = fsspec.open_files(
    # should have "first" caching strategy?
    "tar://*.nc::~/Downloads/gridS.tar",
    **ro,
)
outs = {}
for open_file in ofs:
    # The file must stay open while it is being translated.
    with open_file as nc_handle:
        outs[open_file.path] = kerchunk.hdf.SingleHdf5ToZarr(nc_handle).translate()
# Rewrite each per-member reference set so that its list-valued references
# address the remote tar archive directly: the URL is replaced by the archive
# URL and the byte offset is shifted by the member's data offset in the tar.
tar_url = "s3://testfred/gridS.tar"
mods = {}
for key, offset in offsets.items():
    if key not in outs:
        # `offsets` lists every member of the archive, whereas `outs` only
        # holds the members that were translated (those matching *.nc).
        # Skip anything without a reference set instead of raising KeyError.
        continue
    fs = fsspec.filesystem(
        "reference",
        fo=outs[key],
        template_overrides={"u": f"tar://{key}"},
        remote_options={
            "fo": tar_url,
            "target_options": ro["s3"],
        },
    )
    # List-valued references carry the target URL at index 0 and a byte
    # offset at index 1; other values are passed through untouched.
    # Fresh lists are built so the references held by `fs` (and possibly by
    # `outs`) are not modified in place.
    mods[key] = {
        name: [tar_url, ref[1] + offset, *ref[2:]] if isinstance(ref, list) else ref
        for name, ref in fs.references.items()
    }
# Merge the rewritten per-member reference sets into a single reference set,
# concatenating along "time_counter" and treating "nav_lon"/"nav_lat" as
# identical across inputs.
reference_sets = [*mods.values()]
mzz = kerchunk.combine.MultiZarrToZarr(
    reference_sets,
    remote_protocol="s3",
    remote_options=ro["s3"],
    concat_dims=["time_counter"],
    identical_dims=["nav_lon", "nav_lat"],
)
out = mzz.translate()
# Open the combined references as a dataset through the "reference://"
# filesystem; the in-memory reference dict is passed as "fo" and the S3
# options are forwarded for fetching the referenced bytes.
reference_options = {
    "fo": out,
    "remote_options": ro["s3"],
}
ds = xr.open_zarr(
    "reference://",
    storage_options=reference_options,
    consolidated=False,
)
# Persist the combined reference set as JSON.
# Import before opening the output: opening with "w" creates/truncates the
# file, so a failed import afterwards would leave an empty gridS.json behind.
import ujson

with open("gridS.json", "w") as j:
    ujson.dump(out, j)
# Sanity check: average "sivolu" over time_counter first, then over the
# remaining dimensions, and force the computation.
sivolu_time_mean = ds["sivolu"].mean(dim="time_counter")
sivolu_time_mean.mean().compute()
# 0.9407204
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment