Skip to content

Instantly share code, notes, and snippets.

@martindurant
Last active September 20, 2022 17:14
Show Gist options
  • Save martindurant/95a5c67a1a1cd9ea6c67bd515e331613 to your computer and use it in GitHub Desktop.
Save martindurant/95a5c67a1a1cd9ea6c67bd515e331613 to your computer and use it in GitHub Desktop.
Single file datasets
import kerchunk.hdf
import kerchunk.combine
import fsspec
import tarfile
import xarray as xr
# Storage options grouped by protocol name. The "s3" entry asks for anonymous
# access and points the client at a custom object-store endpoint.
ro = {
    "s3": {
        "anon": True,
        "client_kwargs": {
            "endpoint_url": "https://object-store.cloud.muni.cz",
        },
    },
}
# Index the tar archive: record, for every member name, the byte position in
# the archive at which that member's data starts.
with fsspec.open("s3://testfred/ice.tar", **ro["s3"]) as tf:
    tar = tarfile.TarFile(fileobj=tf)
    offsets = {}
    for member in tar.getmembers():
        offsets[member.name] = member.offset_data
# Open every "*.nc" member of the tar archive and build one kerchunk reference
# set per member, keyed by the member's path inside the archive.
ofs = fsspec.open_files(
    # should have "first" caching strategy?
    "tar://*.nc::s3://testfred/ice.tar", **ro
)
outs = {}
for of in ofs:
    with of as f:
        # SingleHdf5ToZarr receives an already-open file object here, so the
        # member path is passed explicitly as the URL to record in the
        # generated references (without it the dataset could not be opened,
        # as reported by a user of this script).
        h = kerchunk.hdf.SingleHdf5ToZarr(f, of.path)
        outs[of.path] = h.translate()
# Re-point each per-member reference set at the tar archive itself: every
# list-valued reference gets the archive URL as its target and has the
# member's data offset within the archive added to its byte offset.
mods = {}
for key, offset in offsets.items():
    if key not in outs:
        # The tar index lists every member, but only "*.nc" members were
        # translated; other members have no reference set to rewrite.
        continue
    out = outs[key]
    fs = fsspec.filesystem(
        "reference",
        fo=out,
        template_overrides={"u": f"tar://{key}"},
        remote_options={
            "fo": "s3://testfred/ice.tar",
            "target_options": ro["s3"],
        },
    )
    mod = {}
    for k, v in fs.references.items():
        if isinstance(v, list):
            # Build a new list rather than editing v in place: a shallow copy
            # of the mapping would still share these lists with the
            # filesystem object, so re-running this loop could add the
            # offset a second time.
            v = ["s3://testfred/ice.tar", v[1] + offset, *v[2:]]
        mod[k] = v
    mods[key] = mod

# Fail loudly if a translated member could not be matched to a tar entry,
# instead of silently combining fewer files than were scanned.
missing = sorted(set(outs) - set(mods))
if missing:
    raise KeyError(f"no tar offset found for translated members: {missing}")
# Combine the per-member reference sets into one aggregate reference set,
# concatenating along "time_counter" and treating "nav_lon" / "nav_lat" as
# identical in every input.
per_member_refs = [mods[name] for name in mods]
mzz = kerchunk.combine.MultiZarrToZarr(
    per_member_refs,
    concat_dims=["time_counter"],
    identical_dims=["nav_lon", "nav_lat"],
    remote_protocol="s3",
    remote_options=ro["s3"],
)
out = mzz.translate()
# Open the combined reference set as an xarray dataset through the
# "reference://" filesystem; the references are handed over in memory via "fo".
reference_options = {
    "fo": out,
    "remote_options": ro["s3"],
}
ds = xr.open_zarr(
    "reference://",
    storage_options=reference_options,
    consolidated=False,
)
# Save the combined reference set as JSON in icemod.json.
# ujson is imported before the file is opened: open(..., "w") creates or
# truncates the file immediately, so an import failure inside the with-block
# would leave an empty icemod.json behind.
import ujson

with open("icemod.json", "w") as j:
    ujson.dump(out, j)
# Quick check of the opened dataset: average "sivolu" over time_counter first,
# then over everything that remains, and evaluate the result.
time_mean = ds.sivolu.mean(dim="time_counter")
time_mean.mean().compute()
# 0.9407204
@tinaok
Copy link

tinaok commented Sep 20, 2022

I updated lines 19 to 26 as follows:

# Open every "*.nc" member of the tar archive.
ofs = fsspec.open_files("tar://*.nc::s3://testfred/ice.tar", **ro)
outs = {}

for member in ofs:
    with member as fobj:
        # The member path is given as the second argument (the URL), in
        # addition to the open file object.
        translator = kerchunk.hdf.SingleHdf5ToZarr(fobj, member.path)
        outs[member.path] = translator.translate()

and I could open the tar file as a dataset, thank you!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment