Skip to content

Instantly share code, notes, and snippets.

@rsignell-usgs
Created October 17, 2023 13:40
Show Gist options
  • Save rsignell-usgs/fe4a5e5133b79ac8398895c6499bf3a2 to your computer and use it in GitHub Desktop.
Save rsignell-usgs/fe4a5e5133b79ac8398895c6499bf3a2 to your computer and use it in GitHub Desktop.
02_lazy.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "outside-mayor",
"metadata": {},
"outputs": [],
"source": [
"import fsspec\n",
"import xarray as xr\n",
"from kerchunk.hdf import SingleHdf5ToZarr\n",
"from kerchunk.combine import MultiZarrToZarr, auto_dask, JustLoad\n",
"from dask.distributed import Client\n",
"import dask.bag as db\n",
"import ujson\n",
"from pathlib import Path\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c44423f2-befc-4da3-bb91-bf6b59a6b10d",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 2,
"id": "endangered-therapist",
"metadata": {},
"outputs": [],
"source": [
"fs_read = fsspec.filesystem('s3', anon=True, skip_instance_cache=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "native-serial",
"metadata": {},
"outputs": [],
"source": [
"fs_write = fsspec.filesystem('s3', anon=False, skip_instance_cache=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "packed-lightning",
"metadata": {},
"outputs": [],
"source": [
"flist = fs_read.glob('s3://usgs-coawst/useast-archive/*.nc')\n",
"json_dir = 's3://usgs-coawst/useast-archive/json'"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "destroyed-abortion",
"metadata": {},
"outputs": [],
"source": [
"flist = [f's3://{f}' for f in flist]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "charitable-logan",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"736\n",
"s3://usgs-coawst/useast-archive/coawst_2009-08-21_0000.nc\n",
"s3://usgs-coawst/useast-archive/coawst_2023-09-22_0735.nc\n"
]
}
],
"source": [
"print(len(flist))\n",
"print(flist[0])\n",
"print(flist[-1])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "636a01dd-334b-46fa-a7ec-52b2c3d567e4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"736\n",
"s3://usgs-coawst/useast-archive/json/coawst_2009-08-21_0000.json\n",
"s3://usgs-coawst/useast-archive/json/coawst_2023-09-22_0735.json\n"
]
}
],
"source": [
"json_list = fs_read.glob(f'{json_dir}/*.json')\n",
"json_list = [f's3://{j}' for j in json_list]\n",
"print(len(json_list))\n",
"print(json_list[0])\n",
"print(json_list[-1])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "protecting-recall",
"metadata": {},
"outputs": [],
"source": [
"import base64\n",
"\n",
"def consolidate(out_):\n",
" for k, v in out_.items():\n",
" if isinstance(v, bytes):\n",
" try:\n",
" # easiest way to test if data is ascii\n",
" out_[k] = v.decode('ascii')\n",
" except UnicodeDecodeError:\n",
" out_[k] = (b\"base64:\" + base64.b64encode(v)).decode()\n",
" else:\n",
" out_[k] = v\n",
" return out_\n",
"\n",
"import zarr\n",
"\n",
"def modify_attrs(out):\n",
" out_= zarr.open(out)\n",
" out_.ocean_time.attrs['standard_name'] = 'time'\n",
" return out\n",
"\n",
"def preprocess(out):\n",
" out = modify_attrs(out)\n",
" out = consolidate(out)\n",
" return out"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "d0e1e923-6ae5-4273-ad43-46fbae1caa5a",
"metadata": {},
"outputs": [],
"source": [
"from kerchunk import hdf, combine, df\n",
"import fsspec.implementations.reference\n",
"from fsspec.implementations.reference import LazyReferenceMapper\n",
"from tempfile import TemporaryDirectory"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "a75e9548-ed8a-4db4-ae4f-0752ac241a16",
"metadata": {},
"outputs": [],
"source": [
"fs_local = fsspec.filesystem(\"file\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8cd651a4-92c5-4d3d-ad47-8c4ab10e00d4",
"metadata": {},
"outputs": [],
"source": [
"fs_local.mkdir('combined.parq')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "ffb98395-d64e-4926-96a5-19a6f19f64b3",
"metadata": {},
"outputs": [],
"source": [
"out = LazyReferenceMapper.create(1000, \"combined.parq\", fs_local)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "fe81ce8c-69f7-4cf5-81f0-0fa6f38b9ea2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1min 58s, sys: 10.5 s, total: 2min 8s\n",
"Wall time: 6min 16s\n"
]
}
],
"source": [
"%%time\n",
"out_dict = MultiZarrToZarr(\n",
"json_list,\n",
"remote_protocol=\"s3\",\n",
"concat_dims=[\"ocean_time\"],\n",
" coo_map={\"ocean_time\": \"cf:ocean_time\"},\n",
" identical_dims=['lat_psi','lat_rho','lat_u','lat_v',\n",
" 'lon_psi','lon_rho','lon_u','lon_v'],\n",
" preprocess=preprocess,\n",
" out=out).translate()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "700b0ea3-7c75-4c8e-ab7b-a91e4fd10f9d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 7.98 ms, sys: 0 ns, total: 7.98 ms\n",
"Wall time: 89.7 ms\n"
]
}
],
"source": [
"%%time\n",
"out.flush()\n",
"\n",
"df.refs_to_dataframe(out_dict, \"combined.parq\")\n",
"\n",
"fs_ref = fsspec.implementations.reference.ReferenceFileSystem(\n",
" \"combined.parq\", remote_protocol=\"s3\", target_protocol=\"file\", lazy=True)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "ce9ee213-8e0a-437d-ba0f-5b5f7fd2f9b8",
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "'shape'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"File \u001b[0;32m<timed exec>:1\u001b[0m\n",
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/xarray/backends/api.py:540\u001b[0m, in \u001b[0;36mopen_dataset\u001b[0;34m(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, inline_array, backend_kwargs, **kwargs)\u001b[0m\n\u001b[1;32m 528\u001b[0m decoders \u001b[38;5;241m=\u001b[39m _resolve_decoders_kwargs(\n\u001b[1;32m 529\u001b[0m decode_cf,\n\u001b[1;32m 530\u001b[0m open_backend_dataset_parameters\u001b[38;5;241m=\u001b[39mbackend\u001b[38;5;241m.\u001b[39mopen_dataset_parameters,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 536\u001b[0m decode_coords\u001b[38;5;241m=\u001b[39mdecode_coords,\n\u001b[1;32m 537\u001b[0m )\n\u001b[1;32m 539\u001b[0m overwrite_encoded_chunks \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moverwrite_encoded_chunks\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[0;32m--> 540\u001b[0m backend_ds \u001b[38;5;241m=\u001b[39m \u001b[43mbackend\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 541\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename_or_obj\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 542\u001b[0m \u001b[43m \u001b[49m\u001b[43mdrop_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdrop_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 543\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mdecoders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 544\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 545\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 546\u001b[0m ds \u001b[38;5;241m=\u001b[39m _dataset_from_backend_dataset(\n\u001b[1;32m 547\u001b[0m backend_ds,\n\u001b[1;32m 548\u001b[0m filename_or_obj,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 556\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 557\u001b[0m )\n\u001b[1;32m 558\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ds\n",
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/xarray/backends/zarr.py:897\u001b[0m, in \u001b[0;36mZarrBackendEntrypoint.open_dataset\u001b[0;34m(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, group, mode, synchronizer, consolidated, chunk_store, storage_options, stacklevel, zarr_version)\u001b[0m\n\u001b[1;32m 895\u001b[0m store_entrypoint \u001b[38;5;241m=\u001b[39m StoreBackendEntrypoint()\n\u001b[1;32m 896\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m close_on_error(store):\n\u001b[0;32m--> 897\u001b[0m ds \u001b[38;5;241m=\u001b[39m \u001b[43mstore_entrypoint\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 898\u001b[0m \u001b[43m \u001b[49m\u001b[43mstore\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 899\u001b[0m \u001b[43m \u001b[49m\u001b[43mmask_and_scale\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmask_and_scale\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 900\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecode_times\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecode_times\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 901\u001b[0m \u001b[43m \u001b[49m\u001b[43mconcat_characters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconcat_characters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 902\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecode_coords\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecode_coords\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 903\u001b[0m \u001b[43m \u001b[49m\u001b[43mdrop_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdrop_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 904\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cftime\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cftime\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 905\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecode_timedelta\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecode_timedelta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 906\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 907\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ds\n",
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/xarray/backends/store.py:28\u001b[0m, in \u001b[0;36mStoreBackendEntrypoint.open_dataset\u001b[0;34m(self, store, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta)\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mopen_dataset\u001b[39m(\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 18\u001b[0m store,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 26\u001b[0m decode_timedelta\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 27\u001b[0m ):\n\u001b[0;32m---> 28\u001b[0m \u001b[38;5;28mvars\u001b[39m, attrs \u001b[38;5;241m=\u001b[39m \u001b[43mstore\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 29\u001b[0m encoding \u001b[38;5;241m=\u001b[39m store\u001b[38;5;241m.\u001b[39mget_encoding()\n\u001b[1;32m 31\u001b[0m \u001b[38;5;28mvars\u001b[39m, attrs, coord_names \u001b[38;5;241m=\u001b[39m conventions\u001b[38;5;241m.\u001b[39mdecode_cf_variables(\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28mvars\u001b[39m,\n\u001b[1;32m 33\u001b[0m attrs,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 40\u001b[0m decode_timedelta\u001b[38;5;241m=\u001b[39mdecode_timedelta,\n\u001b[1;32m 41\u001b[0m )\n",
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/xarray/backends/common.py:128\u001b[0m, in \u001b[0;36mAbstractDataStore.load\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 107\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;124;03m This loads the variables and attributes simultaneously.\u001b[39;00m\n\u001b[1;32m 109\u001b[0m \u001b[38;5;124;03m A centralized loading function makes it easier to create\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[38;5;124;03m are requested, so care should be taken to make sure its fast.\u001b[39;00m\n\u001b[1;32m 126\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m 127\u001b[0m variables \u001b[38;5;241m=\u001b[39m FrozenDict(\n\u001b[0;32m--> 128\u001b[0m (_decode_variable_name(k), v) \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_variables\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mitems()\n\u001b[1;32m 129\u001b[0m )\n\u001b[1;32m 130\u001b[0m attributes \u001b[38;5;241m=\u001b[39m FrozenDict(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_attrs())\n\u001b[1;32m 131\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m variables, attributes\n",
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/xarray/backends/zarr.py:480\u001b[0m, in \u001b[0;36mZarrStore.get_variables\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 479\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_variables\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 480\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mFrozenDict\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 481\u001b[0m \u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen_store_variable\u001b[49m\u001b[43m(\u001b[49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mv\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mv\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mzarr_group\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marrays\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 482\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/xarray/core/utils.py:469\u001b[0m, in \u001b[0;36mFrozenDict\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 468\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mFrozenDict\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Frozen:\n\u001b[0;32m--> 469\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m Frozen(\u001b[38;5;28;43mdict\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m)\n",
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/xarray/backends/zarr.py:480\u001b[0m, in \u001b[0;36m<genexpr>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 479\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_variables\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 480\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m FrozenDict(\n\u001b[1;32m 481\u001b[0m (k, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mopen_store_variable(k, v)) \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mzarr_group\u001b[38;5;241m.\u001b[39marrays()\n\u001b[1;32m 482\u001b[0m )\n",
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/zarr/hierarchy.py:629\u001b[0m, in \u001b[0;36mGroup._array_iter\u001b[0;34m(self, keys_only, method, recurse)\u001b[0m\n\u001b[1;32m 627\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_array_iter\u001b[39m(\u001b[38;5;28mself\u001b[39m, keys_only, method, recurse):\n\u001b[1;32m 628\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_version \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m2\u001b[39m:\n\u001b[0;32m--> 629\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28msorted\u001b[39m(\u001b[43mlistdir\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_store\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_path\u001b[49m\u001b[43m)\u001b[49m):\n\u001b[1;32m 630\u001b[0m path \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_key_prefix \u001b[38;5;241m+\u001b[39m key\n\u001b[1;32m 631\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m contains_array(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_store, path):\n",
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/zarr/storage.py:212\u001b[0m, in \u001b[0;36mlistdir\u001b[0;34m(store, path)\u001b[0m\n\u001b[1;32m 209\u001b[0m path \u001b[38;5;241m=\u001b[39m normalize_storage_path(path)\n\u001b[1;32m 210\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(store, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlistdir\u001b[39m\u001b[38;5;124m'\u001b[39m):\n\u001b[1;32m 211\u001b[0m \u001b[38;5;66;03m# pass through\u001b[39;00m\n\u001b[0;32m--> 212\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mstore\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlistdir\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 214\u001b[0m \u001b[38;5;66;03m# slow version, iterate through all keys\u001b[39;00m\n\u001b[1;32m 215\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 216\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mStore \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstore\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m has no `listdir` method. From zarr 2.9 onwards \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 217\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmay want to inherit from `Store`.\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 218\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m,\n\u001b[1;32m 219\u001b[0m )\n",
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/zarr/_storage/store.py:144\u001b[0m, in \u001b[0;36mStore.listdir\u001b[0;34m(self, path)\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mlistdir\u001b[39m(\u001b[38;5;28mself\u001b[39m, path: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[\u001b[38;5;28mstr\u001b[39m]:\n\u001b[1;32m 143\u001b[0m path \u001b[38;5;241m=\u001b[39m normalize_storage_path(path)\n\u001b[0;32m--> 144\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_listdir_from_keys\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/zarr/_storage/store.py:413\u001b[0m, in \u001b[0;36m_listdir_from_keys\u001b[0;34m(store, path)\u001b[0m\n\u001b[1;32m 411\u001b[0m prefix \u001b[38;5;241m=\u001b[39m _path_to_prefix(path)\n\u001b[1;32m 412\u001b[0m children \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n\u001b[0;32m--> 413\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mstore\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkeys\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 414\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m key\u001b[38;5;241m.\u001b[39mstartswith(prefix) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(key) \u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlen\u001b[39m(prefix):\n\u001b[1;32m 415\u001b[0m suffix \u001b[38;5;241m=\u001b[39m key[\u001b[38;5;28mlen\u001b[39m(prefix):]\n",
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/_collections_abc.py:866\u001b[0m, in \u001b[0;36mMappingView.__len__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 865\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__len__\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 866\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_mapping\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/zarr/storage.py:742\u001b[0m, in \u001b[0;36mKVStore.__len__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 741\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__len__\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 742\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_mutable_mapping\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/fsspec/mapping.py:177\u001b[0m, in \u001b[0;36mFSMap.__len__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 176\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__len__\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 177\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfind\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mroot\u001b[49m\u001b[43m)\u001b[49m)\n",
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/fsspec/implementations/reference.py:1019\u001b[0m, in \u001b[0;36mReferenceFileSystem.find\u001b[0;34m(self, path, maxdepth, withdirs, detail, **kwargs)\u001b[0m\n\u001b[1;32m 1017\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[38;5;28msorted\u001b[39m(k \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreferences \u001b[38;5;28;01mif\u001b[39;00m k\u001b[38;5;241m.\u001b[39mstartswith(path))\n\u001b[1;32m 1018\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1019\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msorted\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreferences\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1020\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m detail:\n\u001b[1;32m 1021\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdircache:\n",
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/fsspec/implementations/reference.py:453\u001b[0m, in \u001b[0;36mLazyReferenceMapper.__len__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 451\u001b[0m count \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 452\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 453\u001b[0m chunk_sizes \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_chunk_sizes\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfield\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 454\u001b[0m nchunks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnp\u001b[38;5;241m.\u001b[39mproduct(chunk_sizes)\n\u001b[1;32m 455\u001b[0m count \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m nchunks\n",
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/fsspec/implementations/reference.py:278\u001b[0m, in \u001b[0;36mLazyReferenceMapper._get_chunk_sizes\u001b[0;34m(self, field)\u001b[0m\n\u001b[1;32m 275\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m field \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mchunk_sizes:\n\u001b[1;32m 276\u001b[0m zarray \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mzmetadata[\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfield\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/.zarray\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 277\u001b[0m size_ratio \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m--> 278\u001b[0m math\u001b[38;5;241m.\u001b[39mceil(s \u001b[38;5;241m/\u001b[39m c) \u001b[38;5;28;01mfor\u001b[39;00m s, c \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(\u001b[43mzarray\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mshape\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m, zarray[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mchunks\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 279\u001b[0m ]\n\u001b[1;32m 280\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mchunk_sizes[field] \u001b[38;5;241m=\u001b[39m size_ratio\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mchunk_sizes[field]\n",
"\u001b[0;31mKeyError\u001b[0m: 'shape'"
]
}
],
"source": [
"%%time\n",
"ds = xr.open_dataset(\n",
" fs_ref.get_mapper(), engine=\"zarr\", drop_variables=['dstart'], chunks={},\n",
" backend_kwargs={\"consolidated\": False}\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dd07ecbf-232f-45d1-965c-10e08dd7856a",
"metadata": {},
"outputs": [],
"source": [
"import xarray as xr\n",
"\n",
"files = fsspec.open(location_of_data)\n",
"\n",
"# Create LazyReferenceMapper to pass to MultiZarrToZarr\n",
"fs = fsspec.filesystem(\"file\")\n",
"\n",
"os.makedirs(\"combined.parq\")\n",
"out = LazyReferenceMapper.create(1000, \"combined.parq\", fs)\n",
"\n",
"# Create references from input files\n",
"single_ref_sets = [hdf.SingleHdf5ToZarr(_).translate() for _ in files]\n",
"\n",
"out_dict = MultiZarrToZarr(\n",
" single_ref_sets,\n",
" remote_protocol=\"memory\",\n",
" concat_dims=[\"time\"],\n",
" out=out).translate()\n",
"\n",
"out.flush()\n",
"\n",
"df.refs_to_dataframe(out_dict, \"combined.parq\")\n",
"\n",
"fs = fsspec.implementations.reference.ReferenceFileSystem(\n",
" \"combined.parq\", remote_protocol=\"s3\", target_protocol=\"file\", lazy=True)\n",
"ds = xr.open_dataset(\n",
" fs.get_mapper(), engine=\"zarr\",\n",
" backend_kwargs={\"consolidated\": False}\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "users-users-pangeo",
"language": "python",
"name": "conda-env-users-users-pangeo-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"state": {},
"version_major": 2,
"version_minor": 0
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment