Created
October 17, 2023 13:40
-
-
Save rsignell-usgs/fe4a5e5133b79ac8398895c6499bf3a2 to your computer and use it in GitHub Desktop.
02_lazy.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "outside-mayor", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import fsspec\n", | |
"import xarray as xr\n", | |
"from kerchunk.hdf import SingleHdf5ToZarr\n", | |
"from kerchunk.combine import MultiZarrToZarr, auto_dask, JustLoad\n", | |
"from dask.distributed import Client\n", | |
"import dask.bag as db\n", | |
"import ujson\n", | |
"from pathlib import Path\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "c44423f2-befc-4da3-bb91-bf6b59a6b10d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "endangered-therapist", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"fs_read = fsspec.filesystem('s3', anon=True, skip_instance_cache=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "native-serial", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"fs_write = fsspec.filesystem('s3', anon=False, skip_instance_cache=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "packed-lightning", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"flist = fs_read.glob('s3://usgs-coawst/useast-archive/*.nc')\n", | |
"json_dir = 's3://usgs-coawst/useast-archive/json'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "destroyed-abortion", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"flist = [f's3://{f}' for f in flist]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "charitable-logan", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"736\n", | |
"s3://usgs-coawst/useast-archive/coawst_2009-08-21_0000.nc\n", | |
"s3://usgs-coawst/useast-archive/coawst_2023-09-22_0735.nc\n" | |
] | |
} | |
], | |
"source": [ | |
"print(len(flist))\n", | |
"print(flist[0])\n", | |
"print(flist[-1])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "636a01dd-334b-46fa-a7ec-52b2c3d567e4", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"736\n", | |
"s3://usgs-coawst/useast-archive/json/coawst_2009-08-21_0000.json\n", | |
"s3://usgs-coawst/useast-archive/json/coawst_2023-09-22_0735.json\n" | |
] | |
} | |
], | |
"source": [ | |
"json_list = fs_read.glob(f'{json_dir}/*.json')\n", | |
"json_list = [f's3://{j}' for j in json_list]\n", | |
"print(len(json_list))\n", | |
"print(json_list[0])\n", | |
"print(json_list[-1])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "protecting-recall", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import base64\n", | |
"\n", | |
"def consolidate(out_):\n", | |
" for k, v in out_.items():\n", | |
" if isinstance(v, bytes):\n", | |
" try:\n", | |
" # easiest way to test if data is ascii\n", | |
" out_[k] = v.decode('ascii')\n", | |
" except UnicodeDecodeError:\n", | |
" out_[k] = (b\"base64:\" + base64.b64encode(v)).decode()\n", | |
" else:\n", | |
" out_[k] = v\n", | |
" return out_\n", | |
"\n", | |
"import zarr\n", | |
"\n", | |
"def modify_attrs(out):\n", | |
" out_= zarr.open(out)\n", | |
" out_.ocean_time.attrs['standard_name'] = 'time'\n", | |
" return out\n", | |
"\n", | |
"def preprocess(out):\n", | |
" out = modify_attrs(out)\n", | |
" out = consolidate(out)\n", | |
" return out" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "d0e1e923-6ae5-4273-ad43-46fbae1caa5a", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from kerchunk import hdf, combine, df\n", | |
"import fsspec.implementations.reference\n", | |
"from fsspec.implementations.reference import LazyReferenceMapper\n", | |
"from tempfile import TemporaryDirectory" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "a75e9548-ed8a-4db4-ae4f-0752ac241a16", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"fs_local = fsspec.filesystem(\"file\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "8cd651a4-92c5-4d3d-ad47-8c4ab10e00d4", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"fs_local.mkdir('combined.parq')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "ffb98395-d64e-4926-96a5-19a6f19f64b3", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"out = LazyReferenceMapper.create(1000, \"combined.parq\", fs_local)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "fe81ce8c-69f7-4cf5-81f0-0fa6f38b9ea2", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1min 58s, sys: 10.5 s, total: 2min 8s\n", | |
"Wall time: 6min 16s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"out_dict = MultiZarrToZarr(\n", | |
"json_list,\n", | |
"remote_protocol=\"s3\",\n", | |
"concat_dims=[\"ocean_time\"],\n", | |
" coo_map={\"ocean_time\": \"cf:ocean_time\"},\n", | |
" identical_dims=['lat_psi','lat_rho','lat_u','lat_v',\n", | |
" 'lon_psi','lon_rho','lon_u','lon_v'],\n", | |
" preprocess=preprocess,\n", | |
" out=out).translate()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "700b0ea3-7c75-4c8e-ab7b-a91e4fd10f9d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 7.98 ms, sys: 0 ns, total: 7.98 ms\n", | |
"Wall time: 89.7 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"out.flush()\n", | |
"\n", | |
"df.refs_to_dataframe(out_dict, \"combined.parq\")\n", | |
"\n", | |
"fs_ref = fsspec.implementations.reference.ReferenceFileSystem(\n", | |
" \"combined.parq\", remote_protocol=\"s3\", target_protocol=\"file\", lazy=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"id": "ce9ee213-8e0a-437d-ba0f-5b5f7fd2f9b8", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"ename": "KeyError", | |
"evalue": "'shape'", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", | |
"File \u001b[0;32m<timed exec>:1\u001b[0m\n", | |
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/xarray/backends/api.py:540\u001b[0m, in \u001b[0;36mopen_dataset\u001b[0;34m(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, inline_array, backend_kwargs, **kwargs)\u001b[0m\n\u001b[1;32m 528\u001b[0m decoders \u001b[38;5;241m=\u001b[39m _resolve_decoders_kwargs(\n\u001b[1;32m 529\u001b[0m decode_cf,\n\u001b[1;32m 530\u001b[0m open_backend_dataset_parameters\u001b[38;5;241m=\u001b[39mbackend\u001b[38;5;241m.\u001b[39mopen_dataset_parameters,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 536\u001b[0m decode_coords\u001b[38;5;241m=\u001b[39mdecode_coords,\n\u001b[1;32m 537\u001b[0m )\n\u001b[1;32m 539\u001b[0m overwrite_encoded_chunks \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moverwrite_encoded_chunks\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[0;32m--> 540\u001b[0m backend_ds \u001b[38;5;241m=\u001b[39m \u001b[43mbackend\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 541\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename_or_obj\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 542\u001b[0m \u001b[43m \u001b[49m\u001b[43mdrop_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdrop_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 543\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mdecoders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 544\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 545\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 546\u001b[0m ds \u001b[38;5;241m=\u001b[39m _dataset_from_backend_dataset(\n\u001b[1;32m 547\u001b[0m backend_ds,\n\u001b[1;32m 548\u001b[0m filename_or_obj,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 556\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 557\u001b[0m )\n\u001b[1;32m 558\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ds\n", | |
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/xarray/backends/zarr.py:897\u001b[0m, in \u001b[0;36mZarrBackendEntrypoint.open_dataset\u001b[0;34m(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, group, mode, synchronizer, consolidated, chunk_store, storage_options, stacklevel, zarr_version)\u001b[0m\n\u001b[1;32m 895\u001b[0m store_entrypoint \u001b[38;5;241m=\u001b[39m StoreBackendEntrypoint()\n\u001b[1;32m 896\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m close_on_error(store):\n\u001b[0;32m--> 897\u001b[0m ds \u001b[38;5;241m=\u001b[39m \u001b[43mstore_entrypoint\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 898\u001b[0m \u001b[43m \u001b[49m\u001b[43mstore\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 899\u001b[0m \u001b[43m \u001b[49m\u001b[43mmask_and_scale\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmask_and_scale\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 900\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecode_times\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecode_times\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 901\u001b[0m \u001b[43m \u001b[49m\u001b[43mconcat_characters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconcat_characters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 902\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecode_coords\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecode_coords\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 903\u001b[0m \u001b[43m \u001b[49m\u001b[43mdrop_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdrop_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 904\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cftime\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cftime\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 905\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecode_timedelta\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecode_timedelta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 906\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 907\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ds\n", | |
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/xarray/backends/store.py:28\u001b[0m, in \u001b[0;36mStoreBackendEntrypoint.open_dataset\u001b[0;34m(self, store, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta)\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mopen_dataset\u001b[39m(\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 18\u001b[0m store,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 26\u001b[0m decode_timedelta\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 27\u001b[0m ):\n\u001b[0;32m---> 28\u001b[0m \u001b[38;5;28mvars\u001b[39m, attrs \u001b[38;5;241m=\u001b[39m \u001b[43mstore\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 29\u001b[0m encoding \u001b[38;5;241m=\u001b[39m store\u001b[38;5;241m.\u001b[39mget_encoding()\n\u001b[1;32m 31\u001b[0m \u001b[38;5;28mvars\u001b[39m, attrs, coord_names \u001b[38;5;241m=\u001b[39m conventions\u001b[38;5;241m.\u001b[39mdecode_cf_variables(\n\u001b[1;32m 32\u001b[0m \u001b[38;5;28mvars\u001b[39m,\n\u001b[1;32m 33\u001b[0m attrs,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 40\u001b[0m decode_timedelta\u001b[38;5;241m=\u001b[39mdecode_timedelta,\n\u001b[1;32m 41\u001b[0m )\n", | |
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/xarray/backends/common.py:128\u001b[0m, in \u001b[0;36mAbstractDataStore.load\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 107\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 108\u001b[0m \u001b[38;5;124;03m This loads the variables and attributes simultaneously.\u001b[39;00m\n\u001b[1;32m 109\u001b[0m \u001b[38;5;124;03m A centralized loading function makes it easier to create\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[38;5;124;03m are requested, so care should be taken to make sure its fast.\u001b[39;00m\n\u001b[1;32m 126\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m 127\u001b[0m variables \u001b[38;5;241m=\u001b[39m FrozenDict(\n\u001b[0;32m--> 128\u001b[0m (_decode_variable_name(k), v) \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_variables\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mitems()\n\u001b[1;32m 129\u001b[0m )\n\u001b[1;32m 130\u001b[0m attributes \u001b[38;5;241m=\u001b[39m FrozenDict(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_attrs())\n\u001b[1;32m 131\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m variables, attributes\n", | |
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/xarray/backends/zarr.py:480\u001b[0m, in \u001b[0;36mZarrStore.get_variables\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 479\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_variables\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 480\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mFrozenDict\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 481\u001b[0m \u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen_store_variable\u001b[49m\u001b[43m(\u001b[49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mv\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mv\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mzarr_group\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marrays\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 482\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/xarray/core/utils.py:469\u001b[0m, in \u001b[0;36mFrozenDict\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 468\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mFrozenDict\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Frozen:\n\u001b[0;32m--> 469\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m Frozen(\u001b[38;5;28;43mdict\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m)\n", | |
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/xarray/backends/zarr.py:480\u001b[0m, in \u001b[0;36m<genexpr>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 479\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_variables\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 480\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m FrozenDict(\n\u001b[1;32m 481\u001b[0m (k, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mopen_store_variable(k, v)) \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mzarr_group\u001b[38;5;241m.\u001b[39marrays()\n\u001b[1;32m 482\u001b[0m )\n", | |
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/zarr/hierarchy.py:629\u001b[0m, in \u001b[0;36mGroup._array_iter\u001b[0;34m(self, keys_only, method, recurse)\u001b[0m\n\u001b[1;32m 627\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_array_iter\u001b[39m(\u001b[38;5;28mself\u001b[39m, keys_only, method, recurse):\n\u001b[1;32m 628\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_version \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m2\u001b[39m:\n\u001b[0;32m--> 629\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28msorted\u001b[39m(\u001b[43mlistdir\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_store\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_path\u001b[49m\u001b[43m)\u001b[49m):\n\u001b[1;32m 630\u001b[0m path \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_key_prefix \u001b[38;5;241m+\u001b[39m key\n\u001b[1;32m 631\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m contains_array(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_store, path):\n", | |
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/zarr/storage.py:212\u001b[0m, in \u001b[0;36mlistdir\u001b[0;34m(store, path)\u001b[0m\n\u001b[1;32m 209\u001b[0m path \u001b[38;5;241m=\u001b[39m normalize_storage_path(path)\n\u001b[1;32m 210\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(store, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlistdir\u001b[39m\u001b[38;5;124m'\u001b[39m):\n\u001b[1;32m 211\u001b[0m \u001b[38;5;66;03m# pass through\u001b[39;00m\n\u001b[0;32m--> 212\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mstore\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlistdir\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 214\u001b[0m \u001b[38;5;66;03m# slow version, iterate through all keys\u001b[39;00m\n\u001b[1;32m 215\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 216\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mStore \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstore\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m has no `listdir` method. From zarr 2.9 onwards \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 217\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmay want to inherit from `Store`.\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 218\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m,\n\u001b[1;32m 219\u001b[0m )\n", | |
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/zarr/_storage/store.py:144\u001b[0m, in \u001b[0;36mStore.listdir\u001b[0;34m(self, path)\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mlistdir\u001b[39m(\u001b[38;5;28mself\u001b[39m, path: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m List[\u001b[38;5;28mstr\u001b[39m]:\n\u001b[1;32m 143\u001b[0m path \u001b[38;5;241m=\u001b[39m normalize_storage_path(path)\n\u001b[0;32m--> 144\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_listdir_from_keys\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/zarr/_storage/store.py:413\u001b[0m, in \u001b[0;36m_listdir_from_keys\u001b[0;34m(store, path)\u001b[0m\n\u001b[1;32m 411\u001b[0m prefix \u001b[38;5;241m=\u001b[39m _path_to_prefix(path)\n\u001b[1;32m 412\u001b[0m children \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n\u001b[0;32m--> 413\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mstore\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkeys\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 414\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m key\u001b[38;5;241m.\u001b[39mstartswith(prefix) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(key) \u001b[38;5;241m>\u001b[39m \u001b[38;5;28mlen\u001b[39m(prefix):\n\u001b[1;32m 415\u001b[0m suffix \u001b[38;5;241m=\u001b[39m key[\u001b[38;5;28mlen\u001b[39m(prefix):]\n", | |
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/_collections_abc.py:866\u001b[0m, in \u001b[0;36mMappingView.__len__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 865\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__len__\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 866\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_mapping\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/zarr/storage.py:742\u001b[0m, in \u001b[0;36mKVStore.__len__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 741\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__len__\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 742\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_mutable_mapping\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/fsspec/mapping.py:177\u001b[0m, in \u001b[0;36mFSMap.__len__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 176\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__len__\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 177\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfind\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mroot\u001b[49m\u001b[43m)\u001b[49m)\n", | |
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/fsspec/implementations/reference.py:1019\u001b[0m, in \u001b[0;36mReferenceFileSystem.find\u001b[0;34m(self, path, maxdepth, withdirs, detail, **kwargs)\u001b[0m\n\u001b[1;32m 1017\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[38;5;28msorted\u001b[39m(k \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreferences \u001b[38;5;28;01mif\u001b[39;00m k\u001b[38;5;241m.\u001b[39mstartswith(path))\n\u001b[1;32m 1018\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1019\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msorted\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreferences\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1020\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m detail:\n\u001b[1;32m 1021\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdircache:\n", | |
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/fsspec/implementations/reference.py:453\u001b[0m, in \u001b[0;36mLazyReferenceMapper.__len__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 451\u001b[0m count \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 452\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 453\u001b[0m chunk_sizes \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_chunk_sizes\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfield\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 454\u001b[0m nchunks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnp\u001b[38;5;241m.\u001b[39mproduct(chunk_sizes)\n\u001b[1;32m 455\u001b[0m count \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m nchunks\n", | |
"File \u001b[0;32m/home/conda/users/bd2319cf4333f345a1cd12ec666552166ea6b7f9c8d73c7807050c6e88c2f494-20231011-145530-645692-242-pangeo/lib/python3.10/site-packages/fsspec/implementations/reference.py:278\u001b[0m, in \u001b[0;36mLazyReferenceMapper._get_chunk_sizes\u001b[0;34m(self, field)\u001b[0m\n\u001b[1;32m 275\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m field \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mchunk_sizes:\n\u001b[1;32m 276\u001b[0m zarray \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mzmetadata[\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfield\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/.zarray\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 277\u001b[0m size_ratio \u001b[38;5;241m=\u001b[39m [\n\u001b[0;32m--> 278\u001b[0m math\u001b[38;5;241m.\u001b[39mceil(s \u001b[38;5;241m/\u001b[39m c) \u001b[38;5;28;01mfor\u001b[39;00m s, c \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(\u001b[43mzarray\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mshape\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m, zarray[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mchunks\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 279\u001b[0m ]\n\u001b[1;32m 280\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mchunk_sizes[field] \u001b[38;5;241m=\u001b[39m size_ratio\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mchunk_sizes[field]\n", | |
"\u001b[0;31mKeyError\u001b[0m: 'shape'" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"ds = xr.open_dataset(\n", | |
" fs_ref.get_mapper(), engine=\"zarr\", drop_variables=['dstart'], chunks={},\n", | |
" backend_kwargs={\"consolidated\": False}\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "dd07ecbf-232f-45d1-965c-10e08dd7856a", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import xarray as xr\n", | |
"\n", | |
"files = fsspec.open(location_of_data)\n", | |
"\n", | |
"# Create LazyReferenceMapper to pass to MultiZarrToZarr\n", | |
"fs = fsspec.filesystem(\"file\")\n", | |
"\n", | |
"os.makedirs(\"combined.parq\")\n", | |
"out = LazyReferenceMapper.create(1000, \"combined.parq\", fs)\n", | |
"\n", | |
"# Create references from input files\n", | |
"single_ref_sets = [hdf.SingleHdf5ToZarr(_).translate() for _ in files]\n", | |
"\n", | |
"out_dict = MultiZarrToZarr(\n", | |
" single_ref_sets,\n", | |
" remote_protocol=\"memory\",\n", | |
" concat_dims=[\"time\"],\n", | |
" out=out).translate()\n", | |
"\n", | |
"out.flush()\n", | |
"\n", | |
"df.refs_to_dataframe(out_dict, \"combined.parq\")\n", | |
"\n", | |
"fs = fsspec.implementations.reference.ReferenceFileSystem(\n", | |
" \"combined.parq\", remote_protocol=\"s3\", target_protocol=\"file\", lazy=True)\n", | |
"ds = xr.open_dataset(\n", | |
" fs.get_mapper(), engine=\"zarr\",\n", | |
" backend_kwargs={\"consolidated\": False}\n", | |
")" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "users-users-pangeo", | |
"language": "python", | |
"name": "conda-env-users-users-pangeo-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.12" | |
}, | |
"widgets": { | |
"application/vnd.jupyter.widget-state+json": { | |
"state": {}, | |
"version_major": 2, | |
"version_minor": 0 | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment