Created
June 16, 2022 10:56
-
-
Save rsignell-usgs/f42127a23d91fa574299e315a005aaea to your computer and use it in GitHub Desktop.
fill_value.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "5b86af71-052b-4081-844b-779db459068d", | |
"metadata": {}, | |
"source": [ | |
"# Fill value issue with xarray/zarr/kerchunk\n", | |
"The original NetCDF file here (ROMS model output) has `float32` vars with `_FillValue` set as 1e37. \n", | |
"* reading NetCDF with Xarray (and `decode_cf=True`) correctly sets these values to `NaN`\n", | |
"* reading the corresponding Zarr or kerchunk JSON with Xarray does not" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "3b6ba617-9772-4ecf-aa7a-2be2a4bcfa1d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import fsspec\n", | |
"import xarray as xr" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "daaa298b-2b15-46a2-b8ef-d4828c5ae291", | |
"metadata": {}, | |
"source": [ | |
"#### Read NetCDF file directly with Xarray\n", | |
"`_FillValue` is correctly converted to `NaN`. " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "d6b937c9-ffb0-42fe-94fc-e591cd297255", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"fs = fsspec.filesystem('s3', anon=True, client_kwargs={'endpoint_url': 'https://mghp.osn.xsede.org'})\n", | |
"url = 's3://rsignellbucket1/COAWST/coawst_us_20220101_01.nc'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "a7a9ea7a-89bc-4560-baf5-292cd2cbe368", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ds = xr.open_dataset(fs.open(url), decode_cf=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "30951466-d166-4306-bcda-d015cf583bf7", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array(nan)" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ds.temp[0,0,0,0].values" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "ff2ea828-1ce7-4781-b73e-fb8d4d4dc46b", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ds = xr.open_dataset(fs.open(url), decode_cf=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "75160414-374d-4ef8-8d99-7f2fdac45187", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array(1.e+37, dtype=float32)" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ds.temp[0,0,0,0].values" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "bcedf939-0b09-4f23-ae79-ffa72d84d03f", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1e+37" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ds.temp._FillValue" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "529cd7e9-5a1f-4e6b-a515-fe238c61c6ea", | |
"metadata": {}, | |
"source": [ | |
"#### Write to Zarr. Read resulting Zarr with xarray. \n", | |
"User sees `NaN` in xarray, but `fill_value` (the attribute used to store the fill value in Zarr) is `9.999999933815813e+36` instead of `1.e+37` " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "d6c3a17e-3fc7-4520-8080-bb76a9cb7416", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 736 ms, sys: 136 ms, total: 871 ms\n", | |
"Wall time: 9.74 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"<xarray.backends.zarr.ZarrStore at 0x7fc955646200>" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"ds[['temp','salt']].isel(ocean_time=slice(0,2)).to_zarr('foo.zarr', compute=True, mode='w')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "284ccc02-f603-4210-809b-5c3e62d948eb", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ds2 = xr.open_dataset('foo.zarr', engine='zarr', decode_cf=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "f037fe4b-6aac-4e52-9080-e425329db099", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1e+37" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ds2.temp._FillValue" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "994a6553-725e-43ad-baa5-7d3fee332d95", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array(1.e+37, dtype=float32)" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ds2.temp[0,0,0,0].values" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "ae422b44-cddb-4aeb-9005-6c5e864c02e4", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array(nan)" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ds2 = xr.open_dataset('foo.zarr', engine='zarr', decode_cf=True)\n", | |
"ds2.temp[0,0,0,0].values" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "768e9d44-2756-42ec-9c12-c51cfbd59998", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{\n", | |
" \"chunks\": [\n", | |
" 1,\n", | |
" 4,\n", | |
" 84,\n", | |
" 448\n", | |
" ],\n", | |
" \"compressor\": {\n", | |
" \"blocksize\": 0,\n", | |
" \"clevel\": 5,\n", | |
" \"cname\": \"lz4\",\n", | |
" \"id\": \"blosc\",\n", | |
" \"shuffle\": 1\n", | |
" },\n", | |
" \"dtype\": \"<f4\",\n", | |
" \"fill_value\": 9.999999933815813e+36,\n", | |
" \"filters\": null,\n", | |
" \"order\": \"C\",\n", | |
" \"shape\": [\n", | |
" 2,\n", | |
" 16,\n", | |
" 336,\n", | |
" 896\n", | |
" ],\n", | |
" \"zarr_format\": 2\n", | |
"}" | |
] | |
} | |
], | |
"source": [ | |
"! cat ./foo.zarr/temp/.zarray" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "d1856c8d-5f22-4386-b616-678b1ced5ee1", | |
"metadata": {}, | |
"source": [ | |
"#### Read Kerchunk JSON representation of the above NetCDF file\n", | |
"Here the user doesn't get `NaN` values in the masked regions, but a value close too but " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "22d6fcf5-27b5-47bc-b3cc-44dae863314a", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"json_url = 's3://rsignellbucket1/COAWST/jsons/coawst_us_20220101_01.nc.json'" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "694458e0-50cf-4124-b896-0abd1e1fced2", | |
"metadata": {}, | |
"source": [ | |
"Try with `decode_cf=True`:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "c59e3c10-ebb9-4915-9f8e-8fcb43ee9475", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"s_opts = dict(skip_instance_cache=True, anon=True, client_kwargs={'endpoint_url': 'https://mghp.osn.xsede.org'}) #json \n", | |
"r_opts = dict(anon=True, client_kwargs={'endpoint_url': 'https://mghp.osn.xsede.org'}) #data\n", | |
"\n", | |
"fs = fsspec.filesystem(\"reference\", fo=json_url, ref_storage_args=s_opts,\n", | |
" remote_protocol='s3', remote_options=r_opts)\n", | |
"m = fs.get_mapper(\"\")\n", | |
"\n", | |
"ds = xr.open_dataset(m, engine=\"zarr\", chunks={}, \n", | |
" backend_kwargs=dict(consolidated=False), decode_cf=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "d2e3c687-7288-4266-bf1f-073fdeaf4df9", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array(9.99999993e+36)" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ds.temp[0,0,0,0].values" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "0eacc5d0-03f2-4c8f-bb70-a6d3890c6ab2", | |
"metadata": {}, | |
"source": [ | |
"print with full precision:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"id": "8c68bf9e-19fb-4298-9814-413bd3fe89a4", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'9999999933815812510711506376257961984'" | |
] | |
}, | |
"execution_count": 17, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"format(ds.temp[0,0,0,0].values, '.60g') " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "80904a40-592b-43db-b01d-06d8a214ffa2", | |
"metadata": {}, | |
"source": [ | |
"So these came in as (non-NaN) values because they are different than \n", | |
"`fill_value: 9.999999933815813e+36` ?" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "a4cc1585-03f8-405c-ad7b-34e3f3b571ab", | |
"metadata": {}, | |
"source": [ | |
"Try with `decode_cf=False`:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"id": "520036ab-5fc0-43b7-b0c5-4efbf5fdb3f9", | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [], | |
"source": [ | |
"ds = xr.open_dataset(m, engine=\"zarr\", chunks={}, \n", | |
" backend_kwargs=dict(consolidated=False), decode_cf=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"id": "61c10df9-b632-4680-9b48-13cb78e429b8", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array(1.e+37, dtype=float32)" | |
] | |
}, | |
"execution_count": 19, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ds.temp[0,0,0,0].values" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "a359958a-746e-4433-b6bb-77589ee067f7", | |
"metadata": {}, | |
"source": [ | |
"The kerchunk-generated JSON of course reflects what the Zarr file has:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"id": "0c47572a-c4fa-4c2d-8435-4f01a79101ac", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"fs.download('temp/.zattrs', 'foo')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"id": "149f0fea-c368-4bb7-9174-cb75f3c77afa", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{\n", | |
" \"_ARRAY_DIMENSIONS\": [\n", | |
" \"ocean_time\",\n", | |
" \"s_rho\",\n", | |
" \"eta_rho\",\n", | |
" \"xi_rho\"\n", | |
" ],\n", | |
" \"_FillValue\": 9.999999933815813e+36,\n", | |
" \"coordinates\": \"lon_rho lat_rho s_rho ocean_time\",\n", | |
" \"field\": \"temperature, scalar, series\",\n", | |
" \"grid\": \"grid\",\n", | |
" \"location\": \"face\",\n", | |
" \"long_name\": \"potential temperature\",\n", | |
" \"time\": \"ocean_time\",\n", | |
" \"units\": \"Celsius\"\n", | |
"}\n" | |
] | |
} | |
], | |
"source": [ | |
"!more foo" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "users-pangeo", | |
"language": "python", | |
"name": "conda-env-users-pangeo-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.13" | |
}, | |
"widgets": { | |
"application/vnd.jupyter.widget-state+json": { | |
"state": {}, | |
"version_major": 2, | |
"version_minor": 0 | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment