Skip to content

Instantly share code, notes, and snippets.

@aaronspring
Last active January 5, 2020 16:47
Show Gist options
  • Save aaronspring/149955a1378bbfdf1302f73881625e6b to your computer and use it in GitHub Desktop.
Save aaronspring/149955a1378bbfdf1302f73881625e6b to your computer and use it in GitHub Desktop.
using zarr to speed up climpred
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Testing zarr performance"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"base = '/work/mh0727/m300524/miklip_post/baseline1'"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"v = 'tos'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"ds_p = f'{base}/ds_{v}.nc'"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import xarray as xr"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"ds = xr.open_dataset(ds_p, chunks={'i':256//8})"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"persist = False\n",
"if persist:\n",
" ds = ds.persist()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table>\n",
"<tr>\n",
"<td>\n",
"<table>\n",
" <thead>\n",
" <tr><td> </td><th> Array </th><th> Chunk </th></tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr><th> Bytes </th><td> 2.57 GB </td> <td> 321.02 MB </td></tr>\n",
" <tr><th> Shape </th><td> (10, 57, 10, 220, 256) </td> <td> (10, 57, 10, 220, 32) </td></tr>\n",
" <tr><th> Count </th><td> 9 Tasks </td><td> 8 Chunks </td></tr>\n",
" <tr><th> Type </th><td> float64 </td><td> numpy.ndarray </td></tr>\n",
" </tbody>\n",
"</table>\n",
"</td>\n",
"<td>\n",
"<svg width=\"413\" height=\"172\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n",
"\n",
" <!-- Horizontal lines -->\n",
" <line x1=\"0\" y1=\"0\" x2=\"42\" y2=\"0\" style=\"stroke-width:2\" />\n",
" <line x1=\"0\" y1=\"32\" x2=\"42\" y2=\"32\" style=\"stroke-width:2\" />\n",
"\n",
" <!-- Vertical lines -->\n",
" <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"32\" style=\"stroke-width:2\" />\n",
" <line x1=\"42\" y1=\"0\" x2=\"42\" y2=\"32\" style=\"stroke-width:2\" />\n",
"\n",
" <!-- Colored Rectangle -->\n",
" <polygon points=\"0.000000,0.000000 42.450911,0.000000 42.450911,32.988909 0.000000,32.988909\" style=\"fill:#ECB172A0;stroke-width:0\"/>\n",
"\n",
" <!-- Text -->\n",
" <text x=\"21.225456\" y=\"52.988909\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >57</text>\n",
" <text x=\"62.450911\" y=\"16.494455\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(0,62.450911,16.494455)\">10</text>\n",
"\n",
"\n",
" <!-- Horizontal lines -->\n",
" <line x1=\"112\" y1=\"0\" x2=\"131\" y2=\"19\" style=\"stroke-width:2\" />\n",
" <line x1=\"112\" y1=\"103\" x2=\"131\" y2=\"122\" style=\"stroke-width:2\" />\n",
"\n",
" <!-- Vertical lines -->\n",
" <line x1=\"112\" y1=\"0\" x2=\"112\" y2=\"103\" style=\"stroke-width:2\" />\n",
" <line x1=\"131\" y1=\"19\" x2=\"131\" y2=\"122\" style=\"stroke-width:2\" />\n",
"\n",
" <!-- Colored Rectangle -->\n",
" <polygon points=\"112.000000,0.000000 131.405241,19.405241 131.405241,122.530241 112.000000,103.125000\" style=\"fill:#ECB172A0;stroke-width:0\"/>\n",
"\n",
" <!-- Horizontal lines -->\n",
" <line x1=\"112\" y1=\"0\" x2=\"232\" y2=\"0\" style=\"stroke-width:2\" />\n",
" <line x1=\"131\" y1=\"19\" x2=\"251\" y2=\"19\" style=\"stroke-width:2\" />\n",
"\n",
" <!-- Vertical lines -->\n",
" <line x1=\"112\" y1=\"0\" x2=\"131\" y2=\"19\" style=\"stroke-width:2\" />\n",
" <line x1=\"127\" y1=\"0\" x2=\"146\" y2=\"19\" />\n",
" <line x1=\"142\" y1=\"0\" x2=\"161\" y2=\"19\" />\n",
" <line x1=\"157\" y1=\"0\" x2=\"176\" y2=\"19\" />\n",
" <line x1=\"172\" y1=\"0\" x2=\"191\" y2=\"19\" />\n",
" <line x1=\"187\" y1=\"0\" x2=\"206\" y2=\"19\" />\n",
" <line x1=\"202\" y1=\"0\" x2=\"221\" y2=\"19\" />\n",
" <line x1=\"217\" y1=\"0\" x2=\"236\" y2=\"19\" />\n",
" <line x1=\"232\" y1=\"0\" x2=\"251\" y2=\"19\" style=\"stroke-width:2\" />\n",
"\n",
" <!-- Colored Rectangle -->\n",
" <polygon points=\"112.000000,0.000000 232.000000,0.000000 251.405241,19.405241 131.405241,19.405241\" style=\"fill:#ECB172A0;stroke-width:0\"/>\n",
"\n",
" <!-- Horizontal lines -->\n",
" <line x1=\"131\" y1=\"19\" x2=\"251\" y2=\"19\" style=\"stroke-width:2\" />\n",
" <line x1=\"131\" y1=\"122\" x2=\"251\" y2=\"122\" style=\"stroke-width:2\" />\n",
"\n",
" <!-- Vertical lines -->\n",
" <line x1=\"131\" y1=\"19\" x2=\"131\" y2=\"122\" style=\"stroke-width:2\" />\n",
" <line x1=\"146\" y1=\"19\" x2=\"146\" y2=\"122\" />\n",
" <line x1=\"161\" y1=\"19\" x2=\"161\" y2=\"122\" />\n",
" <line x1=\"176\" y1=\"19\" x2=\"176\" y2=\"122\" />\n",
" <line x1=\"191\" y1=\"19\" x2=\"191\" y2=\"122\" />\n",
" <line x1=\"206\" y1=\"19\" x2=\"206\" y2=\"122\" />\n",
" <line x1=\"221\" y1=\"19\" x2=\"221\" y2=\"122\" />\n",
" <line x1=\"236\" y1=\"19\" x2=\"236\" y2=\"122\" />\n",
" <line x1=\"251\" y1=\"19\" x2=\"251\" y2=\"122\" style=\"stroke-width:2\" />\n",
"\n",
" <!-- Colored Rectangle -->\n",
" <polygon points=\"131.405241,19.405241 251.405241,19.405241 251.405241,122.530241 131.405241,122.530241\" style=\"fill:#ECB172A0;stroke-width:0\"/>\n",
"\n",
" <!-- Text -->\n",
" <text x=\"191.405241\" y=\"142.530241\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >256</text>\n",
" <text x=\"271.405241\" y=\"70.967741\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,271.405241,70.967741)\">220</text>\n",
" <text x=\"111.702620\" y=\"132.827620\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(45,111.702620,132.827620)\">10</text>\n",
"</svg>\n",
"</td>\n",
"</tr>\n",
"</table>"
],
"text/plain": [
"dask.array<open_dataset-ccf733da648f86f53e6763671c2ef03dtos, shape=(10, 57, 10, 220, 256), dtype=float64, chunksize=(10, 57, 10, 220, 32), chunktype=numpy.ndarray>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ds[v].data"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"ds_p_z = f'{base}/zarr_ex'"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 4.6 s, sys: 7.15 s, total: 11.7 s\n",
"Wall time: 8.2 s\n"
]
},
{
"data": {
"text/plain": [
"<xarray.backends.zarr.ZarrStore at 0x2b1c5e96dd58>"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%time ds.to_zarr(ds_p_z, consolidated=True, mode='w')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from dask.utils import format_bytes\n",
"\n",
"def folder_size(path='.'):\n",
" total = 0\n",
" for entry in os.scandir(path):\n",
" if entry.is_file():\n",
" total += entry.stat().st_size\n",
" elif entry.is_dir():\n",
" total += folder_size(entry.path)\n",
" return total"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('size in zarr format', '1.16 GB')"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'size in zarr format',format_bytes(folder_size(path=ds_p_z))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('size in netcdf format', '2.57 GB')"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'size in netcdf format',format_bytes(ds.nbytes)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# open zarr now\n",
"za = xr.open_zarr(ds_p_z)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"if persist:\n",
" za = za.persist()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table>\n",
"<tr>\n",
"<td>\n",
"<table>\n",
" <thead>\n",
" <tr><td> </td><th> Array </th><th> Chunk </th></tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr><th> Bytes </th><td> 2.57 GB </td> <td> 321.02 MB </td></tr>\n",
" <tr><th> Shape </th><td> (10, 57, 10, 220, 256) </td> <td> (10, 57, 10, 220, 32) </td></tr>\n",
" <tr><th> Count </th><td> 9 Tasks </td><td> 8 Chunks </td></tr>\n",
" <tr><th> Type </th><td> float64 </td><td> numpy.ndarray </td></tr>\n",
" </tbody>\n",
"</table>\n",
"</td>\n",
"<td>\n",
"<svg width=\"413\" height=\"172\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n",
"\n",
" <!-- Horizontal lines -->\n",
" <line x1=\"0\" y1=\"0\" x2=\"42\" y2=\"0\" style=\"stroke-width:2\" />\n",
" <line x1=\"0\" y1=\"32\" x2=\"42\" y2=\"32\" style=\"stroke-width:2\" />\n",
"\n",
" <!-- Vertical lines -->\n",
" <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"32\" style=\"stroke-width:2\" />\n",
" <line x1=\"42\" y1=\"0\" x2=\"42\" y2=\"32\" style=\"stroke-width:2\" />\n",
"\n",
" <!-- Colored Rectangle -->\n",
" <polygon points=\"0.000000,0.000000 42.450911,0.000000 42.450911,32.988909 0.000000,32.988909\" style=\"fill:#ECB172A0;stroke-width:0\"/>\n",
"\n",
" <!-- Text -->\n",
" <text x=\"21.225456\" y=\"52.988909\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >57</text>\n",
" <text x=\"62.450911\" y=\"16.494455\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(0,62.450911,16.494455)\">10</text>\n",
"\n",
"\n",
" <!-- Horizontal lines -->\n",
" <line x1=\"112\" y1=\"0\" x2=\"131\" y2=\"19\" style=\"stroke-width:2\" />\n",
" <line x1=\"112\" y1=\"103\" x2=\"131\" y2=\"122\" style=\"stroke-width:2\" />\n",
"\n",
" <!-- Vertical lines -->\n",
" <line x1=\"112\" y1=\"0\" x2=\"112\" y2=\"103\" style=\"stroke-width:2\" />\n",
" <line x1=\"131\" y1=\"19\" x2=\"131\" y2=\"122\" style=\"stroke-width:2\" />\n",
"\n",
" <!-- Colored Rectangle -->\n",
" <polygon points=\"112.000000,0.000000 131.405241,19.405241 131.405241,122.530241 112.000000,103.125000\" style=\"fill:#ECB172A0;stroke-width:0\"/>\n",
"\n",
" <!-- Horizontal lines -->\n",
" <line x1=\"112\" y1=\"0\" x2=\"232\" y2=\"0\" style=\"stroke-width:2\" />\n",
" <line x1=\"131\" y1=\"19\" x2=\"251\" y2=\"19\" style=\"stroke-width:2\" />\n",
"\n",
" <!-- Vertical lines -->\n",
" <line x1=\"112\" y1=\"0\" x2=\"131\" y2=\"19\" style=\"stroke-width:2\" />\n",
" <line x1=\"127\" y1=\"0\" x2=\"146\" y2=\"19\" />\n",
" <line x1=\"142\" y1=\"0\" x2=\"161\" y2=\"19\" />\n",
" <line x1=\"157\" y1=\"0\" x2=\"176\" y2=\"19\" />\n",
" <line x1=\"172\" y1=\"0\" x2=\"191\" y2=\"19\" />\n",
" <line x1=\"187\" y1=\"0\" x2=\"206\" y2=\"19\" />\n",
" <line x1=\"202\" y1=\"0\" x2=\"221\" y2=\"19\" />\n",
" <line x1=\"217\" y1=\"0\" x2=\"236\" y2=\"19\" />\n",
" <line x1=\"232\" y1=\"0\" x2=\"251\" y2=\"19\" style=\"stroke-width:2\" />\n",
"\n",
" <!-- Colored Rectangle -->\n",
" <polygon points=\"112.000000,0.000000 232.000000,0.000000 251.405241,19.405241 131.405241,19.405241\" style=\"fill:#ECB172A0;stroke-width:0\"/>\n",
"\n",
" <!-- Horizontal lines -->\n",
" <line x1=\"131\" y1=\"19\" x2=\"251\" y2=\"19\" style=\"stroke-width:2\" />\n",
" <line x1=\"131\" y1=\"122\" x2=\"251\" y2=\"122\" style=\"stroke-width:2\" />\n",
"\n",
" <!-- Vertical lines -->\n",
" <line x1=\"131\" y1=\"19\" x2=\"131\" y2=\"122\" style=\"stroke-width:2\" />\n",
" <line x1=\"146\" y1=\"19\" x2=\"146\" y2=\"122\" />\n",
" <line x1=\"161\" y1=\"19\" x2=\"161\" y2=\"122\" />\n",
" <line x1=\"176\" y1=\"19\" x2=\"176\" y2=\"122\" />\n",
" <line x1=\"191\" y1=\"19\" x2=\"191\" y2=\"122\" />\n",
" <line x1=\"206\" y1=\"19\" x2=\"206\" y2=\"122\" />\n",
" <line x1=\"221\" y1=\"19\" x2=\"221\" y2=\"122\" />\n",
" <line x1=\"236\" y1=\"19\" x2=\"236\" y2=\"122\" />\n",
" <line x1=\"251\" y1=\"19\" x2=\"251\" y2=\"122\" style=\"stroke-width:2\" />\n",
"\n",
" <!-- Colored Rectangle -->\n",
" <polygon points=\"131.405241,19.405241 251.405241,19.405241 251.405241,122.530241 131.405241,122.530241\" style=\"fill:#ECB172A0;stroke-width:0\"/>\n",
"\n",
" <!-- Text -->\n",
" <text x=\"191.405241\" y=\"142.530241\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >256</text>\n",
" <text x=\"271.405241\" y=\"70.967741\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,271.405241,70.967741)\">220</text>\n",
" <text x=\"111.702620\" y=\"132.827620\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(45,111.702620,132.827620)\">10</text>\n",
"</svg>\n",
"</td>\n",
"</tr>\n",
"</table>"
],
"text/plain": [
"dask.array<zarr, shape=(10, 57, 10, 220, 256), dtype=float64, chunksize=(10, 57, 10, 220, 32), chunktype=numpy.ndarray>"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"za[v].data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# benchmarks"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## std"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/work/mh0727/m300524/miniconda3/envs/pymistral/lib/python3.7/site-packages/dask/array/numpy_compat.py:40: RuntimeWarning: invalid value encountered in true_divide\n",
" x = np.divide(x1, x2, out)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 6.2 s, sys: 9.82 s, total: 16 s\n",
"Wall time: 6.69 s\n"
]
}
],
"source": [
"# chunked\n",
"%time r_ds = ds.std('member').load()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/work/mh0727/m300524/miniconda3/envs/pymistral/lib/python3.7/site-packages/dask/array/numpy_compat.py:40: RuntimeWarning: invalid value encountered in true_divide\n",
" x = np.divide(x1, x2, out)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 8.6 s, sys: 9.43 s, total: 18 s\n",
"Wall time: 3.1 s\n"
]
}
],
"source": [
"# zarr\n",
"%time r_za = za.std('member').load()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 861 ms, sys: 84 ms, total: 945 ms\n",
"Wall time: 976 ms\n"
]
}
],
"source": [
"# loaded\n",
"dsl = ds.load()\n",
"# in-memory std is much faster than lazy for a simle operation\n",
"%time r_dsl = dsl.std('member')"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"xr.testing.assert_equal(r_za, r_ds)\n",
"xr.testing.assert_equal(r_dsl, r_za)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## climpred"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"from climpred.prediction import compute_perfect_model\n",
"kw = {'metric': 'rmse', 'comparison': 'm2c'}"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 12.1 s, sys: 12.7 s, total: 24.8 s\n",
"Wall time: 7.74 s\n"
]
}
],
"source": [
"# chunked\n",
"%time s_ds = compute_perfect_model(ds, ds.rename({'init':'time'}), **kw).load()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 13.8 s, sys: 14.8 s, total: 28.6 s\n",
"Wall time: 4.13 s\n"
]
}
],
"source": [
"# zarr\n",
"%time s_za = compute_perfect_model(za, za.rename({'init':'time'}), **kw).load()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 5.92 s, sys: 4.49 s, total: 10.4 s\n",
"Wall time: 10.4 s\n"
]
}
],
"source": [
"# loaded\n",
"%time s_dsl = compute_perfect_model(dsl, dsl.rename({'init':'time'}), **kw)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"xr.testing.assert_equal(s_za, s_ds)\n",
"xr.testing.assert_equal(s_za, s_dsl)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:pymistral]",
"language": "python",
"name": "conda-env-pymistral-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment