@tinaok
Last active October 1, 2019 14:27
automatic chunk size
{
"cells": [
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
"import numpy \n",
"from distributed.utils import parse_bytes\n",
"import math\n",
"import dask\n",
"from numbers import Number, Integral\n",
"from dask.utils import factors\n",
"def auto_chunks(chunks, shape, limit, dtype, previous_chunks=None):\n",
" \"\"\" Determine automatic chunks\n",
" This takes in a chunks value that contains ``\"auto\"`` values in certain\n",
" dimensions and replaces those values with concrete dimension sizes that try\n",
" to get chunks to be of a certain size in bytes, provided by the ``limit=``\n",
" keyword. If multiple dimensions are marked as ``\"auto\"`` then they will\n",
" all respond to meet the desired byte limit, trying to respect the aspect\n",
" ratio of their dimensions in ``previous_chunks=``, if given.\n",
" Parameters\n",
" ----------\n",
" chunks: Tuple\n",
" A tuple of either dimensions or tuples of explicit chunk dimensions\n",
" Some entries should be \"auto\"\n",
" shape: Tuple[int]\n",
" limit: int, str\n",
" The maximum allowable size of a chunk in bytes\n",
" previous_chunks: Tuple[Tuple[int]]\n",
" See also\n",
" --------\n",
" normalize_chunks: for full docstring and parameters\n",
" \"\"\"\n",
" if previous_chunks is not None:\n",
" previous_chunks = tuple(\n",
" c if isinstance(c, tuple) else (c,) for c in previous_chunks\n",
" )\n",
" chunks = list(chunks)\n",
"\n",
" autos = {i for i, c in enumerate(chunks) if c == \"auto\"}\n",
" if not autos:\n",
" return tuple(chunks)\n",
"\n",
" if limit is None:\n",
" limit = config.get(\"array.chunk-size\")\n",
" if isinstance(limit, str):\n",
" limit = parse_bytes(limit)\n",
"\n",
" if dtype is None:\n",
" raise TypeError(\"DType must be known for auto-chunking\")\n",
"\n",
" if dtype.hasobject:\n",
" raise NotImplementedError(\n",
" \"Can not use auto rechunking with object dtype. \"\n",
" \"We are unable to estimate the size in bytes of object data\"\n",
" )\n",
"\n",
" for x in tuple(chunks) + tuple(shape):\n",
" if (\n",
" isinstance(x, Number)\n",
" and np.isnan(x)\n",
" or isinstance(x, tuple)\n",
" and np.isnan(x).any()\n",
" ):\n",
" raise ValueError(\n",
" \"Can not perform automatic rechunking with unknown \"\n",
" \"(nan) chunk sizes.%s\" % unknown_chunk_message\n",
" )\n",
"\n",
" limit = max(1, limit)\n",
" print('tina',limit)\n",
" largest_block = np.prod(\n",
" [cs if isinstance(cs, Number) else max(cs) for cs in chunks if cs != \"auto\"]\n",
" )\n",
" print('tina largest block ',largest_block)\n",
" if previous_chunks:\n",
" # Base ideal ratio on the median chunk size of the previous chunks\n",
" result = {a: np.median(previous_chunks[a]) for a in autos}\n",
"\n",
" ideal_shape = []\n",
" for i, s in enumerate(shape):\n",
" chunk_frequencies = frequencies(previous_chunks[i])\n",
" mode, count = max(chunk_frequencies.items(), key=lambda kv: kv[1])\n",
" if mode > 1 and count >= len(previous_chunks[i]) / 2:\n",
" ideal_shape.append(mode)\n",
" else:\n",
" ideal_shape.append(s)\n",
"\n",
" # How much larger or smaller the ideal chunk size is relative to what we have now\n",
" multiplier = (\n",
" limit / dtype.itemsize / largest_block / np.prod(list(result.values()))\n",
" )\n",
" last_multiplier = 0\n",
" last_autos = set()\n",
"\n",
" while (\n",
" multiplier != last_multiplier or autos != last_autos\n",
" ): # while things change\n",
" last_multiplier = multiplier # record previous values\n",
" last_autos = set(autos) # record previous values\n",
"\n",
" # Expand or contract each of the dimensions appropriately\n",
" for a in sorted(autos):\n",
" proposed = result[a] * multiplier ** (1 / len(autos))\n",
" if proposed > shape[a]: # we've hit the shape boundary\n",
" autos.remove(a)\n",
" largest_block *= shape[a]\n",
" chunks[a] = shape[a]\n",
" del result[a]\n",
" else:\n",
" result[a] = round_to(proposed, ideal_shape[a])\n",
"\n",
" # recompute how much multiplier we have left, repeat\n",
" multiplier = (\n",
" limit / dtype.itemsize / largest_block / np.prod(list(result.values()))\n",
" )\n",
"\n",
"\n",
" for k, v in result.items():\n",
" chunks[k] = v\n",
" return tuple(chunks)\n",
"\n",
" else:\n",
" size = (limit / dtype.itemsize / largest_block) ** (1 / len(autos))\n",
" small = [i for i in autos if shape[i] < size]\n",
" if small:\n",
" for i in small:\n",
" chunks[i] = (shape[i],)\n",
" return auto_chunks(chunks, shape, limit, dtype)\n",
"\n",
" for i in autos:\n",
" chunks[i] = round_to(size, shape[i])\n",
"\n",
" return tuple(chunks)\n",
" \n",
"def round_to(c, s):\n",
" \"\"\" Return a chunk dimension that is close to an even multiple or factor\n",
" We want values for c that are nicely aligned with s.\n",
" If c is smaller than s then we want the largest factor of s that is less than the\n",
" desired chunk size, but not less than half, which is too much. If no such\n",
" factor exists then we just go with the original chunk size and accept an\n",
" uneven chunk at the end.\n",
" If c is larger than s then we want the largest multiple of s that is still\n",
" smaller than c.\n",
" \"\"\"\n",
" if c <= s:\n",
" try:\n",
" return max(f for f in factors(s) if c / 2 <= f <= c)\n",
" except ValueError: # no matching factors within factor of two\n",
" return max(1, int(c))\n",
" else:\n",
" return c // s * s\n"
]
},
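{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of `round_to` (a sketch; the dimension lengths 384, 320 and 20834 come from the example below). When the target chunk length `c` is at most the dimension `s`, `round_to` picks the largest factor of `s` within a factor of two of `c`, and falls back to `int(c)` when no such factor exists."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(round_to(317.5, 384))    # largest factor of 384 in [158.75, 317.5] -> 192\n",
"print(round_to(317.5, 320))    # largest factor of 320 in [158.75, 317.5] -> 160\n",
"print(round_to(317.5, 20834))  # 20834 = 2 * 11 * 947 has no factor in range -> int(317.5) == 317\n",
"print(round_to(1000, 320))     # c > s: largest multiple of 320 below 1000 -> 960"
]
},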
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tina 256000000\n",
"tina largest block 1.0\n"
]
},
{
"data": {
"text/plain": [
"(317, 192, 160)"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"timesteps=20834\n",
"lat=320\n",
"lon=384\n",
"shape=(timesteps,lon,lat)\n",
"chunks=(\"auto\",\"auto\",\"auto\")\n",
"limit='256MB'\n",
"random_data =da.random.RandomState(0).standard_normal(shape, chunks='auto' )\n",
"dtype = random_data.dtype\n",
"auto_chunks(chunks, shape, limit, dtype=dtype, previous_chunks=None)"
]
},
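{
"cell_type": "markdown",
"metadata": {},
"source": [
"With `previous_chunks=None` the byte budget is split evenly across the three `\"auto\"` dimensions: `size = (limit / dtype.itemsize / largest_block) ** (1/3) = (256e6 / 8 / 1.0) ** (1/3)`, about 317.5 (the debug prints above show `limit` and `largest_block`). Each dimension is then rounded with `round_to`: 20834 has no factor near 317.5 so the time axis falls back to 317, while 384 and 320 round to their factors 192 and 160. The next cell checks the resulting chunk size in bytes."
]
},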
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"77905920"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"317*192*160*8"
]
},
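{
"cell_type": "markdown",
"metadata": {},
"source": [
"Two follow-ups, written as sketches rather than verified runs. First, passing `previous_chunks` exercises the multiplier loop, which tries to respect the existing aspect ratio: starting from thin one-step time slices, the auto-chunker should keep the lon/lat axes whole and grow only the time axis. Second, dask exposes this logic through `normalize_chunks` in `dask.array.core` (an assumption about the installed dask version), which should produce equivalent chunking for the same inputs."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aspect-ratio example: previous chunks were one time step per chunk\n",
"auto_chunks((\"auto\", \"auto\", \"auto\"), shape, limit, dtype=dtype,\n",
"            previous_chunks=((1,) * timesteps, (lon,), (lat,)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Assumed API: normalize_chunks(chunks, shape=, limit=, dtype=) wraps auto_chunks\n",
"from dask.array.core import normalize_chunks\n",
"\n",
"normalize_chunks((\"auto\", \"auto\", \"auto\"), shape=shape, limit='256MB', dtype=dtype)"
]
}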
],
"metadata": {
"kernelspec": {
"display_name": "pangeobench",
"language": "python",
"name": "pangeobench"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}