@tinaok
Last active October 1, 2019 14:27
automatic chunk size
{
"cells": [
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
"import numpy \n",
"from distributed.utils import parse_bytes\n",
"import math\n",
"import dask\n",
"from numbers import Number, Integral\n",
"from dask.utils import factors\n",
"def auto_chunks(chunks, shape, limit, dtype, previous_chunks=None):\n",
" \"\"\" Determine automatic chunks\n",
" This takes in a chunks value that contains ``\"auto\"`` values in certain\n",
" dimensions and replaces those values with concrete dimension sizes that try\n",
" to get chunks to be of a certain size in bytes, provided by the ``limit=``\n",
" keyword. If multiple dimensions are marked as ``\"auto\"`` then they will\n",
" all respond to meet the desired byte limit, trying to respect the aspect\n",
" ratio of their dimensions in ``previous_chunks=``, if given.\n",
" Parameters\n",
" ----------\n",
" chunks: Tuple\n",
" A tuple of either dimensions or tuples of explicit chunk dimensions\n",
" Some entries should be \"auto\"\n",
" shape: Tuple[int]\n",
" limit: int, str\n",
" The maximum allowable size of a chunk in bytes\n",
" previous_chunks: Tuple[Tuple[int]]\n",
" See also\n",
" --------\n",
" normalize_chunks: for full docstring and parameters\n",
" \"\"\"\n",
" if previous_chunks is not None:\n",
" previous_chunks = tuple(\n",
" c if isinstance(c, tuple) else (c,) for c in previous_chunks\n",
" )\n",
" chunks = list(chunks)\n",
"\n",
" autos = {i for i, c in enumerate(chunks) if c == \"auto\"}\n",
" if not autos:\n",
" return tuple(chunks)\n",
"\n",
" if limit is None:\n",
" limit = config.get(\"array.chunk-size\")\n",
" if isinstance(limit, str):\n",
" limit = parse_bytes(limit)\n",
"\n",
" if dtype is None:\n",
" raise TypeError(\"DType must be known for auto-chunking\")\n",
"\n",
" if dtype.hasobject:\n",
" raise NotImplementedError(\n",
" \"Can not use auto rechunking with object dtype. \"\n",
" \"We are unable to estimate the size in bytes of object data\"\n",
" )\n",
"\n",
" for x in tuple(chunks) + tuple(shape):\n",
" if (\n",
" isinstance(x, Number)\n",
" and np.isnan(x)\n",
" or isinstance(x, tuple)\n",
" and np.isnan(x).any()\n",
" ):\n",
" raise ValueError(\n",
" \"Can not perform automatic rechunking with unknown \"\n",
" \"(nan) chunk sizes.%s\" % unknown_chunk_message\n",
" )\n",
"\n",
" limit = max(1, limit)\n",
" print('tina',limit)\n",
" largest_block = np.prod(\n",
" [cs if isinstance(cs, Number) else max(cs) for cs in chunks if cs != \"auto\"]\n",
" )\n",
" print('tina largest block ',largest_block)\n",
" if previous_chunks:\n",
" # Base ideal ratio on the median chunk size of the previous chunks\n",
" result = {a: np.median(previous_chunks[a]) for a in autos}\n",
"\n",
" ideal_shape = []\n",
" for i, s in enumerate(shape):\n",
" chunk_frequencies = frequencies(previous_chunks[i])\n",
" mode, count = max(chunk_frequencies.items(), key=lambda kv: kv[1])\n",
" if mode > 1 and count >= len(previous_chunks[i]) / 2:\n",
" ideal_shape.append(mode)\n",
" else:\n",
" ideal_shape.append(s)\n",
"\n",
" # How much larger or smaller the ideal chunk size is relative to what we have now\n",
" multiplier = (\n",
" limit / dtype.itemsize / largest_block / np.prod(list(result.values()))\n",
" )\n",
" last_multiplier = 0\n",
" last_autos = set()\n",
"\n",
" while (\n",
" multiplier != last_multiplier or autos != last_autos\n",
" ): # while things change\n",
" last_multiplier = multiplier # record previous values\n",
" last_autos = set(autos) # record previous values\n",
"\n",
" # Expand or contract each of the dimensions appropriately\n",
" for a in sorted(autos):\n",
" proposed = result[a] * multiplier ** (1 / len(autos))\n",
" if proposed > shape[a]: # we've hit the shape boundary\n",
" autos.remove(a)\n",
" largest_block *= shape[a]\n",
" chunks[a] = shape[a]\n",
" del result[a]\n",
" else:\n",
" result[a] = round_to(proposed, ideal_shape[a])\n",
"\n",
" # recompute how much multiplier we have left, repeat\n",
" multiplier = (\n",
" limit / dtype.itemsize / largest_block / np.prod(list(result.values()))\n",
" )\n",
"\n",
"\n",
" for k, v in result.items():\n",
" chunks[k] = v\n",
" return tuple(chunks)\n",
"\n",
" else:\n",
" size = (limit / dtype.itemsize / largest_block) ** (1 / len(autos))\n",
" small = [i for i in autos if shape[i] < size]\n",
" if small:\n",
" for i in small:\n",
" chunks[i] = (shape[i],)\n",
" return auto_chunks(chunks, shape, limit, dtype)\n",
"\n",
" for i in autos:\n",
" chunks[i] = round_to(size, shape[i])\n",
"\n",
" return tuple(chunks)\n",
" \n",
"def round_to(c, s):\n",
" \"\"\" Return a chunk dimension that is close to an even multiple or factor\n",
" We want values for c that are nicely aligned with s.\n",
" If c is smaller than s then we want the largest factor of s that is less than the\n",
" desired chunk size, but not less than half, which is too much. If no such\n",
" factor exists then we just go with the original chunk size and accept an\n",
" uneven chunk at the end.\n",
" If c is larger than s then we want the largest multiple of s that is still\n",
" smaller than c.\n",
" \"\"\"\n",
" if c <= s:\n",
" try:\n",
" return max(f for f in factors(s) if c / 2 <= f <= c)\n",
" except ValueError: # no matching factors within factor of two\n",
" return max(1, int(c))\n",
" else:\n",
" return c // s * s\n"
]
},
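{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of `round_to` (a sketch; the dimension lengths 384, 320 and 20834 come from the example below). When the target chunk length `c` is at most the dimension `s`, `round_to` picks the largest factor of `s` within a factor of two of `c`, and falls back to `int(c)` when no such factor exists."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(round_to(317.5, 384))    # largest factor of 384 in [158.75, 317.5] -> 192\n",
"print(round_to(317.5, 320))    # largest factor of 320 in [158.75, 317.5] -> 160\n",
"print(round_to(317.5, 20834))  # 20834 = 2 * 11 * 947 has no factor in range -> int(317.5) == 317\n",
"print(round_to(1000, 320))     # c > s: largest multiple of 320 below 1000 -> 960"
]
},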
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tina 256000000\n",
"tina largest block 1.0\n"
]
},
{
"data": {
"text/plain": [
"(317, 192, 160)"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"timesteps=20834\n",
"lat=320\n",
"lon=384\n",
"shape=(timesteps,lon,lat)\n",
"chunks=(\"auto\",\"auto\",\"auto\")\n",
"limit='256MB'\n",
"random_data =da.random.RandomState(0).standard_normal(shape, chunks='auto' )\n",
"dtype = random_data.dtype\n",
"auto_chunks(chunks, shape, limit, dtype=dtype, previous_chunks=None)"
]
},
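{
"cell_type": "markdown",
"metadata": {},
"source": [
"With `previous_chunks=None` the byte budget is split evenly across the three `\"auto\"` dimensions: `size = (limit / dtype.itemsize / largest_block) ** (1/3) = (256e6 / 8 / 1.0) ** (1/3)`, about 317.5 (the debug prints above show `limit` and `largest_block`). Each dimension is then rounded with `round_to`: 20834 has no factor near 317.5 so the time axis falls back to 317, while 384 and 320 round to their factors 192 and 160. The next cell checks the resulting chunk size in bytes."
]
},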
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"77905920"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"317*192*160*8"
]
},
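{
"cell_type": "markdown",
"metadata": {},
"source": [
"Two follow-ups, written as sketches rather than verified runs. First, passing `previous_chunks` exercises the multiplier loop, which tries to respect the existing aspect ratio: starting from thin one-step time slices, the auto-chunker should keep the lon/lat axes whole and grow only the time axis. Second, dask exposes this logic through `normalize_chunks` in `dask.array.core` (an assumption about the installed dask version), which should produce equivalent chunking for the same inputs."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aspect-ratio example: previous chunks were one time step per chunk\n",
"auto_chunks((\"auto\", \"auto\", \"auto\"), shape, limit, dtype=dtype,\n",
"            previous_chunks=((1,) * timesteps, (lon,), (lat,)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Assumed API: normalize_chunks(chunks, shape=, limit=, dtype=) wraps auto_chunks\n",
"from dask.array.core import normalize_chunks\n",
"\n",
"normalize_chunks((\"auto\", \"auto\", \"auto\"), shape=shape, limit='256MB', dtype=dtype)"
]
}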
],
"metadata": {
"kernelspec": {
"display_name": "pangeobench",
"language": "python",
"name": "pangeobench"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}