ayushdg/Groupby_sort_test.ipynb

## Groupby_sort_test.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dask Cudf Tests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from dask import dataframe\n",
    "import dask\n",
    "from distributed import Client, wait\n",
    "import dask_cudf\n",
    "from dask_cuda import LocalCUDACluster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table style=\"border: 2px solid white;\">\n",
       "<tr>\n",
       "<td style=\"vertical-align: top; border: 0px solid white\">\n",
       "<h3 style=\"text-align: left;\">Client</h3>\n",
       "<ul style=\"text-align: left; list-style: none; margin: 0; padding: 0;\">\n",
       "  <li><b>Scheduler: </b>tcp://127.0.0.1:33414</li>\n",
       "  <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a>\n",
       "</ul>\n",
       "</td>\n",
       "<td style=\"vertical-align: top; border: 0px solid white\">\n",
       "<h3 style=\"text-align: left;\">Cluster</h3>\n",
       "<ul style=\"text-align: left; list-style:none; margin: 0; padding: 0;\">\n",
       "  <li><b>Workers: </b>4</li>\n",
       "  <li><b>Cores: </b>4</li>\n",
       "  <li><b>Memory: </b>404.34 GB</li>\n",
       "</ul>\n",
       "</td>\n",
       "</tr>\n",
       "</table>"
      ],
      "text/plain": [
       "<Client: 'tcp://127.0.0.1:33414' processes=4 threads=4, memory=404.34 GB>"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cluster = LocalCUDACluster()\n",
    "client = Client(cluster)\n",
    "client"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- Create a dask_cudf DataFrame with a single partition.\n",
    "- The number of groups should be on the order of `n_elem // 2` (approximately)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cudf\n",
    "df = cudf.DataFrame()\n",
    "\n",
    "n_elem = 100_000_000\n",
    "df['a'] = np.random.randint(0,n_elem // 2,n_elem)\n",
    "df['b'] = np.random.random(n_elem)\n",
    "ddf = dask_cudf.from_cudf(df,npartitions=1)\n",
    "ddf = ddf.persist()\n",
    "wait(ddf)\n",
    "del(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Case 1: Multiple aggs\n",
    "\n",
    "- `sort=True`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/conda/envs/rapids/lib/python3.7/site-packages/cudf/core/dataframe.py:387: UserWarning: Columns may not be added to a DataFrame using a new attribute name. A new attribute will be created: 'multi_cols'\n",
      "  UserWarning,\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4.67 s ± 12.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "x = ddf.groupby(by=\"a\", sort=True).agg({'b':[\"count\", \"max\"]})\n",
    "x = x.persist()\n",
    "wait(x)\n",
    "del(x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- `sort=False`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8.77 s ± 14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "x = ddf.groupby(by=\"a\", sort=False).agg({'b':[\"count\", \"max\"]})\n",
    "x = x.persist()\n",
    "wait(x)\n",
    "del(x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Not relevant to the issue, but since there is only 1 partition, compare against running the cudf groupby directly via `map_partitions`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def gbyfunc(df):\n",
    "    return df.groupby(by=\"a\", sort=True).agg({'b':[\"count\", \"max\"]})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.74 s ± 10.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "x = ddf.map_partitions(gbyfunc)\n",
    "x = x.persist()\n",
    "wait(x)\n",
    "del(x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Case 2: Mean\n",
    "\n",
    "- `sort=True`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4.34 s ± 6.71 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "x = ddf.groupby(by=\"a\", sort=True).b.mean()\n",
    "x = x.persist()\n",
    "wait(x)\n",
    "del(x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- `sort=False`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6.61 s ± 14.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "x = ddf.groupby(by=\"a\", sort=False).b.mean()\n",
    "x = x.persist()\n",
    "wait(x)\n",
    "del(x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Expected behavior is observed for a single aggregation other than `mean`\n",
    "\n",
    "- `sort=True`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2.17 s ± 11.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "x = ddf.groupby(by=\"a\", sort=True).b.count()\n",
    "x = x.persist()\n",
    "wait(x)\n",
    "del(x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- `sort=False`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.35 s ± 2.91 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "x = ddf.groupby(by=\"a\", sort=False).b.count()\n",
    "x = x.persist()\n",
    "wait(x)\n",
    "del(x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sorting behavior differences\n",
    "\n",
    "- The output below is not sorted (as expected)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a\n",
       "44945698    3\n",
       "36588377    3\n",
       "42946380    3\n",
       "17303107    1\n",
       "47236603    4\n",
       "Name: b, dtype: int64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ddf.groupby(by=\"a\", sort=False).b.count().head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- The output below is sorted even though `sort=False` was passed (unexpected). Also, the index name `a` is lost (though that might be a separate issue)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    0.340670\n",
       "2    0.453529\n",
       "4    0.884728\n",
       "5    0.117211\n",
       "6    0.524516\n",
       "Name: b, dtype: float64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ddf.groupby(by=\"a\", sort=False).b.mean().head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Dask Cudf Tests"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"from dask import dataframe\n",
	"import dask\n",
	"from distributed import Client, wait\n",
	"import dask_cudf\n",
	"from dask_cuda import LocalCUDACluster"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<table style=\"border: 2px solid white;\">\n",
	"<tr>\n",
	"<td style=\"vertical-align: top; border: 0px solid white\">\n",
	"<h3 style=\"text-align: left;\">Client</h3>\n",
	"<ul style=\"text-align: left; list-style: none; margin: 0; padding: 0;\">\n",
	" <li><b>Scheduler: </b>tcp://127.0.0.1:33414</li>\n",
	" <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a>\n",
	"</ul>\n",
	"</td>\n",
	"<td style=\"vertical-align: top; border: 0px solid white\">\n",
	"<h3 style=\"text-align: left;\">Cluster</h3>\n",
	"<ul style=\"text-align: left; list-style:none; margin: 0; padding: 0;\">\n",
	" <li><b>Workers: </b>4</li>\n",
	" <li><b>Cores: </b>4</li>\n",
	" <li><b>Memory: </b>404.34 GB</li>\n",
	"</ul>\n",
	"</td>\n",
	"</tr>\n",
	"</table>"
	],
	"text/plain": [
	"<Client: 'tcp://127.0.0.1:33414' processes=4 threads=4, memory=404.34 GB>"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"cluster = LocalCUDACluster()\n",
	"client = Client(cluster)\n",
	"client"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"- Create a dask_cudf DataFrame with a single partition.\n",
	"- The number of groups should be on the order of `n_elem // 2` (approximately)."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"import cudf\n",
	"df = cudf.DataFrame()\n",
	"\n",
	"n_elem = 100_000_000\n",
	"df['a'] = np.random.randint(0,n_elem // 2,n_elem)\n",
	"df['b'] = np.random.random(n_elem)\n",
	"ddf = dask_cudf.from_cudf(df,npartitions=1)\n",
	"ddf = ddf.persist()\n",
	"wait(ddf)\n",
	"del(df)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Case 1: Multiple aggs\n",
	"\n",
	"- `sort=True`"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"/opt/conda/envs/rapids/lib/python3.7/site-packages/cudf/core/dataframe.py:387: UserWarning: Columns may not be added to a DataFrame using a new attribute name. A new attribute will be created: 'multi_cols'\n",
	" UserWarning,\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"4.67 s ± 12.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"x = ddf.groupby(by=\"a\", sort=True).agg({'b':[\"count\", \"max\"]})\n",
	"x = x.persist()\n",
	"wait(x)\n",
	"del(x)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"- `sort=False`"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"8.77 s ± 14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"x = ddf.groupby(by=\"a\", sort=False).agg({'b':[\"count\", \"max\"]})\n",
	"x = x.persist()\n",
	"wait(x)\n",
	"del(x)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#### Not relevant to the issue, but since there is only 1 partition, compare against running the cudf groupby directly via `map_partitions`"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"def gbyfunc(df):\n",
	" return df.groupby(by=\"a\", sort=True).agg({'b':[\"count\", \"max\"]})"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1.74 s ± 10.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"x = ddf.map_partitions(gbyfunc)\n",
	"x = x.persist()\n",
	"wait(x)\n",
	"del(x)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Case 2: Mean\n",
	"\n",
	"- `sort=True`"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"4.34 s ± 6.71 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"x = ddf.groupby(by=\"a\", sort=True).b.mean()\n",
	"x = x.persist()\n",
	"wait(x)\n",
	"del(x)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"- `sort=False`"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"6.61 s ± 14.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"x = ddf.groupby(by=\"a\", sort=False).b.mean()\n",
	"x = x.persist()\n",
	"wait(x)\n",
	"del(x)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Expected behavior is observed for a single aggregation other than `mean`\n",
	"\n",
	"- `sort=True`"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"2.17 s ± 11.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"x = ddf.groupby(by=\"a\", sort=True).b.count()\n",
	"x = x.persist()\n",
	"wait(x)\n",
	"del(x)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"- `sort=False`"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1.35 s ± 2.91 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"x = ddf.groupby(by=\"a\", sort=False).b.count()\n",
	"x = x.persist()\n",
	"wait(x)\n",
	"del(x)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Sorting behavior differences\n",
	"\n",
	"- The output below is not sorted (as expected)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"a\n",
	"44945698 3\n",
	"36588377 3\n",
	"42946380 3\n",
	"17303107 1\n",
	"47236603 4\n",
	"Name: b, dtype: int64"
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"ddf.groupby(by=\"a\", sort=False).b.count().head()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"- The output below is sorted even though `sort=False` was passed (unexpected). Also, the index name `a` is lost (though that might be a separate issue)."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0 0.340670\n",
	"2 0.453529\n",
	"4 0.884728\n",
	"5 0.117211\n",
	"6 0.524516\n",
	"Name: b, dtype: float64"
	]
	},
	"execution_count": 13,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"ddf.groupby(by=\"a\", sort=False).b.mean().head()"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.6"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}