ayushdg/gby-sort-perf.ipynb

## gby-sort-perf.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cudf\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Single-column aggregation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed NumPy's global RNG so this benchmark uses the same random data on every run.\n",
    "np.random.seed(0)\n",
    "\n",
    "n_elem = 100_000_000\n",
    "# Key ranges to sweep: 10%, 50% and 100% of the row count.\n",
    "n_groups = [n_elem // 10, n_elem // 2, n_elem]\n",
    "\n",
    "# Value column to aggregate; the key column 'a' is assigned in the timing cells.\n",
    "df = cudf.DataFrame()\n",
    "df['b'] = np.random.random(n_elem)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Num Groups(approx): 10,000,000\n",
      "726 ms ± 4.45 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
      "Num Groups(approx): 50,000,000\n",
      "1.88 s ± 1.66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
      "Num Groups(approx): 100,000,000\n",
      "2.6 s ± 1.07 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "for n in n_groups:\n",
    "    # Keys are drawn from [0, n), so there are at most n distinct groups.\n",
    "    df['a'] = np.random.randint(0, n, n_elem)\n",
    "    print(\"Num Groups(approx): {:,}\".format(n))\n",
    "    # Time a single-column min aggregation with sorted group keys.\n",
    "    %timeit res = df.groupby('a', sort=True).b.min()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Num Groups(approx): 10,000,000\n",
      "524 ms ± 1.01 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
      "Num Groups(approx): 50,000,000\n",
      "1e+03 ms ± 17.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
      "Num Groups(approx): 100,000,000\n",
      "1.26 s ± 208 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "for n in n_groups:\n",
    "    # Keys are drawn from [0, n), so there are at most n distinct groups.\n",
    "    df['a'] = np.random.randint(0, n, n_elem)\n",
    "    print(\"Num Groups(approx): {:,}\".format(n))\n",
    "    # Time the same single-column min aggregation without sorting group keys.\n",
    "    %timeit res = df.groupby('a', sort=False).b.min()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Multi-column aggregation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_elem = 10_000_000\n",
    "n_groups = [n_elem // 10, n_elem // 2, n_elem]\n",
    "n_cols = 20\n",
    "\n",
    "# One fewer value column than n_cols; column names are the stringified indices.\n",
    "value_cols = [str(i) for i in range(n_cols - 1)]\n",
    "\n",
    "df = cudf.DataFrame()\n",
    "for col in value_cols:\n",
    "    df[col] = np.random.random(n_elem)\n",
    "\n",
    "# Odd-indexed columns are aggregated with min, even-indexed ones with max.\n",
    "aggs = {col: \"min\" if int(col) % 2 else \"max\" for col in value_cols}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Num Groups(approx): 1,000,000\n",
      "571 ms ± 63.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
      "Num Groups(approx): 5,000,000\n",
      "724 ms ± 48.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
      "Num Groups(approx): 10,000,000\n",
      "852 ms ± 51.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "for n in n_groups:\n",
    "    # Keys are drawn from [0, n), so there are at most n distinct groups.\n",
    "    df['a'] = np.random.randint(0, n, n_elem)\n",
    "    print(\"Num Groups(approx): {:,}\".format(n))\n",
    "    # Time the per-column min/max aggregation with sorted group keys.\n",
    "    %timeit res = df.groupby('a', sort=True).agg(aggs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Num Groups(approx): 1,000,000\n",
      "385 ms ± 46.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
      "Num Groups(approx): 5,000,000\n",
      "385 ms ± 27.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
      "Num Groups(approx): 10,000,000\n",
      "403 ms ± 48.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "for n in n_groups:\n",
    "    # Keys are drawn from [0, n), so there are at most n distinct groups.\n",
    "    df['a'] = np.random.randint(0, n, n_elem)\n",
    "    print(\"Num Groups(approx): {:,}\".format(n))\n",
    "    # Time the per-column min/max aggregation without sorting group keys.\n",
    "    %timeit res = df.groupby('a', sort=False).agg(aggs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dask cuDF tests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from dask import dataframe\n",
    "import dask\n",
    "from distributed import Client, wait\n",
    "import dask_cudf\n",
    "from dask_cuda import LocalCUDACluster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table style=\"border: 2px solid white;\">\n",
       "<tr>\n",
       "<td style=\"vertical-align: top; border: 0px solid white\">\n",
       "<h3 style=\"text-align: left;\">Client</h3>\n",
       "<ul style=\"text-align: left; list-style: none; margin: 0; padding: 0;\">\n",
       "  <li><b>Scheduler: </b>tcp://127.0.0.1:37851</li>\n",
       "  <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a>\n",
       "</ul>\n",
       "</td>\n",
       "<td style=\"vertical-align: top; border: 0px solid white\">\n",
       "<h3 style=\"text-align: left;\">Cluster</h3>\n",
       "<ul style=\"text-align: left; list-style:none; margin: 0; padding: 0;\">\n",
       "  <li><b>Workers: </b>4</li>\n",
       "  <li><b>Cores: </b>4</li>\n",
       "  <li><b>Memory: </b>404.34 GB</li>\n",
       "</ul>\n",
       "</td>\n",
       "</tr>\n",
       "</table>"
      ],
      "text/plain": [
       "<Client: scheduler='tcp://127.0.0.1:37851' processes=4 cores=4>"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Start a local CUDA cluster and connect a client to it.\n",
    "cluster = LocalCUDACluster()\n",
    "client = Client(cluster)\n",
    "# Trailing expression renders the client summary as the cell output.\n",
    "client"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cudf\n",
    "\n",
    "# Fixed seed so this cell generates the same data on every run.\n",
    "np.random.seed(0)\n",
    "\n",
    "n_elem = 100_000_000\n",
    "df = cudf.DataFrame()\n",
    "# Integer keys in [0, 10_000_000) and uniform random values to aggregate.\n",
    "df['a'] = np.random.randint(0, 10_000_000, n_elem)\n",
    "df['b'] = np.random.random(n_elem)\n",
    "\n",
    "# Split into 4 partitions, persist them and block until that has finished.\n",
    "ddf = dask_cudf.from_cudf(df, npartitions=4).persist()\n",
    "wait(ddf)\n",
    "\n",
    "# Drop the local reference to the source frame (`del` is a statement, not a call).\n",
    "del df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 304 ms, sys: 132 ms, total: 436 ms\n",
      "Wall time: 3.21 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "9999528"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# len() triggers computation of the lazy groupby-min; the value is the number of distinct keys.\n",
    "len(ddf.groupby(by=\"a\").b.min())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect profiling information through the client and save the plot to an HTML file.\n",
    "client.profile(filename=\"groupby-prof.html\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dask groupby with `sort` set to False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\u001b[0;31mSignature:\u001b[0m\n",
       "\u001b[0mcudf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mby\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0msort\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mas_index\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'hash'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mlevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mgroup_keys\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m    \u001b[0mdropna\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
       "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
       "\u001b[0;31mDocstring:\u001b[0m\n",
       "Groupby\n",
       "\n",
       "Parameters\n",
       "----------\n",
       "by : list-of-str or str\n",
       "    Column name(s) to form that groups by.\n",
       "sort : bool, default True\n",
       "    Force sorting group keys.\n",
       "as_index : bool, default True\n",
       "    Indicates whether the grouped by columns become the index\n",
       "    of the returned DataFrame\n",
       "method : str, optional\n",
       "    A string indicating the method to use to perform the group by.\n",
       "    Valid values are \"hash\" or \"cudf\".\n",
       "    \"cudf\" method may be deprecated in the future, but is currently\n",
       "    the only method supporting group UDFs via the `apply` function.\n",
       "dropna : bool, optional\n",
       "    If True (default), drop null keys.\n",
       "    If False, perform grouping by keys containing null(s).\n",
       "\n",
       "Returns\n",
       "-------\n",
       "The groupby object\n",
       "\n",
       "Notes\n",
       "-----\n",
       "No empty rows are returned.  (For categorical keys, pandas returns\n",
       "rows for all categories even if they are no corresponding values.)\n",
       "\u001b[0;31mFile:\u001b[0m      /opt/conda/envs/rapids/lib/python3.7/site-packages/cudf/core/dataframe.py\n",
       "\u001b[0;31mType:\u001b[0m      function\n"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import cudf\n",
    "# Show the signature and docstring of the installed cudf groupby.\n",
    "cudf.DataFrame.groupby?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fixed seed so this cell generates the same data on every run.\n",
    "np.random.seed(0)\n",
    "\n",
    "n_elem = 100_000_000\n",
    "df = cudf.DataFrame()\n",
    "# Integer keys in [0, 10_000_000) and uniform random values to aggregate.\n",
    "df['a'] = np.random.randint(0, 10_000_000, n_elem)\n",
    "df['b'] = np.random.random(n_elem)\n",
    "\n",
    "# Split into 4 partitions, persist them and block until that has finished.\n",
    "ddf = dask_cudf.from_cudf(df, npartitions=4).persist()\n",
    "wait(ddf)\n",
    "\n",
    "# Drop the local reference to the source frame (`del` is a statement, not a call).\n",
    "del df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 152 ms, sys: 48 ms, total: 200 ms\n",
      "Wall time: 2.82 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "9999528"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# NOTE(review): no `sort` argument is passed here, so this call is identical to the earlier\n",
    "# Dask groupby; it presumably relies on the installed cudf build defaulting to sort=False\n",
    "# (see the signature printed above) - confirm.\n",
    "len(ddf.groupby(by=\"a\").b.min())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect profiling information through the client and save the plot to an HTML file.\n",
    "client.profile(filename=\"groupby-prof-nosort.html\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import cudf\n",
	"import numpy as np"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Single-column aggregation"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Seed NumPy's global RNG so this benchmark uses the same random data on every run.\n",
	"np.random.seed(0)\n",
	"\n",
	"n_elem = 100_000_000\n",
	"# Key ranges to sweep: 10%, 50% and 100% of the row count.\n",
	"n_groups = [n_elem // 10, n_elem // 2, n_elem]\n",
	"\n",
	"# Value column to aggregate; the key column 'a' is assigned in the timing cells.\n",
	"df = cudf.DataFrame()\n",
	"df['b'] = np.random.random(n_elem)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Num Groups(approx): 10,000,000\n",
	"726 ms ± 4.45 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
	"Num Groups(approx): 50,000,000\n",
	"1.88 s ± 1.66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
	"Num Groups(approx): 100,000,000\n",
	"2.6 s ± 1.07 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"for n in n_groups:\n",
	"    # Keys are drawn from [0, n), so there are at most n distinct groups.\n",
	"    df['a'] = np.random.randint(0, n, n_elem)\n",
	"    print(\"Num Groups(approx): {:,}\".format(n))\n",
	"    # Time a single-column min aggregation with sorted group keys.\n",
	"    %timeit res = df.groupby('a', sort=True).b.min()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Num Groups(approx): 10,000,000\n",
	"524 ms ± 1.01 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
	"Num Groups(approx): 50,000,000\n",
	"1e+03 ms ± 17.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
	"Num Groups(approx): 100,000,000\n",
	"1.26 s ± 208 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"for n in n_groups:\n",
	"    # Keys are drawn from [0, n), so there are at most n distinct groups.\n",
	"    df['a'] = np.random.randint(0, n, n_elem)\n",
	"    print(\"Num Groups(approx): {:,}\".format(n))\n",
	"    # Time the same single-column min aggregation without sorting group keys.\n",
	"    %timeit res = df.groupby('a', sort=False).b.min()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Multi-column aggregation"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"n_elem = 10_000_000\n",
	"n_groups = [n_elem // 10, n_elem // 2, n_elem]\n",
	"n_cols = 20\n",
	"\n",
	"# One fewer value column than n_cols; column names are the stringified indices.\n",
	"value_cols = [str(i) for i in range(n_cols - 1)]\n",
	"\n",
	"df = cudf.DataFrame()\n",
	"for col in value_cols:\n",
	"    df[col] = np.random.random(n_elem)\n",
	"\n",
	"# Odd-indexed columns are aggregated with min, even-indexed ones with max.\n",
	"aggs = {col: \"min\" if int(col) % 2 else \"max\" for col in value_cols}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Num Groups(approx): 1,000,000\n",
	"571 ms ± 63.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
	"Num Groups(approx): 5,000,000\n",
	"724 ms ± 48.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
	"Num Groups(approx): 10,000,000\n",
	"852 ms ± 51.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"for n in n_groups:\n",
	"    # Keys are drawn from [0, n), so there are at most n distinct groups.\n",
	"    df['a'] = np.random.randint(0, n, n_elem)\n",
	"    print(\"Num Groups(approx): {:,}\".format(n))\n",
	"    # Time the per-column min/max aggregation with sorted group keys.\n",
	"    %timeit res = df.groupby('a', sort=True).agg(aggs)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Num Groups(approx): 1,000,000\n",
	"385 ms ± 46.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
	"Num Groups(approx): 5,000,000\n",
	"385 ms ± 27.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
	"Num Groups(approx): 10,000,000\n",
	"403 ms ± 48.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"for n in n_groups:\n",
	"    # Keys are drawn from [0, n), so there are at most n distinct groups.\n",
	"    df['a'] = np.random.randint(0, n, n_elem)\n",
	"    print(\"Num Groups(approx): {:,}\".format(n))\n",
	"    # Time the per-column min/max aggregation without sorting group keys.\n",
	"    %timeit res = df.groupby('a', sort=False).agg(aggs)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Dask cuDF tests"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"from dask import dataframe\n",
	"import dask\n",
	"from distributed import Client, wait\n",
	"import dask_cudf\n",
	"from dask_cuda import LocalCUDACluster"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<table style=\"border: 2px solid white;\">\n",
	"<tr>\n",
	"<td style=\"vertical-align: top; border: 0px solid white\">\n",
	"<h3 style=\"text-align: left;\">Client</h3>\n",
	"<ul style=\"text-align: left; list-style: none; margin: 0; padding: 0;\">\n",
	" <li><b>Scheduler: </b>tcp://127.0.0.1:37851</li>\n",
	" <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a>\n",
	"</ul>\n",
	"</td>\n",
	"<td style=\"vertical-align: top; border: 0px solid white\">\n",
	"<h3 style=\"text-align: left;\">Cluster</h3>\n",
	"<ul style=\"text-align: left; list-style:none; margin: 0; padding: 0;\">\n",
	" <li><b>Workers: </b>4</li>\n",
	" <li><b>Cores: </b>4</li>\n",
	" <li><b>Memory: </b>404.34 GB</li>\n",
	"</ul>\n",
	"</td>\n",
	"</tr>\n",
	"</table>"
	],
	"text/plain": [
	"<Client: scheduler='tcp://127.0.0.1:37851' processes=4 cores=4>"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Start a local CUDA cluster and connect a client to it.\n",
	"cluster = LocalCUDACluster()\n",
	"client = Client(cluster)\n",
	"# Trailing expression renders the client summary as the cell output.\n",
	"client"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"import cudf\n",
	"\n",
	"# Fixed seed so this cell generates the same data on every run.\n",
	"np.random.seed(0)\n",
	"\n",
	"n_elem = 100_000_000\n",
	"df = cudf.DataFrame()\n",
	"# Integer keys in [0, 10_000_000) and uniform random values to aggregate.\n",
	"df['a'] = np.random.randint(0, 10_000_000, n_elem)\n",
	"df['b'] = np.random.random(n_elem)\n",
	"\n",
	"# Split into 4 partitions, persist them and block until that has finished.\n",
	"ddf = dask_cudf.from_cudf(df, npartitions=4).persist()\n",
	"wait(ddf)\n",
	"\n",
	"# Drop the local reference to the source frame (`del` is a statement, not a call).\n",
	"del df"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 304 ms, sys: 132 ms, total: 436 ms\n",
	"Wall time: 3.21 s\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"9999528"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%time\n",
	"# len() triggers computation of the lazy groupby-min; the value is the number of distinct keys.\n",
	"len(ddf.groupby(by=\"a\").b.min())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Collect profiling information through the client and save the plot to an HTML file.\n",
	"client.profile(filename=\"groupby-prof.html\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Dask groupby with `sort` set to False"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"\u001b[0;31mSignature:\u001b[0m\n",
	"\u001b[0mcudf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n",
	"\u001b[0;34m\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
	"\u001b[0;34m\u001b[0m \u001b[0mby\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
	"\u001b[0;34m\u001b[0m \u001b[0msort\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
	"\u001b[0;34m\u001b[0m \u001b[0mas_index\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
	"\u001b[0;34m\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'hash'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
	"\u001b[0;34m\u001b[0m \u001b[0mlevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
	"\u001b[0;34m\u001b[0m \u001b[0mgroup_keys\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
	"\u001b[0;34m\u001b[0m \u001b[0mdropna\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
	"\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;31mDocstring:\u001b[0m\n",
	"Groupby\n",
	"\n",
	"Parameters\n",
	"----------\n",
	"by : list-of-str or str\n",
	" Column name(s) to form that groups by.\n",
	"sort : bool, default True\n",
	" Force sorting group keys.\n",
	"as_index : bool, default True\n",
	" Indicates whether the grouped by columns become the index\n",
	" of the returned DataFrame\n",
	"method : str, optional\n",
	" A string indicating the method to use to perform the group by.\n",
	" Valid values are \"hash\" or \"cudf\".\n",
	" \"cudf\" method may be deprecated in the future, but is currently\n",
	" the only method supporting group UDFs via the `apply` function.\n",
	"dropna : bool, optional\n",
	" If True (default), drop null keys.\n",
	" If False, perform grouping by keys containing null(s).\n",
	"\n",
	"Returns\n",
	"-------\n",
	"The groupby object\n",
	"\n",
	"Notes\n",
	"-----\n",
	"No empty rows are returned. (For categorical keys, pandas returns\n",
	"rows for all categories even if they are no corresponding values.)\n",
	"\u001b[0;31mFile:\u001b[0m /opt/conda/envs/rapids/lib/python3.7/site-packages/cudf/core/dataframe.py\n",
	"\u001b[0;31mType:\u001b[0m function\n"
	]
	},
	"metadata": {},
	"output_type": "display_data"
	}
	],
	"source": [
	"import cudf\n",
	"# Show the signature and docstring of the installed cudf groupby.\n",
	"cudf.DataFrame.groupby?"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Fixed seed so this cell generates the same data on every run.\n",
	"np.random.seed(0)\n",
	"\n",
	"n_elem = 100_000_000\n",
	"df = cudf.DataFrame()\n",
	"# Integer keys in [0, 10_000_000) and uniform random values to aggregate.\n",
	"df['a'] = np.random.randint(0, 10_000_000, n_elem)\n",
	"df['b'] = np.random.random(n_elem)\n",
	"\n",
	"# Split into 4 partitions, persist them and block until that has finished.\n",
	"ddf = dask_cudf.from_cudf(df, npartitions=4).persist()\n",
	"wait(ddf)\n",
	"\n",
	"# Drop the local reference to the source frame (`del` is a statement, not a call).\n",
	"del df"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 152 ms, sys: 48 ms, total: 200 ms\n",
	"Wall time: 2.82 s\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"9999528"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%time\n",
	"# NOTE(review): no `sort` argument is passed here, so this call is identical to the earlier\n",
	"# Dask groupby; it presumably relies on the installed cudf build defaulting to sort=False\n",
	"# (see the signature printed above) - confirm.\n",
	"len(ddf.groupby(by=\"a\").b.min())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Collect profiling information through the client and save the plot to an HTML file.\n",
	"client.profile(filename=\"groupby-prof-nosort.html\")"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}