Skip to content

Instantly share code, notes, and snippets.

@ayushdg
Last active September 4, 2019 03:33
Show Gist options
  • Save ayushdg/7ead40391c1573bef975b417218b1d31 to your computer and use it in GitHub Desktop.
Save ayushdg/7ead40391c1573bef975b417218b1d31 to your computer and use it in GitHub Desktop.
Groupby sort performance
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import cudf\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Single-column aggregation"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Build the input for the single-column aggregation benchmarks.\n",
"np.random.seed(0)  # fixed seed so the generated data is reproducible\n",
"\n",
"n_elem = 100_000_000\n",
"# Upper bounds for the random group keys: 10%, 50% and 100% of the row count.\n",
"n_groups = [n_elem // 10, n_elem // 2, n_elem]\n",
"\n",
"df = cudf.DataFrame()\n",
"df['b'] = np.random.random(n_elem)  # value column reduced by the groupby"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Num Groups(approx): 10,000,000\n",
"726 ms ± 4.45 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
"Num Groups(approx): 50,000,000\n",
"1.88 s ± 1.66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
"Num Groups(approx): 100,000,000\n",
"2.6 s ± 1.07 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"# Sorted groupby: time min() of column 'b' once per key range.\n",
"for key_range in n_groups:\n",
"    # Regenerate the key column with integers drawn from [0, key_range).\n",
"    df['a'] = np.random.randint(0, key_range, size=n_elem)\n",
"    print(\"Num Groups(approx): {:,}\".format(key_range))\n",
"    %timeit res = df.groupby('a', sort=True).b.min()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Num Groups(approx): 10,000,000\n",
"524 ms ± 1.01 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
"Num Groups(approx): 50,000,000\n",
"1e+03 ms ± 17.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
"Num Groups(approx): 100,000,000\n",
"1.26 s ± 208 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"# Unsorted groupby: time min() of column 'b' once per key range.\n",
"for key_range in n_groups:\n",
"    # Regenerate the key column with integers drawn from [0, key_range).\n",
"    df['a'] = np.random.randint(0, key_range, size=n_elem)\n",
"    print(\"Num Groups(approx): {:,}\".format(key_range))\n",
"    %timeit res = df.groupby('a', sort=False).b.min()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Multi-column aggregation"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Build the input for the multi-column aggregation benchmarks.\n",
"np.random.seed(0)  # fixed seed so the generated data is reproducible\n",
"\n",
"n_elem = 10_000_000\n",
"# Upper bounds for the random group keys: 10%, 50% and 100% of the row count.\n",
"n_groups = [n_elem // 10, n_elem // 2, n_elem]\n",
"n_cols = 20\n",
"\n",
"df = cudf.DataFrame()\n",
"aggs = {}\n",
"# Create n_cols - 1 value columns named '0', '1', ...; the timing cells add the\n",
"# key column 'a'. Odd-numbered columns are reduced with min, even ones with max.\n",
"for i in range(n_cols - 1):\n",
"    col = str(i)\n",
"    df[col] = np.random.random(n_elem)\n",
"    aggs[col] = \"min\" if i % 2 else \"max\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Num Groups(approx): 1,000,000\n",
"571 ms ± 63.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
"Num Groups(approx): 5,000,000\n",
"724 ms ± 48.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
"Num Groups(approx): 10,000,000\n",
"852 ms ± 51.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"# Sorted groupby: time the per-column min/max aggregation once per key range.\n",
"for key_range in n_groups:\n",
"    # Regenerate the key column with integers drawn from [0, key_range).\n",
"    df['a'] = np.random.randint(0, key_range, size=n_elem)\n",
"    print(\"Num Groups(approx): {:,}\".format(key_range))\n",
"    %timeit res = df.groupby('a', sort=True).agg(aggs)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Num Groups(approx): 1,000,000\n",
"385 ms ± 46.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
"Num Groups(approx): 5,000,000\n",
"385 ms ± 27.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n",
"Num Groups(approx): 10,000,000\n",
"403 ms ± 48.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"# Unsorted groupby: time the per-column min/max aggregation once per key range.\n",
"for key_range in n_groups:\n",
"    # Regenerate the key column with integers drawn from [0, key_range).\n",
"    df['a'] = np.random.randint(0, key_range, size=n_elem)\n",
"    print(\"Num Groups(approx): {:,}\".format(key_range))\n",
"    %timeit res = df.groupby('a', sort=False).agg(aggs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Dask cuDF tests"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from dask import dataframe\n",
"import dask\n",
"from distributed import Client, wait\n",
"import dask_cudf\n",
"from dask_cuda import LocalCUDACluster"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table style=\"border: 2px solid white;\">\n",
"<tr>\n",
"<td style=\"vertical-align: top; border: 0px solid white\">\n",
"<h3 style=\"text-align: left;\">Client</h3>\n",
"<ul style=\"text-align: left; list-style: none; margin: 0; padding: 0;\">\n",
" <li><b>Scheduler: </b>tcp://127.0.0.1:37851</li>\n",
" <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a>\n",
"</ul>\n",
"</td>\n",
"<td style=\"vertical-align: top; border: 0px solid white\">\n",
"<h3 style=\"text-align: left;\">Cluster</h3>\n",
"<ul style=\"text-align: left; list-style:none; margin: 0; padding: 0;\">\n",
" <li><b>Workers: </b>4</li>\n",
" <li><b>Cores: </b>4</li>\n",
" <li><b>Memory: </b>404.34 GB</li>\n",
"</ul>\n",
"</td>\n",
"</tr>\n",
"</table>"
],
"text/plain": [
"<Client: scheduler='tcp://127.0.0.1:37851' processes=4 cores=4>"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Start a LocalCUDACluster and connect a distributed Client to it.\n",
"cluster = LocalCUDACluster()\n",
"client = Client(cluster)\n",
"\n",
"# Last expression of the cell, so the client summary is displayed.\n",
"client"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import cudf\n",
"\n",
"# Fixed seed so the generated keys and values are reproducible.\n",
"np.random.seed(0)\n",
"\n",
"n_elem = 100_000_000\n",
"df = cudf.DataFrame()\n",
"df['a'] = np.random.randint(0, 10_000_000, n_elem)  # group keys in [0, 10_000_000)\n",
"df['b'] = np.random.random(n_elem)  # values to aggregate\n",
"\n",
"# Split the frame into 4 partitions, persist them on the cluster and block\n",
"# until they are ready before the timing cell runs.\n",
"ddf = dask_cudf.from_cudf(df, npartitions=4).persist()\n",
"wait(ddf)\n",
"del df  # drop the local reference to the source frame"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 304 ms, sys: 132 ms, total: 436 ms\n",
"Wall time: 3.21 s\n"
]
},
{
"data": {
"text/plain": [
"9999528"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"# Distributed groupby-min on column 'b'; the cell result is the number of groups.\n",
"min_per_key = ddf.groupby(by=\"a\").b.min()\n",
"len(min_per_key)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Ask the client for its profile and save it under the given filename.\n",
"profile_file = \"groupby-prof.html\"\n",
"client.profile(filename=profile_file)"
]
},
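{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Dask groupby with `sort=False`\n",
"\n",
"Note: the signature printed below shows a cudf build whose `groupby` defaults to `sort=False`, even though its docstring still says `default True`."
]
},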
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\u001b[0;31mSignature:\u001b[0m\n",
"\u001b[0mcudf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mby\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0msort\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mas_index\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'hash'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mlevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mgroup_keys\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m \u001b[0mdropna\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n",
"\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mDocstring:\u001b[0m\n",
"Groupby\n",
"\n",
"Parameters\n",
"----------\n",
"by : list-of-str or str\n",
" Column name(s) to form that groups by.\n",
"sort : bool, default True\n",
" Force sorting group keys.\n",
"as_index : bool, default True\n",
" Indicates whether the grouped by columns become the index\n",
" of the returned DataFrame\n",
"method : str, optional\n",
" A string indicating the method to use to perform the group by.\n",
" Valid values are \"hash\" or \"cudf\".\n",
" \"cudf\" method may be deprecated in the future, but is currently\n",
" the only method supporting group UDFs via the `apply` function.\n",
"dropna : bool, optional\n",
" If True (default), drop null keys.\n",
" If False, perform grouping by keys containing null(s).\n",
"\n",
"Returns\n",
"-------\n",
"The groupby object\n",
"\n",
"Notes\n",
"-----\n",
"No empty rows are returned. (For categorical keys, pandas returns\n",
"rows for all categories even if they are no corresponding values.)\n",
"\u001b[0;31mFile:\u001b[0m /opt/conda/envs/rapids/lib/python3.7/site-packages/cudf/core/dataframe.py\n",
"\u001b[0;31mType:\u001b[0m function\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import cudf\n",
"?cudf.DataFrame.groupby"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Fixed seed so the generated keys and values are reproducible.\n",
"np.random.seed(0)\n",
"\n",
"n_elem = 100_000_000\n",
"df = cudf.DataFrame()\n",
"df['a'] = np.random.randint(0, 10_000_000, n_elem)  # group keys in [0, 10_000_000)\n",
"df['b'] = np.random.random(n_elem)  # values to aggregate\n",
"\n",
"# Split the frame into 4 partitions, persist them on the cluster and block\n",
"# until they are ready before the timing cell runs.\n",
"ddf = dask_cudf.from_cudf(df, npartitions=4).persist()\n",
"wait(ddf)\n",
"del df  # drop the local reference to the source frame"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 152 ms, sys: 48 ms, total: 200 ms\n",
"Wall time: 2.82 s\n"
]
},
{
"data": {
"text/plain": [
"9999528"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"# sort=False is passed explicitly so this run matches the section heading\n",
"# instead of depending on the installed cudf's groupby default.\n",
"# The cell result is the number of groups.\n",
"len(ddf.groupby(by=\"a\", sort=False).b.min())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Ask the client for its profile and save it under the given filename.\n",
"profile_file = \"groupby-prof-nosort.html\"\n",
"client.profile(filename=profile_file)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment