Last active
September 4, 2019 03:33
-
-
Save ayushdg/7ead40391c1573bef975b417218b1d31 to your computer and use it in GitHub Desktop.
Groupby sort performance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import cudf\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Single-column aggregation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Input for the single-column benchmark: one random value column 'b'.\n", | |
"# The key column 'a' is regenerated for every group count by the timing cells.\n", | |
"np.random.seed(0)  # fixed seed so every rerun benchmarks identical data\n", | |
"\n", | |
"n_elem = 100_000_000\n", | |
"# Target key cardinalities: 10%, 50% and 100% of the row count.\n", | |
"n_groups = [n_elem // 10, n_elem // 2, n_elem]\n", | |
"\n", | |
"df = cudf.DataFrame()\n", | |
"df['b'] = np.random.random(n_elem)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Num Groups(approx): 10,000,000\n", | |
"726 ms ± 4.45 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", | |
"Num Groups(approx): 50,000,000\n", | |
"1.88 s ± 1.66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", | |
"Num Groups(approx): 100,000,000\n", | |
"2.6 s ± 1.07 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"# Time the sorted groupby-min of 'b' for each target key cardinality.\n", | |
"for group_count in n_groups:\n", | |
"    # Keys are drawn at random from [0, group_count), so the number of\n", | |
"    # distinct groups is only approximately group_count.\n", | |
"    df['a'] = np.random.randint(low=0, high=group_count, size=n_elem)\n", | |
"    print(\"Num Groups(approx): {:,}\".format(group_count))\n", | |
"    %timeit res = df.groupby('a', sort=True).b.min()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Num Groups(approx): 10,000,000\n", | |
"524 ms ± 1.01 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", | |
"Num Groups(approx): 50,000,000\n", | |
"1e+03 ms ± 17.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", | |
"Num Groups(approx): 100,000,000\n", | |
"1.26 s ± 208 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"# Same measurement as the sorted run, but with sort=False on the groupby.\n", | |
"for group_count in n_groups:\n", | |
"    # Fresh random keys in [0, group_count) for every cardinality.\n", | |
"    df['a'] = np.random.randint(low=0, high=group_count, size=n_elem)\n", | |
"    print(\"Num Groups(approx): {:,}\".format(group_count))\n", | |
"    %timeit res = df.groupby('a', sort=False).b.min()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Multi-column aggregation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Input for the multi-column benchmark: n_cols - 1 random value columns named\n", | |
"# '0', '1', ... plus an aggregation spec that alternates 'max' and 'min'.\n", | |
"# The key column 'a' is added by the timing cells.\n", | |
"n_elem = 10_000_000\n", | |
"n_groups = [n_elem // 10, n_elem // 2, n_elem]\n", | |
"n_cols = 20\n", | |
"\n", | |
"df = cudf.DataFrame()\n", | |
"aggs = {}\n", | |
"for col_idx in range(n_cols - 1):\n", | |
"    col_name = str(col_idx)\n", | |
"    df[col_name] = np.random.random(n_elem)\n", | |
"    # Odd-numbered columns are reduced with min, even-numbered ones with max.\n", | |
"    if col_idx % 2:\n", | |
"        aggs[col_name] = \"min\"\n", | |
"    else:\n", | |
"        aggs[col_name] = \"max\"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Num Groups(approx): 1,000,000\n", | |
"571 ms ± 63.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", | |
"Num Groups(approx): 5,000,000\n", | |
"724 ms ± 48.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", | |
"Num Groups(approx): 10,000,000\n", | |
"852 ms ± 51.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"# Time the sorted multi-column aggregation for each target key cardinality.\n", | |
"for group_count in n_groups:\n", | |
"    # Keys are drawn at random from [0, group_count), so the number of\n", | |
"    # distinct groups is only approximately group_count.\n", | |
"    df['a'] = np.random.randint(low=0, high=group_count, size=n_elem)\n", | |
"    print(\"Num Groups(approx): {:,}\".format(group_count))\n", | |
"    %timeit res = df.groupby('a', sort=True).agg(aggs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Num Groups(approx): 1,000,000\n", | |
"385 ms ± 46.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", | |
"Num Groups(approx): 5,000,000\n", | |
"385 ms ± 27.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n", | |
"Num Groups(approx): 10,000,000\n", | |
"403 ms ± 48.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"# Same multi-column aggregation as the sorted run, but with sort=False.\n", | |
"for group_count in n_groups:\n", | |
"    # Fresh random keys in [0, group_count) for every cardinality.\n", | |
"    df['a'] = np.random.randint(low=0, high=group_count, size=n_elem)\n", | |
"    print(\"Num Groups(approx): {:,}\".format(group_count))\n", | |
"    %timeit res = df.groupby('a', sort=False).agg(aggs)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Dask cuDF tests" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"from dask import dataframe\n", | |
"import dask\n", | |
"from distributed import Client, wait\n", | |
"import dask_cudf\n", | |
"from dask_cuda import LocalCUDACluster" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<table style=\"border: 2px solid white;\">\n", | |
"<tr>\n", | |
"<td style=\"vertical-align: top; border: 0px solid white\">\n", | |
"<h3 style=\"text-align: left;\">Client</h3>\n", | |
"<ul style=\"text-align: left; list-style: none; margin: 0; padding: 0;\">\n", | |
" <li><b>Scheduler: </b>tcp://127.0.0.1:37851</li>\n", | |
" <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a>\n", | |
"</ul>\n", | |
"</td>\n", | |
"<td style=\"vertical-align: top; border: 0px solid white\">\n", | |
"<h3 style=\"text-align: left;\">Cluster</h3>\n", | |
"<ul style=\"text-align: left; list-style:none; margin: 0; padding: 0;\">\n", | |
" <li><b>Workers: </b>4</li>\n", | |
" <li><b>Cores: </b>4</li>\n", | |
" <li><b>Memory: </b>404.34 GB</li>\n", | |
"</ul>\n", | |
"</td>\n", | |
"</tr>\n", | |
"</table>" | |
], | |
"text/plain": [ | |
"<Client: scheduler='tcp://127.0.0.1:37851' processes=4 cores=4>" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Start a local dask_cuda cluster with default settings and attach a\n", | |
"# distributed client to it; later cells use `client` for profiling.\n", | |
"cluster = LocalCUDACluster()\n", | |
"client = Client(cluster)\n", | |
"\n", | |
"# Last expression: render the client summary (scheduler, workers, memory).\n", | |
"client" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import cudf\n", | |
"\n", | |
"# Build the benchmark frame with cudf: integer keys 'a' drawn from\n", | |
"# [0, 10_000_000) and random values 'b'.\n", | |
"n_elem = 100_000_000\n", | |
"df = cudf.DataFrame()\n", | |
"df['a'] = np.random.randint(low=0, high=10_000_000, size=n_elem)\n", | |
"df['b'] = np.random.random(n_elem)\n", | |
"\n", | |
"# Split into 4 partitions and persist them on the cluster so the timed\n", | |
"# groupby below does not include data distribution.\n", | |
"ddf = dask_cudf.from_cudf(df, npartitions=4).persist()\n", | |
"wait(ddf)  # block until the persisted partitions are ready\n", | |
"\n", | |
"# The single-frame copy is no longer needed once the partitions exist.\n", | |
"del df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 304 ms, sys: 132 ms, total: 436 ms\n", | |
"Wall time: 3.21 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"9999528" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# Time the Dask groupby-min of 'b' by key 'a' with the library's default\n", | |
"# sort behaviour; len() forces the computation and reports the group count.\n", | |
"len(ddf.groupby(by=\"a\")[\"b\"].min())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Save the profile collected by the Dask client to an HTML file\n", | |
"# (presumably covering the groupby timed above - confirm before comparing).\n", | |
"client.profile(filename=\"groupby-prof.html\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Dask groupby with `sort=False`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"\u001b[0;31mSignature:\u001b[0m\n", | |
"\u001b[0mcudf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", | |
"\u001b[0;34m\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", | |
"\u001b[0;34m\u001b[0m \u001b[0mby\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", | |
"\u001b[0;34m\u001b[0m \u001b[0msort\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", | |
"\u001b[0;34m\u001b[0m \u001b[0mas_index\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", | |
"\u001b[0;34m\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'hash'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", | |
"\u001b[0;34m\u001b[0m \u001b[0mlevel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", | |
"\u001b[0;34m\u001b[0m \u001b[0mgroup_keys\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", | |
"\u001b[0;34m\u001b[0m \u001b[0mdropna\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", | |
"\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;31mDocstring:\u001b[0m\n", | |
"Groupby\n", | |
"\n", | |
"Parameters\n", | |
"----------\n", | |
"by : list-of-str or str\n", | |
" Column name(s) to form that groups by.\n", | |
"sort : bool, default True\n", | |
" Force sorting group keys.\n", | |
"as_index : bool, default True\n", | |
" Indicates whether the grouped by columns become the index\n", | |
" of the returned DataFrame\n", | |
"method : str, optional\n", | |
" A string indicating the method to use to perform the group by.\n", | |
" Valid values are \"hash\" or \"cudf\".\n", | |
" \"cudf\" method may be deprecated in the future, but is currently\n", | |
" the only method supporting group UDFs via the `apply` function.\n", | |
"dropna : bool, optional\n", | |
" If True (default), drop null keys.\n", | |
" If False, perform grouping by keys containing null(s).\n", | |
"\n", | |
"Returns\n", | |
"-------\n", | |
"The groupby object\n", | |
"\n", | |
"Notes\n", | |
"-----\n", | |
"No empty rows are returned. (For categorical keys, pandas returns\n", | |
"rows for all categories even if they are no corresponding values.)\n", | |
"\u001b[0;31mFile:\u001b[0m /opt/conda/envs/rapids/lib/python3.7/site-packages/cudf/core/dataframe.py\n", | |
"\u001b[0;31mType:\u001b[0m function\n" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"import cudf\n", | |
"\n", | |
"# Show the signature and docstring of cudf.DataFrame.groupby to check which\n", | |
"# `sort` default the installed cudf uses for the timing run below.\n", | |
"# NOTE(review): the recorded output lists `sort=False` in the signature while\n", | |
"# the docstring text says `default True` - presumably a locally modified cudf\n", | |
"# build was used for this section; confirm before trusting the comparison.\n", | |
"cudf.DataFrame.groupby?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Rebuild the same benchmark input as the first Dask run: integer keys 'a'\n", | |
"# drawn from [0, 10_000_000) and random values 'b'.\n", | |
"n_elem = 100_000_000\n", | |
"df = cudf.DataFrame()\n", | |
"df['a'] = np.random.randint(low=0, high=10_000_000, size=n_elem)\n", | |
"df['b'] = np.random.random(n_elem)\n", | |
"\n", | |
"# Split into 4 partitions and persist them on the cluster so the timed\n", | |
"# groupby below does not include data distribution.\n", | |
"ddf = dask_cudf.from_cudf(df, npartitions=4).persist()\n", | |
"wait(ddf)  # block until the persisted partitions are ready\n", | |
"\n", | |
"# The single-frame copy is no longer needed once the partitions exist.\n", | |
"del df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 152 ms, sys: 48 ms, total: 200 ms\n", | |
"Wall time: 2.82 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"9999528" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# Same aggregation as the earlier Dask timing cell, but request unsorted\n", | |
"# group keys explicitly so the measurement does not depend on whichever\n", | |
"# `sort` default the installed library happens to use.\n", | |
"# NOTE(review): needs a dask / dask_cudf version whose groupby accepts the\n", | |
"# `sort` keyword - confirm for the target environment. The recorded output\n", | |
"# below predates this change.\n", | |
"len(ddf.groupby(by=\"a\", sort=False).b.min())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Save the profile collected by the Dask client to a separate HTML file\n", | |
"# (presumably covering the unsorted groupby timed above - confirm).\n", | |
"client.profile(filename=\"groupby-prof-nosort.html\")" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment