Created
February 11, 2020 01:51
-
-
Save ayushdg/e276b8f3118160c3d9aacaf2b754911c to your computer and use it in GitHub Desktop.
Groupby sort test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Dask Cudf Tests" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import cudf\n", | |
"import dask\n", | |
"import dask_cudf\n", | |
"from dask import dataframe\n", | |
"from dask_cuda import LocalCUDACluster\n", | |
"from distributed import Client, wait" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<table style=\"border: 2px solid white;\">\n", | |
"<tr>\n", | |
"<td style=\"vertical-align: top; border: 0px solid white\">\n", | |
"<h3 style=\"text-align: left;\">Client</h3>\n", | |
"<ul style=\"text-align: left; list-style: none; margin: 0; padding: 0;\">\n", | |
" <li><b>Scheduler: </b>tcp://127.0.0.1:33414</li>\n", | |
" <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a>\n", | |
"</ul>\n", | |
"</td>\n", | |
"<td style=\"vertical-align: top; border: 0px solid white\">\n", | |
"<h3 style=\"text-align: left;\">Cluster</h3>\n", | |
"<ul style=\"text-align: left; list-style:none; margin: 0; padding: 0;\">\n", | |
" <li><b>Workers: </b>4</li>\n", | |
" <li><b>Cores: </b>4</li>\n", | |
" <li><b>Memory: </b>404.34 GB</li>\n", | |
"</ul>\n", | |
"</td>\n", | |
"</tr>\n", | |
"</table>" | |
], | |
"text/plain": [ | |
"<Client: 'tcp://127.0.0.1:33414' processes=4 threads=4, memory=404.34 GB>" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"cluster = LocalCUDACluster()\n", | |
"client = Client(cluster)\n", | |
"client" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"- Create a dask dataframe with 1 partition.\n", | |
"- The number of groups should be on the order of `n_elem // 2` (approximately)." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import cudf\n", | |
"df = cudf.DataFrame()\n", | |
"\n", | |
"n_elem = 100_000_000\n", | |
"df['a'] = np.random.randint(0,n_elem // 2,n_elem)\n", | |
"df['b'] = np.random.random(n_elem)\n", | |
"ddf = dask_cudf.from_cudf(df,npartitions=1)\n", | |
"ddf = ddf.persist()\n", | |
"wait(ddf)\n", | |
"del(df)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Case 1: Multiple aggs\n", | |
"\n", | |
"- `sort=True`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/opt/conda/envs/rapids/lib/python3.7/site-packages/cudf/core/dataframe.py:387: UserWarning: Columns may not be added to a DataFrame using a new attribute name. A new attribute will be created: 'multi_cols'\n", | |
" UserWarning,\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"4.67 s ± 12.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"x = ddf.groupby(by=\"a\", sort=True).agg({'b':[\"count\", \"max\"]})\n", | |
"x = x.persist()\n", | |
"wait(x)\n", | |
"del(x)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"- `sort=False`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"8.77 s ± 14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"x = ddf.groupby(by=\"a\", sort=False).agg({'b':[\"count\", \"max\"]})\n", | |
"x = x.persist()\n", | |
"wait(x)\n", | |
"del(x)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Not relevant to the issue, but since there is only 1 partition, compare against plain cuDF via `map_partitions`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def gbyfunc(df):\n", | |
" return df.groupby(by=\"a\", sort=True).agg({'b':[\"count\", \"max\"]})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1.74 s ± 10.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"x = ddf.map_partitions(gbyfunc)\n", | |
"x = x.persist()\n", | |
"wait(x)\n", | |
"del(x)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Case 2: Mean\n", | |
"\n", | |
"- `sort=True`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"4.34 s ± 6.71 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"x = ddf.groupby(by=\"a\", sort=True).b.mean()\n", | |
"x = x.persist()\n", | |
"wait(x)\n", | |
"del(x)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"- `sort=False`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"6.61 s ± 14.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"x = ddf.groupby(by=\"a\", sort=False).b.mean()\n", | |
"x = x.persist()\n", | |
"wait(x)\n", | |
"del(x)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### The expected behavior is visible for a single aggregation other than `mean`\n", | |
"\n", | |
"- `sort=True`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"2.17 s ± 11.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"x = ddf.groupby(by=\"a\", sort=True).b.count()\n", | |
"x = x.persist()\n", | |
"wait(x)\n", | |
"del(x)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"- `sort=False`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1.35 s ± 2.91 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"x = ddf.groupby(by=\"a\", sort=False).b.count()\n", | |
"x = x.persist()\n", | |
"wait(x)\n", | |
"del(x)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Sorting behavior differences\n", | |
"\n", | |
"- The output below is not sorted (as expected)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"a\n", | |
"44945698 3\n", | |
"36588377 3\n", | |
"42946380 3\n", | |
"17303107 1\n", | |
"47236603 4\n", | |
"Name: b, dtype: int64" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ddf.groupby(by=\"a\", sort=False).b.count().head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"- The output below is sorted (unexpected). Also, the index name `a` is lost (though that might be a separate issue)." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0 0.340670\n", | |
"2 0.453529\n", | |
"4 0.884728\n", | |
"5 0.117211\n", | |
"6 0.524516\n", | |
"Name: b, dtype: float64" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ddf.groupby(by=\"a\", sort=False).b.mean().head()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment