Last active
June 1, 2021 06:49
-
-
Save GenevieveBuckley/f9f8219de5c052c3deb234cc44ebc0a2 to your computer and use it in GitHub Desktop.
Dask task graph handling costs on the client
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "6313a4df", | |
"metadata": {}, | |
"source": [ | |
"# Dask task graph handling costs on the client" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "08935a8f", | |
"metadata": {}, | |
"source": [ | |
"Example from the \"Doing Nothing Poorly: Accelerating Dask Scheduling\" workshop.\n", | |
"Dask Summit 2021.\n", | |
"\n", | |
"https://summit.dask.org/schedule/presentation/42/doing-nothing-poorly-accelerating-dask-scheduling/" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "59946940", | |
"metadata": {}, | |
"source": [ | |
"[Link to slides - click here!](https://docs.google.com/presentation/d/e/2PACX-1vQB5mHjhi3f3AvVvLpdsnNqCxQDDlvKTR8cfinQRG3AbIPLBUaiSu6TROSZh7HRcN0UZ2uak_C-Q7FJ/pub?start=false&loop=false&delayms=3000#slide=id.gd9b9e2cb51_0_107)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "3151ec96", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pickle\n", | |
"import dask\n", | |
"from dask.datasets import timeseries" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "29d55731", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"2021.04.1+25.ge9fdb758\n" | |
] | |
} | |
], | |
"source": [ | |
"print(dask.__version__)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "1f43d470", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 787 ms, sys: 925 ms, total: 1.71 s\n", | |
"Wall time: 655 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"# Create task graph on client\n", | |
"%time ddf = timeseries().shuffle(\"id\", shuffle=\"tasks\").head(compute=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "c5126df5", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 751 µs, sys: 377 µs, total: 1.13 ms\n", | |
"Wall time: 947 µs\n" | |
] | |
} | |
], | |
"source": [ | |
"# Optimize\n", | |
"%time ddf_opt, = dask.optimize(ddf)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "2348c87d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 4.21 ms, sys: 139 µs, total: 4.35 ms\n", | |
"Wall time: 4.25 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# Serialize\n", | |
"byte_total = 0\n", | |
"for k, v in ddf_opt.__dask_graph__().items():\n", | |
" byte_total += len(pickle.dumps(k)) + len(pickle.dumps(v))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "58749d07", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'29.79 kiB'" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Send to the scheduler\n", | |
"dask.utils.format_bytes(byte_total)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "f0c0a41a", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example from the "Doing Nothing Poorly: Accelerating Dask Scheduling" workshop
# Dask Summit 2021 | |
# Task graph handling costs on the client | |
import pickle | |
import dask | |
from dask.datasets import timeseries | |
# Create dask task graph | |
%time ddf = timeseries().shuffle("id", shuffle="tasks").head(compute=False) | |
# Wall time: 4.01 s | |
# Optimize | |
%time ddf_opt, = dask.optimize(ddf) | |
# Wall time: 1.3 s | |
# Serialize | |
byte_total = 0 | |
for k, v in ddf_opt.__dask_graph__().items(): | |
byte_total += len(pickle.dumps(k)) + len(pickle.dumps(v)) | |
# Wall time: 731 ms | |
# Send to the scheduler | |
dask.utils.format_bytes(byte_total) | |
# '15.88 MB' (Assume ~587 ms at 100MB/s) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment