Created
April 25, 2020 16:46
-
-
Save rjzamora/06e03f89dff7bf4dbe941492d0c6c989 to your computer and use it in GitHub Desktop.
Experiments for caching cudf DataFrame objects that will be re-read many times
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import cudf\n", | |
"import pickle\n", | |
"from io import BytesIO\n", | |
"from dask.utils import parse_bytes\n", | |
"from dask_cuda.device_host_file import DeviceHostFile\n", | |
"\n", | |
"path = \"/datasets/criteo/rzamora/crit_pq_int_2gb_split/ds_part.217.parquet\"" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Reading parquet from file (disk)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"233 ms ± 1.05 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit cudf.io.read_parquet(path)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Reading parquet from `BytesIO` (CPU memory) object" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 2.58 ms, sys: 251 ms, total: 253 ms\n", | |
"Wall time: 252 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"mem_file = None\n", | |
"with open(path, \"rb\") as f:\n", | |
" mem_file = BytesIO(f.read())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"241 ms ± 19.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit cudf.io.read_parquet(mem_file)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### From pandas (CPU memory) object" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 940 ms, sys: 1.1 s, total: 2.04 s\n", | |
"Wall time: 2.04 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"pdf = cudf.io.read_parquet(path).to_pandas()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"418 ms ± 2.43 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit cudf.from_pandas(pdf)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### From Arrow (CPU memory) object" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 535 ms, sys: 620 ms, total: 1.16 s\n", | |
"Wall time: 1.15 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"adf = cudf.io.read_parquet(path).to_arrow()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"443 ms ± 5.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit cudf.DataFrame.from_arrow(adf)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### From pickled bytes (CPU memory) object" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1.55 s, sys: 2.56 s, total: 4.11 s\n", | |
"Wall time: 4.11 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# Store the pickled bytes under their own name so the pandas `pdf`\n", | |
"# created in the earlier cell is not overwritten (keeps cells re-runnable).\n", | |
"pickled_gdf = pickle.dumps(cudf.io.read_parquet(path))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"2.03 s ± 265 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit pickle.loads(pickled_gdf)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Using `DeviceHostFile`" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 499 ms, sys: 647 ms, total: 1.15 s\n", | |
"Wall time: 1.15 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"\n", | |
"dhf = DeviceHostFile(\n", | |
" device_memory_limit=parse_bytes(\"0GB\"),\n", | |
" memory_limit=parse_bytes(\"128GB\"),\n", | |
" local_directory=\"/raid/dask_space/rzamora/scratch\",\n", | |
")\n", | |
"dhf[0] = cudf.io.read_parquet(path)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"360 ms ± 1.83 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%timeit dhf.get(0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment