Skip to content

Instantly share code, notes, and snippets.

@rjzamora
Created April 25, 2020 16:46
Show Gist options
  • Save rjzamora/06e03f89dff7bf4dbe941492d0c6c989 to your computer and use it in GitHub Desktop.
Save rjzamora/06e03f89dff7bf4dbe941492d0c6c989 to your computer and use it in GitHub Desktop.
Experiments for caching cudf DataFrame objects that will be re-read many times
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import cudf\n",
"import pickle\n",
"from io import BytesIO\n",
"from dask.utils import parse_bytes\n",
"from dask_cuda.device_host_file import DeviceHostFile\n",
"\n",
"path = \"/datasets/criteo/rzamora/crit_pq_int_2gb_split/ds_part.217.parquet\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Reading parquet from file (disk)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"233 ms ± 1.05 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%timeit cudf.io.read_parquet(path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Reading parquet from `BytesIO` (CPU memory) object"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2.58 ms, sys: 251 ms, total: 253 ms\n",
"Wall time: 252 ms\n"
]
}
],
"source": [
"%%time\n",
"mem_file = None\n",
"with open(path, \"rb\") as f:\n",
" mem_file = BytesIO(f.read())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"241 ms ± 19.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%timeit cudf.io.read_parquet(mem_file)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### From Pandas (CPU memory) object"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 940 ms, sys: 1.1 s, total: 2.04 s\n",
"Wall time: 2.04 s\n"
]
}
],
"source": [
"%%time\n",
"pdf = cudf.io.read_parquet(path).to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"418 ms ± 2.43 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%timeit cudf.from_pandas(pdf)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### From Arrow (CPU memory) object"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 535 ms, sys: 620 ms, total: 1.16 s\n",
"Wall time: 1.15 s\n"
]
}
],
"source": [
"%%time\n",
"adf = cudf.io.read_parquet(path).to_arrow()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"443 ms ± 5.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%timeit cudf.DataFrame.from_arrow(adf)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### From Pickled bytes (CPU memory) object"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.55 s, sys: 2.56 s, total: 4.11 s\n",
"Wall time: 4.11 s\n"
]
}
],
"source": [
"%%time\n",
"# Keep the pickled payload under its own name so the pandas frame `pdf`\n",
"# created earlier is not overwritten with bytes.\n",
"pickled_bytes = pickle.dumps(cudf.io.read_parquet(path))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2.03 s ± 265 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%timeit pickle.loads(pickled_bytes)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Using `DeviceHostFile`"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 499 ms, sys: 647 ms, total: 1.15 s\n",
"Wall time: 1.15 s\n"
]
}
],
"source": [
"%%time\n",
"\n",
"dhf = DeviceHostFile(\n",
" device_memory_limit=parse_bytes(\"0GB\"),\n",
" memory_limit=parse_bytes(\"128GB\"),\n",
" local_directory=\"/raid/dask_space/rzamora/scratch\",\n",
")\n",
"dhf[0] = cudf.io.read_parquet(path)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"360 ms ± 1.83 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%timeit dhf.get(0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment