Skip to content

Instantly share code, notes, and snippets.

@crusaderky
Last active July 16, 2018 22:32
Show Gist options
  • Save crusaderky/89819258ff960d06136d45526f7d05db to your computer and use it in GitHub Desktop.
Save crusaderky/89819258ff960d06136d45526f7d05db to your computer and use it in GitHub Desktop.
to_csv benchmark
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<xarray.DataArray 'mul-a39f522da57bff7c1e435f9a642d1171' (r: 25000, c: 750)>\n",
"dask.array<shape=(25000, 750), dtype=float64, chunksize=(3125, 750)>\n",
"Coordinates:\n",
" * r (r) <U9 'row 0' 'row 1' 'row 2' 'row 3' 'row 4' 'row 5' 'row 6' ...\n",
" * c (c) <U7 'col 0' 'col 1' 'col 2' 'col 3' 'col 4' 'col 5' 'col 6' ..."
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import xarray\n",
"import dask.array as da\n",
"from xarray_extras.csv import to_csv\n",
"\n",
"# Benchmark input size. Production runs use ROWS = 500000;\n",
"# the smaller value below is for a quick test.\n",
"# ROWS = 500000\n",
"ROWS = 25000\n",
"COLS = 750\n",
"\n",
"# Random data scaled by 1e6, chunked along 'r' in blocks of ROWS // 8 rows;\n",
"# chunk size -1 along 'c' keeps each row whole within a chunk.\n",
"# Coordinate labels are derived from ROWS / COLS so that they always match\n",
"# the shape of the data, whichever size is selected above.\n",
"a = xarray.DataArray(da.random.random((ROWS, COLS), chunks=(ROWS // 8, -1)) * 1e6,\n",
"                     dims=['r', 'c'],\n",
"                     coords={'r': ['row %d' % i for i in range(ROWS)],\n",
"                             'c': ['col %d' % i for i in range(COLS)]})\n",
"a"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# Start from a clean slate: remove output left behind by earlier runs.\n",
"# The first benchmark writes the single file /dev/shm/v1.csv.gz, the second\n",
"# writes into the directory /dev/shm/v2. Paths are spelled out explicitly\n",
"# instead of using {a,b} brace expansion, which not every shell supports and\n",
"# which IPython may try to interpolate as a Python expression.\n",
"!rm -rf /dev/shm/v1 /dev/shm/v1.csv.gz /dev/shm/v2\n",
"!mkdir -p /dev/shm/v2"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 6 ms, sys: 5 ms, total: 11 ms\n",
"Wall time: 10.9 ms\n"
]
}
],
"source": [
"%%time\n",
"# Timing covers only this call; the object it returns is computed in the next cell.\n",
"future1 = to_csv(\n",
"    a,\n",
"    '/dev/shm/v1.csv.gz',\n",
"    compression='gzip',\n",
"    float_format='%.2f',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 205 ms, sys: 2.89 s, total: 3.1 s\n",
"Wall time: 10 s\n"
]
}
],
"source": [
"%%time\n",
"# Run the to_csv work prepared in the previous cell; timed separately from its setup.\n",
"future1.compute()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 5.72 s, sys: 0 ns, total: 5.72 s\n",
"Wall time: 5.77 s\n"
]
}
],
"source": [
"%%time\n",
"# Same data through the dask dataframe writer. compute=False defers the\n",
"# actual work, which is run by dask.compute(*future2) in the next cell.\n",
"future2 = (\n",
"    a.to_dataset('c')\n",
"    .to_dask_dataframe()\n",
"    .to_csv('/dev/shm/v2', compression='gzip', float_format='%.2f',\n",
"            index=False, compute=False)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2min 7s, sys: 18.4 s, total: 2min 25s\n",
"Wall time: 1min 57s\n"
]
}
],
"source": [
"%%time\n",
"# Run the deferred work held in future2; it is unpacked so that each element\n",
"# is passed to dask.compute as a separate argument.\n",
"import dask\n",
"dask.compute(*future2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment