Skip to content

Instantly share code, notes, and snippets.

@kayush2O6
Last active March 14, 2019 05:12
Show Gist options
  • Save kayush2O6/c004868c6cdee15698d2582b13a671e7 to your computer and use it in GitHub Desktop.
Save kayush2O6/c004868c6cdee15698d2582b13a671e7 to your computer and use it in GitHub Desktop.
Final version with suggested solution
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table style=\"border: 2px solid white;\">\n",
"<tr>\n",
"<td style=\"vertical-align: top; border: 0px solid white\">\n",
"<h3>Client</h3>\n",
"<ul>\n",
" <li><b>Scheduler: </b>tcp://127.0.0.1:37180\n",
" <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a>\n",
"</ul>\n",
"</td>\n",
"<td style=\"vertical-align: top; border: 0px solid white\">\n",
"<h3>Cluster</h3>\n",
"<ul>\n",
" <li><b>Workers: </b>2</li>\n",
" <li><b>Cores: </b>2</li>\n",
" <li><b>Memory: </b>236.66 GB</li>\n",
"</ul>\n",
"</td>\n",
"</tr>\n",
"</table>"
],
"text/plain": [
"<Client: scheduler='tcp://127.0.0.1:37180' processes=2 cores=2>"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from dask.distributed import Client, wait\n",
"from dask_cuda import LocalCUDACluster\n",
"from dask.delayed import delayed\n",
"cluster = LocalCUDACluster()\n",
"client = Client(cluster)\n",
"client"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import cudf\n",
"import numpy as np\n",
"from librmm_cffi import librmm"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def read_table(filename):\n",
"    \"\"\"Read a tab-delimited file with cudf's string-column CSV reader.\n",
"\n",
"    The first row of the file is skipped and the four columns are\n",
"    labelled 'a' through 'd'.\n",
"\n",
"    Returns a ``(columns, names)`` tuple: whatever ``read_csv_strings``\n",
"    produced, plus the list of column labels.\n",
"    \"\"\"\n",
"    column_names = ['a', 'b', 'c', 'd']\n",
"    column_types = ['int', 'str', 'str', 'str']\n",
"    parsed = cudf.io.csv.read_csv_strings(\n",
"        filename,\n",
"        delimiter='\\t',\n",
"        names=column_names,\n",
"        dtype=column_types,\n",
"        skiprows=1,\n",
"    )\n",
"    return (parsed, column_names)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def prepare_df(ret, item=0):\n",
"    \"\"\"Build a float32 cudf DataFrame from one row of list-valued columns.\n",
"\n",
"    ``ret`` is the ``(columns, names)`` pair returned by ``read_table``.\n",
"    Columns 1 to 3 hold bracketed, comma-separated lists such as\n",
"    ``[0.4, 0.2, 0.9]``; the list found in row ``item`` of each of those\n",
"    columns becomes one DataFrame column named after it.\n",
"    \"\"\"\n",
"    cols = ret[0]\n",
"    names = ret[1]\n",
"\n",
"    def list_tokens(col):\n",
"        # Select the requested row, drop the enclosing brackets and split\n",
"        # the remaining text on commas.\n",
"        parts = col.sublist([item]).lstrip('[').rstrip(']').split(',')[0]\n",
"        # Lists written as '0.4, 0.2' leave a leading space on every token\n",
"        # after the first; the recorded run shows those tokens converted\n",
"        # to 0.0, so trim surrounding whitespace before conversion.\n",
"        return parts.strip()\n",
"\n",
"    # The buffer length is taken from the first list column, as before.\n",
"    size = list_tokens(cols[1]).size()\n",
"    gdf = cudf.dataframe.DataFrame()\n",
"    for i in range(1, 4):\n",
"        float_array = librmm.device_array(size, dtype=np.float32)\n",
"        # stof() writes the parsed floats into the device buffer whose\n",
"        # address is passed in.\n",
"        list_tokens(cols[i]).stof(float_array.device_ctypes_pointer.value)\n",
"        gdf[names[i]] = cudf.Series(float_array)\n",
"    return gdf"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"ret = delayed(read_table)(\"foo.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"b'\\x80\\x03cdask.delayed\\nDelayed\\nq\\x00)\\x81q\\x01X/\\x00\\x00\\x00read_table-bcd5ed0b-4913-4888-adda-7a2576fef250q\\x02cdask.highlevelgraph\\nHighLevelGraph\\nq\\x03)\\x81q\\x04}q\\x05(X\\x06\\x00\\x00\\x00layersq\\x06}q\\x07h\\x02}q\\x08h\\x02c__main__\\nread_table\\nq\\tX\\x07\\x00\\x00\\x00foo.csvq\\n\\x86q\\x0bssX\\x0c\\x00\\x00\\x00dependenciesq\\x0c}q\\rh\\x02cbuiltins\\nset\\nq\\x0e]q\\x0f\\x85q\\x10Rq\\x11subN\\x87q\\x12b.'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pickle\n",
"pickle.dumps(ret)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"gdf1 = delayed(prepare_df)(ret)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"b'\\x80\\x03cdask.delayed\\nDelayed\\nq\\x00)\\x81q\\x01X/\\x00\\x00\\x00prepare_df-4770a828-e3be-4814-80a5-bcece29e1136q\\x02cdask.highlevelgraph\\nHighLevelGraph\\nq\\x03)\\x81q\\x04}q\\x05(X\\x06\\x00\\x00\\x00layersq\\x06}q\\x07(h\\x02}q\\x08h\\x02c__main__\\nprepare_df\\nq\\tX/\\x00\\x00\\x00read_table-bcd5ed0b-4913-4888-adda-7a2576fef250q\\n\\x86q\\x0bsh\\n}q\\x0ch\\nc__main__\\nread_table\\nq\\rX\\x07\\x00\\x00\\x00foo.csvq\\x0e\\x86q\\x0fsuX\\x0c\\x00\\x00\\x00dependenciesq\\x10}q\\x11(h\\x02cbuiltins\\nset\\nq\\x12]q\\x13h\\na\\x85q\\x14Rq\\x15h\\nh\\x12]q\\x16\\x85q\\x17Rq\\x18uubN\\x87q\\x19b.'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pickle.dumps(gdf1)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"image/svg+xml": [
"<svg height=\"396pt\" viewBox=\"0.00 0.00 113.71 396.21\" width=\"114pt\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
"<g class=\"graph\" id=\"graph0\" transform=\"scale(1 1) rotate(0) translate(4 392.2117)\">\n",
"<title>%3</title>\n",
"<polygon fill=\"#ffffff\" points=\"-4,4 -4,-392.2117 109.7099,-392.2117 109.7099,4 -4,4\" stroke=\"transparent\"/>\n",
"<!-- &#45;3842856131783809843 -->\n",
"<g class=\"node\" id=\"node1\">\n",
"<title>-3842856131783809843</title>\n",
"<polygon fill=\"none\" points=\"79.8549,-388.2117 25.8549,-388.2117 25.8549,-352.2117 79.8549,-352.2117 79.8549,-388.2117\" stroke=\"#000000\"/>\n",
"</g>\n",
"<!-- &#45;8716603847870901542 -->\n",
"<g class=\"node\" id=\"node2\">\n",
"<title>-8716603847870901542</title>\n",
"<ellipse cx=\"52.8549\" cy=\"-263.3568\" fill=\"none\" rx=\"52.7103\" ry=\"52.7103\" stroke=\"#000000\"/>\n",
"<text fill=\"#000000\" font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"52.8549\" y=\"-259.1568\">prepare_df</text>\n",
"</g>\n",
"<!-- &#45;8716603847870901542&#45;&gt;&#45;3842856131783809843 -->\n",
"<g class=\"edge\" id=\"edge1\">\n",
"<title>-8716603847870901542-&gt;-3842856131783809843</title>\n",
"<path d=\"M52.8549,-316.4712C52.8549,-325.2577 52.8549,-334.0554 52.8549,-341.9139\" fill=\"none\" stroke=\"#000000\"/>\n",
"<polygon fill=\"#000000\" points=\"49.355,-341.9584 52.8549,-351.9584 56.355,-341.9585 49.355,-341.9584\" stroke=\"#000000\"/>\n",
"</g>\n",
"<!-- &#45;4317651821274558598 -->\n",
"<g class=\"node\" id=\"node3\">\n",
"<title>-4317651821274558598</title>\n",
"<polygon fill=\"none\" points=\"79.8549,-174.5018 25.8549,-174.5018 25.8549,-138.5018 79.8549,-138.5018 79.8549,-174.5018\" stroke=\"#000000\"/>\n",
"</g>\n",
"<!-- &#45;4317651821274558598&#45;&gt;&#45;8716603847870901542 -->\n",
"<g class=\"edge\" id=\"edge2\">\n",
"<title>-4317651821274558598-&gt;-8716603847870901542</title>\n",
"<path d=\"M52.8549,-174.6283C52.8549,-181.887 52.8549,-190.7193 52.8549,-199.9642\" fill=\"none\" stroke=\"#000000\"/>\n",
"<polygon fill=\"#000000\" points=\"49.355,-200.1029 52.8549,-210.103 56.355,-200.103 49.355,-200.1029\" stroke=\"#000000\"/>\n",
"</g>\n",
"<!-- 8439453770685328357 -->\n",
"<g class=\"node\" id=\"node4\">\n",
"<title>8439453770685328357</title>\n",
"<ellipse cx=\"52.8549\" cy=\"-51.2509\" fill=\"none\" rx=\"51.003\" ry=\"51.003\" stroke=\"#000000\"/>\n",
"<text fill=\"#000000\" font-family=\"Times,serif\" font-size=\"14.00\" text-anchor=\"middle\" x=\"52.8549\" y=\"-47.0509\">read_table</text>\n",
"</g>\n",
"<!-- 8439453770685328357&#45;&gt;&#45;4317651821274558598 -->\n",
"<g class=\"edge\" id=\"edge3\">\n",
"<title>8439453770685328357-&gt;-4317651821274558598</title>\n",
"<path d=\"M52.8549,-102.6431C52.8549,-111.3967 52.8549,-120.1958 52.8549,-128.0709\" fill=\"none\" stroke=\"#000000\"/>\n",
"<polygon fill=\"#000000\" points=\"49.355,-128.1459 52.8549,-138.146 56.355,-128.146 49.355,-128.1459\" stroke=\"#000000\"/>\n",
"</g>\n",
"</g>\n",
"</svg>"
],
"text/plain": [
"<IPython.core.display.SVG object>"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gdf1.visualize(filename='gdf1.svg')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"gdf = gdf1.compute(scheduler='single-threaded')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>b</th>\n",
" <th>c</th>\n",
" <th>d</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.1</td>\n",
" <td>2.0</td>\n",
" <td>0.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2.2</td>\n",
" <td>3.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" b c d\n",
"0 1.1 2.0 0.4\n",
"1 2.2 3.0 0.0\n",
"2 3.0 4.0 0.0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gdf.to_pandas()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
We can make this file beautiful and searchable if this error is corrected: It looks like row 2 should actually have 1 column, instead of 7. in line 1.
a b c d
12 [1.1,2.2,3.0] [2,3,4] [0.4, 0.2, 0.9]
15 [3.1,4.1] [3,2] [0.6, 0.8]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment