quasiben/ridge-cuml-dask.ipynb

## ridge-cuml-dask.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dask ML and Gridsearch with cuML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from cuml import Ridge as cumlRidge\n",
    "import cudf\n",
    "from sklearn import datasets, linear_model\n",
    "from sklearn.externals.joblib import parallel_backend\n",
    "from sklearn.model_selection import train_test_split, GridSearchCV\n",
    "import dask_ml.model_selection as dcv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Use a DGX"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table style=\"border: 2px solid white;\">\n",
       "<tr>\n",
       "<td style=\"vertical-align: top; border: 0px solid white\">\n",
       "<h3>Client</h3>\n",
       "<ul>\n",
       "  <li><b>Scheduler: </b>tcp://127.0.0.1:36485\n",
       "  <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a>\n",
       "</ul>\n",
       "</td>\n",
       "<td style=\"vertical-align: top; border: 0px solid white\">\n",
       "<h3>Cluster</h3>\n",
       "<ul>\n",
       "  <li><b>Workers: </b>8</li>\n",
       "  <li><b>Cores: </b>8</li>\n",
       "  <li><b>Memory: </b>540.96 GB</li>\n",
       "</ul>\n",
       "</td>\n",
       "</tr>\n",
       "</table>"
      ],
      "text/plain": [
       "<Client: scheduler='tcp://127.0.0.1:36485' processes=8 cores=8>"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from dask.distributed import Client\n",
    "from dask_cuda import LocalCUDACluster\n",
    "\n",
    "# Start one worker per GPU on the local system\n",
    "cluster = LocalCUDACluster()\n",
    "client = Client(cluster)\n",
    "client"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Diabetes Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "diabetes = datasets.load_diabetes()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "diabetes.feature_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.03807591,  0.05068012,  0.06169621,  0.02187235, -0.0442235 ,\n",
       "       -0.03482076, -0.04340085, -0.00259226,  0.01990842, -0.01764613])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# row of data\n",
    "diabetes.data[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fit Data with Ridge Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the data into training/testing sets.\n",
    "# A fixed random_state keeps the split (and every model fitted on it) identical across Restart & Run All.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    diabetes.data, diabetes.target, test_size=0.2, random_state=0\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.02824"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# data in MB\n",
    "X_train.nbytes/1e6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "fit_intercept = True\n",
    "normalize = False\n",
    "alpha = np.array([1.0]) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "ridge = linear_model.Ridge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver='cholesky')\n",
    "cu_ridge = cumlRidge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver=\"eig\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 176 ms, sys: 44 ms, total: 220 ms\n",
      "Wall time: 28 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Ridge(alpha=array([1.]), copy_X=True, fit_intercept=True, max_iter=None,\n",
       "   normalize=False, random_state=None, solver='cholesky', tol=0.001)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "ridge.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 7.93 s, sys: 184 ms, total: 8.12 s\n",
      "Wall time: 1.12 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<cuml.linear_model.ridge.Ridge at 0x7f83e017d1d0>"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "cu_ridge.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Verify Output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.testing.assert_allclose(cu_ridge.coef_.to_array(), ridge.coef_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Increase Data Size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Duplicated data in memory: 2824.0 MB\n"
     ]
    }
   ],
   "source": [
    "dup_data = np.array(np.vstack([X_train]*int(1e5)))\n",
    "dup_train = np.array(np.hstack([y_train]*int(1e5)))\n",
    "print(f'Duplicated data in memory: {dup_data.nbytes / 1e6} MB')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "dup_ridge = linear_model.Ridge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver='cholesky')\n",
    "dup_cu_ridge = cumlRidge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver=\"eig\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Data onto GPU"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 13.6 s, sys: 7.26 s, total: 20.9 s\n",
      "Wall time: 19.7 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "record_data = (('fea%d'%i, dup_data[:,i]) for i in range(dup_data.shape[1]))\n",
    "gdf_data = cudf.DataFrame(record_data)\n",
    "gdf_train = cudf.DataFrame(dict(train=dup_train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4.82 s ± 694 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "dup_ridge.fit(dup_data, dup_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "450 ms ± 47.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "dup_cu_ridge.fit(gdf_data, gdf_train.train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Verify Output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "dup_ridge.fit(dup_data, dup_train)\n",
    "dup_cu_ridge.fit(gdf_data, gdf_train.train)\n",
    "np.testing.assert_allclose(dup_cu_ridge.coef_.to_array(), dup_ridge.coef_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Hyperparameter Optimization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "params = {'alpha': np.logspace(-3, -1, 10)}\n",
    "clf = linear_model.Ridge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver='cholesky')\n",
    "cu_clf = cumlRidge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver=\"eig\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "88.4 ms ± 6.11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "sk_grid = GridSearchCV(clf, params, scoring='r2', cv=5, iid=False)\n",
    "sk_grid.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6.51 s ± 132 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "sk_cu_grid = GridSearchCV(cu_clf, params, scoring='r2', cv=5, iid=False)\n",
    "sk_cu_grid.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Swap Sklearn Gridsearch with DaskML Gridsearch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The slowest run took 39.07 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
      "873 ms ± 347 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "grid = dcv.GridSearchCV(clf, params, scoring='r2', cv=5)\n",
    "grid.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "740 ms ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "# Use the same 5-fold setup as the scikit-learn Ridge cell above so the two timings are comparable.\n",
    "cu_grid = dcv.GridSearchCV(cu_clf, params, scoring='r2', cv=5)\n",
    "cu_grid.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Two Dup Data: 2.824 MB\n",
      "Three Dup Data: 28.24 MB\n"
     ]
    }
   ],
   "source": [
    "two_dup_data = np.array(np.vstack([X_train]*int(1e2)))\n",
    "two_dup_train = np.array(np.hstack([y_train]*int(1e2)))\n",
    "three_dup_data = np.array(np.vstack([X_train]*int(1e3)))\n",
    "three_dup_train = np.array(np.hstack([y_train]*int(1e3)))\n",
    "print(f'Two Dup Data: {two_dup_data.nbytes / 1e6} MB\\nThree Dup Data: {three_dup_data.nbytes / 1e6} MB')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 13 s, sys: 1 s, total: 14 s\n",
      "Wall time: 1min 17s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "cu_grid = dcv.GridSearchCV(cu_clf, params, scoring='r2', cv=5)\n",
    "cu_grid.fit(two_dup_data, two_dup_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 53.8 s, sys: 9.07 s, total: 1min 2s\n",
      "Wall time: 12min 50s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "cu_grid = dcv.GridSearchCV(cu_clf, params, scoring='r2', cv=5)\n",
    "cu_grid.fit(three_dup_data, three_dup_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.41 s, sys: 244 ms, total: 1.65 s\n",
      "Wall time: 13.8 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# CPU baseline on the 1000x data: fit against the targets (three_dup_train), not the feature matrix.\n",
    "grid = dcv.GridSearchCV(clf, params, scoring='r2', cv=5)\n",
    "grid.fit(three_dup_data, three_dup_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Grid search over the full 100,000x duplicated data; cv=5 matches the other grid searches in this notebook.\n",
    "with parallel_backend('dask', scatter=[dup_data, dup_train]):\n",
    "    cu_grid = dcv.GridSearchCV(cu_clf, params, scoring='r2', cv=5)\n",
    "    cu_grid.fit(dup_data, dup_train)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:cudf-dev]",
   "language": "python",
   "name": "conda-env-cudf-dev-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Dask ML and Gridsearch with cuML"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"from cuml import Ridge as cumlRidge\n",
	"import cudf\n",
	"from sklearn import datasets, linear_model\n",
	"from sklearn.externals.joblib import parallel_backend\n",
	"from sklearn.model_selection import train_test_split, GridSearchCV\n",
	"import dask_ml.model_selection as dcv"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Use a DGX"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<table style=\"border: 2px solid white;\">\n",
	"<tr>\n",
	"<td style=\"vertical-align: top; border: 0px solid white\">\n",
	"<h3>Client</h3>\n",
	"<ul>\n",
	" <li><b>Scheduler: </b>tcp://127.0.0.1:36485\n",
	" <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a>\n",
	"</ul>\n",
	"</td>\n",
	"<td style=\"vertical-align: top; border: 0px solid white\">\n",
	"<h3>Cluster</h3>\n",
	"<ul>\n",
	" <li><b>Workers: </b>8</li>\n",
	" <li><b>Cores: </b>8</li>\n",
	" <li><b>Memory: </b>540.96 GB</li>\n",
	"</ul>\n",
	"</td>\n",
	"</tr>\n",
	"</table>"
	],
	"text/plain": [
	"<Client: scheduler='tcp://127.0.0.1:36485' processes=8 cores=8>"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from dask.distributed import Client\n",
	"from dask_cuda import LocalCUDACluster\n",
	"\n",
	"# Start one worker per GPU on the local system\n",
	"cluster = LocalCUDACluster()\n",
	"client = Client(cluster)\n",
	"client"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Load Diabetes Data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"diabetes = datasets.load_diabetes()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"diabetes.feature_names"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([ 0.03807591, 0.05068012, 0.06169621, 0.02187235, -0.0442235 ,\n",
	" -0.03482076, -0.04340085, -0.00259226, 0.01990842, -0.01764613])"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# row of data\n",
	"diabetes.data[0]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Fit Data with Ridge Regression"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Split the data into training/testing sets.\n",
	"# A fixed random_state keeps the split (and every model fitted on it) identical across Restart & Run All.\n",
	"X_train, X_test, y_train, y_test = train_test_split(\n",
	"    diabetes.data, diabetes.target, test_size=0.2, random_state=0\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.02824"
	]
	},
	"execution_count": 7,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# data in MB\n",
	"X_train.nbytes/1e6"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [],
	"source": [
	"fit_intercept = True\n",
	"normalize = False\n",
	"alpha = np.array([1.0]) "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [],
	"source": [
	"ridge = linear_model.Ridge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver='cholesky')\n",
	"cu_ridge = cumlRidge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver=\"eig\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 176 ms, sys: 44 ms, total: 220 ms\n",
	"Wall time: 28 ms\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"Ridge(alpha=array([1.]), copy_X=True, fit_intercept=True, max_iter=None,\n",
	" normalize=False, random_state=None, solver='cholesky', tol=0.001)"
	]
	},
	"execution_count": 10,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%time\n",
	"ridge.fit(X_train, y_train)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 7.93 s, sys: 184 ms, total: 8.12 s\n",
	"Wall time: 1.12 s\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"<cuml.linear_model.ridge.Ridge at 0x7f83e017d1d0>"
	]
	},
	"execution_count": 11,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"%%time\n",
	"cu_ridge.fit(X_train, y_train)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Verify Output"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [],
	"source": [
	"np.testing.assert_allclose(cu_ridge.coef_.to_array(), ridge.coef_)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Increase Data Size"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Duplicated data in memory: 2824.0 MB\n"
	]
	}
	],
	"source": [
	"dup_data = np.array(np.vstack([X_train]*int(1e5)))\n",
	"dup_train = np.array(np.hstack([y_train]*int(1e5)))\n",
	"print(f'Duplicated data in memory: {dup_data.nbytes / 1e6} MB')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {},
	"outputs": [],
	"source": [
	"dup_ridge = linear_model.Ridge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver='cholesky')\n",
	"dup_cu_ridge = cumlRidge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver=\"eig\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Load Data onto GPU"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 13.6 s, sys: 7.26 s, total: 20.9 s\n",
	"Wall time: 19.7 s\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"record_data = (('fea%d'%i, dup_data[:,i]) for i in range(dup_data.shape[1]))\n",
	"gdf_data = cudf.DataFrame(record_data)\n",
	"gdf_train = cudf.DataFrame(dict(train=dup_train))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"4.82 s ± 694 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"dup_ridge.fit(dup_data, dup_train)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"450 ms ± 47.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"dup_cu_ridge.fit(gdf_data, gdf_train.train)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Verify Output"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {},
	"outputs": [],
	"source": [
	"dup_ridge.fit(dup_data, dup_train)\n",
	"dup_cu_ridge.fit(gdf_data, gdf_train.train)\n",
	"np.testing.assert_allclose(dup_cu_ridge.coef_.to_array(), dup_ridge.coef_)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Hyperparameter Optimization"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {},
	"outputs": [],
	"source": [
	"params = {'alpha': np.logspace(-3, -1, 10)}\n",
	"clf = linear_model.Ridge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver='cholesky')\n",
	"cu_clf = cumlRidge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver=\"eig\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"88.4 ms ± 6.11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"sk_grid = GridSearchCV(clf, params, scoring='r2', cv=5, iid=False)\n",
	"sk_grid.fit(X_train, y_train)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"6.51 s ± 132 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"sk_cu_grid = GridSearchCV(cu_clf, params, scoring='r2', cv=5, iid=False)\n",
	"sk_cu_grid.fit(X_train, y_train)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Swap Sklearn Gridsearch with DaskML Gridsearch"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 25,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"The slowest run took 39.07 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
	"873 ms ± 347 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"grid = dcv.GridSearchCV(clf, params, scoring='r2', cv=5)\n",
	"grid.fit(X_train, y_train)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 31,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"740 ms ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"# Use the same 5-fold setup as the scikit-learn Ridge cell above so the two timings are comparable.\n",
	"cu_grid = dcv.GridSearchCV(cu_clf, params, scoring='r2', cv=5)\n",
	"cu_grid.fit(X_train, y_train)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Two Dup Data: 2.824 MB\n",
	"Three Dup Data: 28.24 MB\n"
	]
	}
	],
	"source": [
	"two_dup_data = np.array(np.vstack([X_train]*int(1e2)))\n",
	"two_dup_train = np.array(np.hstack([y_train]*int(1e2)))\n",
	"three_dup_data = np.array(np.vstack([X_train]*int(1e3)))\n",
	"three_dup_train = np.array(np.hstack([y_train]*int(1e3)))\n",
	"print(f'Two Dup Data: {two_dup_data.nbytes / 1e6} MB\\nThree Dup Data: {three_dup_data.nbytes / 1e6} MB')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 28,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 13 s, sys: 1 s, total: 14 s\n",
	"Wall time: 1min 17s\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"cu_grid = dcv.GridSearchCV(cu_clf, params, scoring='r2', cv=5)\n",
	"cu_grid.fit(two_dup_data, two_dup_train)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 29,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 53.8 s, sys: 9.07 s, total: 1min 2s\n",
	"Wall time: 12min 50s\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"cu_grid = dcv.GridSearchCV(cu_clf, params, scoring='r2', cv=5)\n",
	"cu_grid.fit(three_dup_data, three_dup_train)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 30,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 1.41 s, sys: 244 ms, total: 1.65 s\n",
	"Wall time: 13.8 s\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"# CPU baseline on the 1000x data: fit against the targets (three_dup_train), not the feature matrix.\n",
	"grid = dcv.GridSearchCV(clf, params, scoring='r2', cv=5)\n",
	"grid.fit(three_dup_data, three_dup_train)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"%%time\n",
	"# Grid search over the full 100,000x duplicated data; cv=5 matches the other grid searches in this notebook.\n",
	"with parallel_backend('dask', scatter=[dup_data, dup_train]):\n",
	" cu_grid = dcv.GridSearchCV(cu_clf, params, scoring='r2', cv=5)\n",
	" cu_grid.fit(dup_data, dup_train)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python [conda env:cudf-dev]",
	"language": "python",
	"name": "conda-env-cudf-dev-py"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.1"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}