quasiben/ridge-cuml-dask-timings.ipynb

## ridge-cuml-dask-timings.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Hyperparameter Optimization\n",
    "\n",
    "Find best hyperparameters (parameters which govern modelling (`alpha`).  How GridsearchCV works:\n",
    "\n",
    "> Exhaustive search over specified parameter values for an estimator.\n",
    "\n",
    "- building classifier for all parameter combinations (cuml)\n",
    "- randomly split data into test/train (cudf)\n",
    "- fit and record score each estimator (cuml)\n",
    "- best score (highest) is returned along with estimator with the best parameters\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Gridsearch with cuML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from cuml import Ridge as cumlRidge\n",
    "import cudf\n",
    "from sklearn import datasets, linear_model\n",
    "from sklearn.externals.joblib import parallel_backend\n",
    "from sklearn.model_selection import train_test_split, GridSearchCV\n",
    "import dask_ml.model_selection as dcv\n",
    "import sklearn"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Diabetes Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "diabetes = datasets.load_diabetes()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "diabetes.feature_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.03807591,  0.05068012,  0.06169621,  0.02187235, -0.0442235 ,\n",
       "       -0.03482076, -0.04340085, -0.00259226,  0.01990842, -0.01764613])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# row of data\n",
    "diabetes.data[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fit Data with Ridge Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the data into training/testing sets\n",
    "X_train, X_test, y_train, y_test = train_test_split(diabetes.data, diabetes.target, test_size=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.02824"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# data in MB\n",
    "X_train.nbytes/1e6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "fit_intercept = True\n",
    "normalize = False\n",
    "alpha = np.array([1.0]) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "ridge = linear_model.Ridge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver='cholesky')\n",
    "cu_ridge = cumlRidge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver=\"eig\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "646 µs ± 95 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "ridge.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4.82 ms ± 337 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "cu_ridge.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Verify Output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.testing.assert_allclose(cu_ridge.coef_.to_array(), ridge.coef_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Increase Data Size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Duplicated data in memory: 2824.0 MB\n"
     ]
    }
   ],
   "source": [
    "dup_data = np.array(np.vstack([X_train]*int(1e5)))\n",
    "dup_train = np.array(np.hstack([y_train]*int(1e5)))\n",
    "print(f'Duplicated data in memory: {dup_data.nbytes / 1e6} MB')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "dup_ridge = linear_model.Ridge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver='cholesky')\n",
    "dup_cu_ridge = cumlRidge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver=\"eig\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Data onto GPU"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 9.48 s, sys: 1.6 s, total: 11.1 s\n",
      "Wall time: 4.87 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "record_data = (('fea%d'%i, dup_data[:,i]) for i in range(dup_data.shape[1]))\n",
    "gdf_data = cudf.DataFrame(record_data)\n",
    "gdf_train = cudf.DataFrame(dict(train=dup_train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3.81 s ± 480 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "dup_ridge.fit(dup_data, dup_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "382 ms ± 1.21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "dup_cu_ridge.fit(gdf_data, gdf_train.train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Verify Output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "dup_ridge.fit(dup_data, dup_train)\n",
    "dup_cu_ridge.fit(gdf_data, gdf_train.train)\n",
    "np.testing.assert_allclose(dup_cu_ridge.coef_.to_array(), dup_ridge.coef_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "params = {'alpha': np.logspace(-3, -1, 10)}\n",
    "clf = linear_model.Ridge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver='cholesky')\n",
    "cu_clf = cumlRidge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver=\"eig\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4min 44s ± 4.02 s per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "sk_grid = GridSearchCV(clf, params, cv=5, iid=False)\n",
    "sk_grid.fit(dup_data, dup_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1min 27s ± 446 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "sk_cu_grid = GridSearchCV(cu_clf, params, cv=5, iid=False)\n",
    "sk_cu_grid.fit(gdf_data, gdf_train.train)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:rapids]",
   "language": "python",
   "name": "conda-env-rapids-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Hyperparameter Optimization\n",
	"\n",
	"Find best hyperparameters (parameters which govern modelling (`alpha`). How GridsearchCV works:\n",
	"\n",
	"> Exhaustive search over specified parameter values for an estimator.\n",
	"\n",
	"- building classifier for all parameter combinations (cuml)\n",
	"- randomly split data into test/train (cudf)\n",
	"- fit and record score each estimator (cuml)\n",
	"- best score (highest) is returned along with estimator with the best parameters\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Gridsearch with cuML"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"from cuml import Ridge as cumlRidge\n",
	"import cudf\n",
	"from sklearn import datasets, linear_model\n",
	"from sklearn.externals.joblib import parallel_backend\n",
	"from sklearn.model_selection import train_test_split, GridSearchCV\n",
	"import dask_ml.model_selection as dcv\n",
	"import sklearn"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Load Diabetes Data"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"diabetes = datasets.load_diabetes()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"diabetes.feature_names"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array([ 0.03807591, 0.05068012, 0.06169621, 0.02187235, -0.0442235 ,\n",
	" -0.03482076, -0.04340085, -0.00259226, 0.01990842, -0.01764613])"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# row of data\n",
	"diabetes.data[0]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Fit Data with Ridge Regression"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Split the data into training/testing sets\n",
	"X_train, X_test, y_train, y_test = train_test_split(diabetes.data, diabetes.target, test_size=0.2)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.02824"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# data in MB\n",
	"X_train.nbytes/1e6"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [],
	"source": [
	"fit_intercept = True\n",
	"normalize = False\n",
	"alpha = np.array([1.0]) "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [],
	"source": [
	"ridge = linear_model.Ridge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver='cholesky')\n",
	"cu_ridge = cumlRidge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver=\"eig\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"646 µs ± 95 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"ridge.fit(X_train, y_train)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"4.82 ms ± 337 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"cu_ridge.fit(X_train, y_train)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Verify Output"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [],
	"source": [
	"np.testing.assert_allclose(cu_ridge.coef_.to_array(), ridge.coef_)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Increase Data Size"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Duplicated data in memory: 2824.0 MB\n"
	]
	}
	],
	"source": [
	"dup_data = np.array(np.vstack([X_train]*int(1e5)))\n",
	"dup_train = np.array(np.hstack([y_train]*int(1e5)))\n",
	"print(f'Duplicated data in memory: {dup_data.nbytes / 1e6} MB')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [],
	"source": [
	"dup_ridge = linear_model.Ridge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver='cholesky')\n",
	"dup_cu_ridge = cumlRidge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver=\"eig\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Load Data onto GPU"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"CPU times: user 9.48 s, sys: 1.6 s, total: 11.1 s\n",
	"Wall time: 4.87 s\n"
	]
	}
	],
	"source": [
	"%%time\n",
	"record_data = (('fea%d'%i, dup_data[:,i]) for i in range(dup_data.shape[1]))\n",
	"gdf_data = cudf.DataFrame(record_data)\n",
	"gdf_train = cudf.DataFrame(dict(train=dup_train))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"3.81 s ± 480 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"dup_ridge.fit(dup_data, dup_train)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"382 ms ± 1.21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"dup_cu_ridge.fit(gdf_data, gdf_train.train)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Verify Output"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {},
	"outputs": [],
	"source": [
	"dup_ridge.fit(dup_data, dup_train)\n",
	"dup_cu_ridge.fit(gdf_data, gdf_train.train)\n",
	"np.testing.assert_allclose(dup_cu_ridge.coef_.to_array(), dup_ridge.coef_)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {},
	"outputs": [],
	"source": [
	"params = {'alpha': np.logspace(-3, -1, 10)}\n",
	"clf = linear_model.Ridge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver='cholesky')\n",
	"cu_clf = cumlRidge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver=\"eig\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"4min 44s ± 4.02 s per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"sk_grid = GridSearchCV(clf, params, cv=5, iid=False)\n",
	"sk_grid.fit(dup_data, dup_train)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 22,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"1min 27s ± 446 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
	]
	}
	],
	"source": [
	"%%timeit\n",
	"sk_cu_grid = GridSearchCV(cu_clf, params, cv=5, iid=False)\n",
	"sk_cu_grid.fit(gdf_data, gdf_train.train)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python [conda env:rapids]",
	"language": "python",
	"name": "conda-env-rapids-py"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.8"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}