Skip to content

Instantly share code, notes, and snippets.

@quasiben
Created March 20, 2019 14:49
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save quasiben/a96ce952b7eb54356f7f8390319473e4 to your computer and use it in GitHub Desktop.
Save quasiben/a96ce952b7eb54356f7f8390319473e4 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Dask ML and Gridsearch with cuML"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from cuml import Ridge as cumlRidge\n",
"import cudf\n",
"from sklearn import datasets, linear_model\n",
"from sklearn.externals.joblib import parallel_backend\n",
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
"import dask_ml.model_selection as dcv"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Use a DGX"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table style=\"border: 2px solid white;\">\n",
"<tr>\n",
"<td style=\"vertical-align: top; border: 0px solid white\">\n",
"<h3>Client</h3>\n",
"<ul>\n",
" <li><b>Scheduler: </b>tcp://127.0.0.1:36485\n",
" <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a>\n",
"</ul>\n",
"</td>\n",
"<td style=\"vertical-align: top; border: 0px solid white\">\n",
"<h3>Cluster</h3>\n",
"<ul>\n",
" <li><b>Workers: </b>8</li>\n",
" <li><b>Cores: </b>8</li>\n",
" <li><b>Memory: </b>540.96 GB</li>\n",
"</ul>\n",
"</td>\n",
"</tr>\n",
"</table>"
],
"text/plain": [
"<Client: scheduler='tcp://127.0.0.1:36485' processes=8 cores=8>"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from dask.distributed import Client\n",
"from dask_cuda import LocalCUDACluster\n",
"\n",
"# Start one worker per GPU on the local system\n",
"cluster = LocalCUDACluster()\n",
"client = Client(cluster)\n",
"client"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Diabetes Data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"diabetes = datasets.load_diabetes()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"diabetes.feature_names"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.03807591, 0.05068012, 0.06169621, 0.02187235, -0.0442235 ,\n",
" -0.03482076, -0.04340085, -0.00259226, 0.01990842, -0.01764613])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# row of data\n",
"diabetes.data[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Fit Data with Ridge Regression"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Split the data into training/testing sets.\n",
"# A fixed random_state keeps the split identical across re-runs, so the fits and\n",
"# coefficient checks further down always see the same rows.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    diabetes.data, diabetes.target, test_size=0.2, random_state=42\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.02824"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# data in MB\n",
"X_train.nbytes/1e6"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"fit_intercept = True\n",
"normalize = False\n",
"alpha = np.array([1.0]) "
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Both estimators get the same hyperparameters; only the solver name differs per backend\n",
"ridge_kwargs = dict(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize)\n",
"ridge = linear_model.Ridge(solver='cholesky', **ridge_kwargs)\n",
"cu_ridge = cumlRidge(solver=\"eig\", **ridge_kwargs)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 176 ms, sys: 44 ms, total: 220 ms\n",
"Wall time: 28 ms\n"
]
},
{
"data": {
"text/plain": [
"Ridge(alpha=array([1.]), copy_X=True, fit_intercept=True, max_iter=None,\n",
" normalize=False, random_state=None, solver='cholesky', tol=0.001)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"ridge.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 7.93 s, sys: 184 ms, total: 8.12 s\n",
"Wall time: 1.12 s\n"
]
},
{
"data": {
"text/plain": [
"<cuml.linear_model.ridge.Ridge at 0x7f83e017d1d0>"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"cu_ridge.fit(X_train, y_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Verify Output"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"np.testing.assert_allclose(cu_ridge.coef_.to_array(), ridge.coef_)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Increase Data Size"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Duplicated data in memory: 2824.0 MB\n"
]
}
],
"source": [
"# Repeat the training split 1e5 times to build a multi-GB problem.\n",
"# np.tile allocates the repeated array directly; stacking a list and then wrapping it\n",
"# in np.array() would copy the full result a second time.\n",
"# dup_data holds the repeated features, dup_train the repeated targets (y_train).\n",
"n_copies = int(1e5)\n",
"dup_data = np.tile(X_train, (n_copies, 1))\n",
"dup_train = np.tile(y_train, n_copies)\n",
"print(f'Duplicated data in memory: {dup_data.nbytes / 1e6} MB')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Fresh, unfitted estimators for the duplicated data, configured like the small-data pair\n",
"dup_ridge_kwargs = dict(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize)\n",
"dup_ridge = linear_model.Ridge(solver='cholesky', **dup_ridge_kwargs)\n",
"dup_cu_ridge = cumlRidge(solver=\"eig\", **dup_ridge_kwargs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Data onto GPU"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 13.6 s, sys: 7.26 s, total: 20.9 s\n",
"Wall time: 19.7 s\n"
]
}
],
"source": [
"%%time\n",
"# Feed cudf one (name, values) pair per feature column: fea0, fea1, ...\n",
"record_data = ((f'fea{idx}', dup_data[:, idx]) for idx in range(dup_data.shape[1]))\n",
"gdf_data = cudf.DataFrame(record_data)\n",
"# Targets go into a single-column frame named 'train'\n",
"gdf_train = cudf.DataFrame({'train': dup_train})"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4.82 s ± 694 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"dup_ridge.fit(dup_data, dup_train)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"450 ms ± 47.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"dup_cu_ridge.fit(gdf_data, gdf_train.train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Verify Output"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Refit both models on the duplicated data, then require matching coefficients\n",
"dup_ridge.fit(dup_data, dup_train)\n",
"dup_cu_ridge.fit(gdf_data, gdf_train.train)\n",
"gpu_coef = dup_cu_ridge.coef_.to_array()\n",
"np.testing.assert_allclose(gpu_coef, dup_ridge.coef_)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Hyperparameter Optimization"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# Ten log-spaced alpha candidates from 1e-3 to 1e-1; the grid searches below all reuse this dict\n",
"params = {'alpha': np.logspace(-3, -1, 10)}\n",
"# Base estimators handed to the grid searches (CPU and GPU variants, same hyperparameters)\n",
"base_kwargs = dict(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize)\n",
"clf = linear_model.Ridge(solver='cholesky', **base_kwargs)\n",
"cu_clf = cumlRidge(solver=\"eig\", **base_kwargs)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"88.4 ms ± 6.11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
]
}
],
"source": [
"%%timeit\n",
"sk_grid = GridSearchCV(clf, params, scoring='r2', cv=5, iid=False)\n",
"sk_grid.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6.51 s ± 132 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"sk_cu_grid = GridSearchCV(cu_clf, params, scoring='r2', cv=5, iid=False)\n",
"sk_cu_grid.fit(X_train, y_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Swap Sklearn Gridsearch with DaskML Gridsearch"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The slowest run took 39.07 times longer than the fastest. This could mean that an intermediate result is being cached.\n",
"873 ms ± 347 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"grid = dcv.GridSearchCV(clf, params, scoring='r2', cv=5)\n",
"grid.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"740 ms ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
}
],
"source": [
"%%timeit\n",
"# cv=5 so this timing uses the same number of folds as the other grid searches;\n",
"# leaving cv out falls back to the library default, which need not be 5.\n",
"cu_grid = dcv.GridSearchCV(cu_clf, params, scoring='r2', cv=5)\n",
"cu_grid.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Two Dup Data: 2.824 MB\n",
"Three Dup Data: 28.24 MB\n"
]
}
],
"source": [
"# Medium-sized problems: the training split repeated 1e2 and 1e3 times.\n",
"# *_data holds the repeated features, *_train the repeated targets (y_train).\n",
"# np.tile builds each array in one allocation, with no extra np.array() copy.\n",
"two_dup_data = np.tile(X_train, (int(1e2), 1))\n",
"two_dup_train = np.tile(y_train, int(1e2))\n",
"three_dup_data = np.tile(X_train, (int(1e3), 1))\n",
"three_dup_train = np.tile(y_train, int(1e3))\n",
"print(f'Two Dup Data: {two_dup_data.nbytes / 1e6} MB\\nThree Dup Data: {three_dup_data.nbytes / 1e6} MB')"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 13 s, sys: 1 s, total: 14 s\n",
"Wall time: 1min 17s\n"
]
}
],
"source": [
"%%time\n",
"cu_grid = dcv.GridSearchCV(cu_clf, params, scoring='r2', cv=5)\n",
"cu_grid.fit(two_dup_data, two_dup_train)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 53.8 s, sys: 9.07 s, total: 1min 2s\n",
"Wall time: 12min 50s\n"
]
}
],
"source": [
"%%time\n",
"cu_grid = dcv.GridSearchCV(cu_clf, params, scoring='r2', cv=5)\n",
"cu_grid.fit(three_dup_data, three_dup_train)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.41 s, sys: 244 ms, total: 1.65 s\n",
"Wall time: 13.8 s\n"
]
}
],
"source": [
"%%time\n",
"# CPU baseline for the cuML search above: same grid, same folds, same data\n",
"grid = dcv.GridSearchCV(clf, params, scoring='r2', cv=5)\n",
"# y must be the stacked targets (three_dup_train), not the feature matrix\n",
"grid.fit(three_dup_data, three_dup_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"# NOTE(review): dask-ml's GridSearchCV appears to schedule its work through dask directly\n",
"# rather than through joblib, so this parallel_backend/scatter block may have no effect\n",
"# on it -- confirm before relying on the scatter to pre-distribute the data.\n",
"with parallel_backend('dask', scatter=[dup_data, dup_train]):\n",
"    # cv=5 to stay comparable with the other grid searches in this notebook\n",
"    cu_grid = dcv.GridSearchCV(cu_clf, params, scoring='r2', cv=5)\n",
"    cu_grid.fit(dup_data, dup_train)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:cudf-dev]",
"language": "python",
"name": "conda-env-cudf-dev-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment