Created
March 20, 2019 14:49
-
-
Save quasiben/a96ce952b7eb54356f7f8390319473e4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Dask ML and Gridsearch with cuML" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"from cuml import Ridge as cumlRidge\n", | |
"import cudf\n", | |
"from sklearn import datasets, linear_model\n", | |
"from sklearn.externals.joblib import parallel_backend\n", | |
"from sklearn.model_selection import train_test_split, GridSearchCV\n", | |
"import dask_ml.model_selection as dcv" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Use a DGX" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<table style=\"border: 2px solid white;\">\n", | |
"<tr>\n", | |
"<td style=\"vertical-align: top; border: 0px solid white\">\n", | |
"<h3>Client</h3>\n", | |
"<ul>\n", | |
" <li><b>Scheduler: </b>tcp://127.0.0.1:36485\n", | |
" <li><b>Dashboard: </b><a href='http://127.0.0.1:8787/status' target='_blank'>http://127.0.0.1:8787/status</a>\n", | |
"</ul>\n", | |
"</td>\n", | |
"<td style=\"vertical-align: top; border: 0px solid white\">\n", | |
"<h3>Cluster</h3>\n", | |
"<ul>\n", | |
" <li><b>Workers: </b>8</li>\n", | |
" <li><b>Cores: </b>8</li>\n", | |
" <li><b>Memory: </b>540.96 GB</li>\n", | |
"</ul>\n", | |
"</td>\n", | |
"</tr>\n", | |
"</table>" | |
], | |
"text/plain": [ | |
"<Client: scheduler='tcp://127.0.0.1:36485' processes=8 cores=8>" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from dask.distributed import Client\n", | |
"from dask_cuda import LocalCUDACluster\n", | |
"\n", | |
"# Start one worker per GPU on the local system\n", | |
"cluster = LocalCUDACluster()\n", | |
"client = Client(cluster)\n", | |
"client" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Load Diabetes Data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"diabetes = datasets.load_diabetes()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"diabetes.feature_names" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ 0.03807591, 0.05068012, 0.06169621, 0.02187235, -0.0442235 ,\n", | |
" -0.03482076, -0.04340085, -0.00259226, 0.01990842, -0.01764613])" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# row of data\n", | |
"diabetes.data[0]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Fit Data with Ridge Regression" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Split the data into training/testing sets.\n", | |
"# A fixed random_state makes the split identical on every Restart & Run All,\n", | |
"# so the fits, assertions and timings below always see the same rows.\n", | |
"X_train, X_test, y_train, y_test = train_test_split(\n", | |
"    diabetes.data, diabetes.target, test_size=0.2, random_state=42\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.02824" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# data in MB\n", | |
"X_train.nbytes/1e6" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"fit_intercept = True\n", | |
"normalize = False\n", | |
"alpha = np.array([1.0]) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ridge = linear_model.Ridge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver='cholesky')\n", | |
"cu_ridge = cumlRidge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver=\"eig\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 176 ms, sys: 44 ms, total: 220 ms\n", | |
"Wall time: 28 ms\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"Ridge(alpha=array([1.]), copy_X=True, fit_intercept=True, max_iter=None,\n", | |
" normalize=False, random_state=None, solver='cholesky', tol=0.001)" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"ridge.fit(X_train, y_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 7.93 s, sys: 184 ms, total: 8.12 s\n", | |
"Wall time: 1.12 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"<cuml.linear_model.ridge.Ridge at 0x7f83e017d1d0>" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"cu_ridge.fit(X_train, y_train)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Verify Output" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"np.testing.assert_allclose(cu_ridge.coef_.to_array(), ridge.coef_)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Increase Data Size" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Duplicated data in memory: 2824.0 MB\n" | |
] | |
} | |
], | |
"source": [ | |
"# Repeat the training split 100,000 times to build a multi-GB workload.\n", | |
"# np.tile stacks the copies row-wise (features) / end-to-end (targets)\n", | |
"# without making an extra full-size copy of the result.\n", | |
"dup_data = np.tile(X_train, (int(1e5), 1))\n", | |
"dup_train = np.tile(y_train, int(1e5))\n", | |
"print(f'Duplicated data in memory: {dup_data.nbytes / 1e6} MB')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dup_ridge = linear_model.Ridge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver='cholesky')\n", | |
"dup_cu_ridge = cumlRidge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver=\"eig\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Load Data onto GPU" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 13.6 s, sys: 7.26 s, total: 20.9 s\n", | |
"Wall time: 19.7 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"record_data = (('fea%d'%i, dup_data[:,i]) for i in range(dup_data.shape[1]))\n", | |
"gdf_data = cudf.DataFrame(record_data)\n", | |
"gdf_train = cudf.DataFrame(dict(train=dup_train))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"4.82 s ± 694 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"dup_ridge.fit(dup_data, dup_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"450 ms ± 47.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"dup_cu_ridge.fit(gdf_data, gdf_train.train)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Verify Output" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dup_ridge.fit(dup_data, dup_train)\n", | |
"dup_cu_ridge.fit(gdf_data, gdf_train.train)\n", | |
"np.testing.assert_allclose(dup_cu_ridge.coef_.to_array(), dup_ridge.coef_)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Hyperparameter Optimization" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"params = {'alpha': np.logspace(-3, -1, 10)}\n", | |
"clf = linear_model.Ridge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver='cholesky')\n", | |
"cu_clf = cumlRidge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver=\"eig\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"88.4 ms ± 6.11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"sk_grid = GridSearchCV(clf, params, scoring='r2', cv=5, iid=False)\n", | |
"sk_grid.fit(X_train, y_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"6.51 s ± 132 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"sk_cu_grid = GridSearchCV(cu_clf, params, scoring='r2', cv=5, iid=False)\n", | |
"sk_cu_grid.fit(X_train, y_train)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Swap Sklearn Gridsearch with DaskML Gridsearch" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"The slowest run took 39.07 times longer than the fastest. This could mean that an intermediate result is being cached.\n", | |
"873 ms ± 347 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"grid = dcv.GridSearchCV(clf, params, scoring='r2', cv=5)\n", | |
"grid.fit(X_train, y_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"740 ms ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"# cv=5 matches the other grid-search cells so the timings are comparable\n", | |
"cu_grid = dcv.GridSearchCV(cu_clf, params, scoring='r2', cv=5)\n", | |
"cu_grid.fit(X_train, y_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Two Dup Data: 2.824 MB\n", | |
"Three Dup Data: 28.24 MB\n" | |
] | |
} | |
], | |
"source": [ | |
"# 100x and 1000x copies of the training split for the scaling runs below\n", | |
"two_dup_data = np.tile(X_train, (int(1e2), 1))\n", | |
"two_dup_train = np.tile(y_train, int(1e2))\n", | |
"three_dup_data = np.tile(X_train, (int(1e3), 1))\n", | |
"three_dup_train = np.tile(y_train, int(1e3))\n", | |
"print(f'Two Dup Data: {two_dup_data.nbytes / 1e6} MB\\nThree Dup Data: {three_dup_data.nbytes / 1e6} MB')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 13 s, sys: 1 s, total: 14 s\n", | |
"Wall time: 1min 17s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"cu_grid = dcv.GridSearchCV(cu_clf, params, scoring='r2', cv=5)\n", | |
"cu_grid.fit(two_dup_data, two_dup_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 53.8 s, sys: 9.07 s, total: 1min 2s\n", | |
"Wall time: 12min 50s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"cu_grid = dcv.GridSearchCV(cu_clf, params, scoring='r2', cv=5)\n", | |
"cu_grid.fit(three_dup_data, three_dup_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 1.41 s, sys: 244 ms, total: 1.65 s\n", | |
"Wall time: 13.8 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# CPU (scikit-learn Ridge) baseline on the 1000x data set.\n", | |
"# Fit against the targets (three_dup_train), not the feature matrix itself.\n", | |
"grid = dcv.GridSearchCV(clf, params, scoring='r2', cv=5)\n", | |
"grid.fit(three_dup_data, three_dup_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"%%time\n", | |
"# cv=5 keeps this run comparable with the other grid-search cells.\n", | |
"# NOTE(review): dask_ml's GridSearchCV appears to schedule through Dask itself rather\n", | |
"# than joblib -- confirm the parallel_backend/scatter wrapper has any effect here.\n", | |
"with parallel_backend('dask', scatter=[dup_data, dup_train]):\n", | |
"    cu_grid = dcv.GridSearchCV(cu_clf, params, scoring='r2', cv=5)\n", | |
"    cu_grid.fit(dup_data, dup_train)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python [conda env:cudf-dev]", | |
"language": "python", | |
"name": "conda-env-cudf-dev-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment