Created
August 14, 2019 15:26
-
-
Save quasiben/91b0b80b2d244ef95242bcc23ce32a5a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Hyperparameter Optimization\n", | |
"\n", | |
"Find best hyperparameters (parameters which govern modelling (`alpha`). How GridsearchCV works:\n", | |
"\n", | |
"> Exhaustive search over specified parameter values for an estimator.\n", | |
"\n", | |
"- building classifier for all parameter combinations (cuml)\n", | |
"- randomly split data into test/train (cudf)\n", | |
"- fit and record score each estimator (cuml)\n", | |
"- best score (highest) is returned along with estimator with the best parameters\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Gridsearch with cuML" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"from cuml import Ridge as cumlRidge\n", | |
"import cudf\n", | |
"from sklearn import datasets, linear_model\n", | |
"from sklearn.externals.joblib import parallel_backend\n", | |
"from sklearn.model_selection import train_test_split, GridSearchCV\n", | |
"import dask_ml.model_selection as dcv\n", | |
"import sklearn" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Load Diabetes Data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"diabetes = datasets.load_diabetes()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"diabetes.feature_names" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ 0.03807591, 0.05068012, 0.06169621, 0.02187235, -0.0442235 ,\n", | |
" -0.03482076, -0.04340085, -0.00259226, 0.01990842, -0.01764613])" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# row of data\n", | |
"diabetes.data[0]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Fit Data with Ridge Regression" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Split the data into training/testing sets\n", | |
"X_train, X_test, y_train, y_test = train_test_split(diabetes.data, diabetes.target, test_size=0.2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.02824" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# data in MB\n", | |
"X_train.nbytes/1e6" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"fit_intercept = True\n", | |
"normalize = False\n", | |
"alpha = np.array([1.0]) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"ridge = linear_model.Ridge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver='cholesky')\n", | |
"cu_ridge = cumlRidge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver=\"eig\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"646 µs ± 95 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"ridge.fit(X_train, y_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"4.82 ms ± 337 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"cu_ridge.fit(X_train, y_train)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Verify Output" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"np.testing.assert_allclose(cu_ridge.coef_.to_array(), ridge.coef_)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Increase Data Size" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Duplicated data in memory: 2824.0 MB\n" | |
] | |
} | |
], | |
"source": [ | |
"dup_data = np.array(np.vstack([X_train]*int(1e5)))\n", | |
"dup_train = np.array(np.hstack([y_train]*int(1e5)))\n", | |
"print(f'Duplicated data in memory: {dup_data.nbytes / 1e6} MB')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dup_ridge = linear_model.Ridge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver='cholesky')\n", | |
"dup_cu_ridge = cumlRidge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver=\"eig\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Load Data onto GPU" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 9.48 s, sys: 1.6 s, total: 11.1 s\n", | |
"Wall time: 4.87 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"record_data = (('fea%d'%i, dup_data[:,i]) for i in range(dup_data.shape[1]))\n", | |
"gdf_data = cudf.DataFrame(record_data)\n", | |
"gdf_train = cudf.DataFrame(dict(train=dup_train))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"3.81 s ± 480 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"dup_ridge.fit(dup_data, dup_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"382 ms ± 1.21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"dup_cu_ridge.fit(gdf_data, gdf_train.train)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Verify Output" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"dup_ridge.fit(dup_data, dup_train)\n", | |
"dup_cu_ridge.fit(gdf_data, gdf_train.train)\n", | |
"np.testing.assert_allclose(dup_cu_ridge.coef_.to_array(), dup_ridge.coef_)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"params = {'alpha': np.logspace(-3, -1, 10)}\n", | |
"clf = linear_model.Ridge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver='cholesky')\n", | |
"cu_clf = cumlRidge(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, solver=\"eig\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"4min 44s ± 4.02 s per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"sk_grid = GridSearchCV(clf, params, cv=5, iid=False)\n", | |
"sk_grid.fit(dup_data, dup_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1min 27s ± 446 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" | |
] | |
} | |
], | |
"source": [ | |
"%%timeit\n", | |
"sk_cu_grid = GridSearchCV(cu_clf, params, cv=5, iid=False)\n", | |
"sk_cu_grid.fit(gdf_data, gdf_train.train)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python [conda env:rapids]", | |
"language": "python", | |
"name": "conda-env-rapids-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment