Created
August 20, 2020 14:29
-
-
Save roaramburu/9b4a4e5fe8bcc233b573dd133b3ae014 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Boosted Regression\n", | |
"\n", | |
"Source: [From Hours to Seconds: 100x Faster Boosting, Bagging, and Stacking with RAPIDS cuML and Scikit-learn Machine Learning Model Ensembling](https://medium.com/rapids-ai/100x-faster-machine-learning-model-ensembling-with-rapids-cuml-and-scikit-learn-meta-estimators-d869788ee6b1) by Nick Becker and Dante Gama Dessavre" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import cudf\n", | |
"\n", | |
"# Request a pooled allocator with a 3 GB initial pool (see the keyword arguments).\n", | |
"cudf.set_allocator(pool=True, initial_pool_size=3_000_000_000)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import time\n", | |
"\n", | |
"from sklearn.ensemble import AdaBoostRegressor\n", | |
"from sklearn.datasets import make_regression\n", | |
"from sklearn.metrics import r2_score\n", | |
"from sklearn.svm import SVR\n", | |
"\n", | |
"import cuml\n", | |
"\n", | |
"\n", | |
"# Used for both n_features and n_informative below, so every feature is informative.\n", | |
"NFEATURES = 10\n", | |
"\n", | |
"# Synthetic regression data: 20,000 samples, noise=200, fixed seed for repeatability.\n", | |
"X, y = make_regression(\n", | |
"    n_samples=20_000,\n", | |
"    n_features=NFEATURES,\n", | |
"    n_informative=NFEATURES,\n", | |
"    noise=200,\n", | |
"    random_state=12,\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"class Timer:\n", | |
"    \"\"\"Context manager that records how long its `with` body takes.\n", | |
"\n", | |
"    After the block exits, `elapsed` holds the duration in seconds\n", | |
"    (`tock - tick`). `tick` and `tock` are `time.perf_counter()` readings,\n", | |
"    so they are only meaningful relative to each other.\n", | |
"    \"\"\"\n", | |
"\n", | |
"    def __enter__(self):\n", | |
"        # perf_counter() is monotonic and high-resolution, unlike time.time(),\n", | |
"        # which can jump if the system clock is adjusted mid-measurement.\n", | |
"        self.tick = time.perf_counter()\n", | |
"        return self\n", | |
"\n", | |
"    def __exit__(self, *args, **kwargs):\n", | |
"        self.tock = time.perf_counter()\n", | |
"        self.elapsed = self.tock - self.tick\n", | |
"        # Returns None, so an exception raised inside the block still propagates." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Sklearn\n", | |
"\n", | |
"**Note:** I have commented out this block to save you waiting an hour for it to complete." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Fit time (seconds): 239.5093755722046\n", | |
"Score time (seconds): 98.0283591747284\n" | |
] | |
} | |
], | |
"source": [ | |
"# boosted_regr = AdaBoostRegressor(\n", | |
"# SVR(),\n", | |
"# n_estimators=10,\n", | |
"# random_state=12,\n", | |
"# )\n", | |
"\n", | |
"# with Timer() as sk_fit_time:\n", | |
"# boosted_regr.fit(X, y)\n", | |
" \n", | |
"# with Timer() as sk_score_time:\n", | |
"# boosted_regr.score(X, y)\n", | |
" \n", | |
"# print(f\"Fit time (seconds): {sk_fit_time.elapsed}\")\n", | |
"# print(f\"Score time (seconds): {sk_score_time.elapsed}\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### cuML" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Fit time (seconds): 8.22884464263916\n", | |
"Score time (seconds): 2.031411647796631\n" | |
] | |
} | |
], | |
"source": [ | |
"# AdaBoost ensemble of 10 estimators built on the cuML SVR.\n", | |
"boosted_regr = AdaBoostRegressor(cuml.svm.SVR(), n_estimators=10, random_state=12)\n", | |
"\n", | |
"# Time training on the full dataset.\n", | |
"with Timer() as cuml_fit_time:\n", | |
"    boosted_regr.fit(X, y)\n", | |
"\n", | |
"# Time scoring on the same data; the score value itself is discarded.\n", | |
"with Timer() as cuml_score_time:\n", | |
"    boosted_regr.score(X, y)\n", | |
"\n", | |
"print(f\"Fit time (seconds): {cuml_fit_time.elapsed}\")\n", | |
"print(f\"Score time (seconds): {cuml_score_time.elapsed}\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**Note:** This block is commented out because `sk_fit_time` and `sk_score_time` are not defined: in the interest of time, the Sklearn example above is not run." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"cuML Boosted SVR Fit Speedup with 20K rows: 139.8x\n", | |
"cuML Boosted SVR Score Speedup with 20K row: 431.4x\n" | |
] | |
} | |
], | |
"source": [ | |
"# print(f\"cuML Boosted SVR Fit Speedup with 20K rows: {round(sk_fit_time.elapsed / cuml_fit_time.elapsed, 1)}x\")\n", | |
"# print(f\"cuML Boosted SVR Score Speedup with 20K row: {round(sk_score_time.elapsed / cuml_score_time.elapsed, 1)}x\")" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Speedups will vary by algorithm, but may be even larger if using more than 20,000 rows." | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "RAPIDS Nightly", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment