roaramburu/cuml-boosted-regression.ipynb

## cuml-boosted-regression.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Boosted Regression\n",
    "\n",
    "Source: [From Hours to Seconds: 100x Faster Boosting, Bagging, and Stacking with RAPIDS cuML and Scikit-learn Machine Learning Model Ensembling](https://medium.com/rapids-ai/100x-faster-machine-learning-model-ensembling-with-rapids-cuml-and-scikit-learn-meta-estimators-d869788ee6b1) by Nick Becker and Dante Gama Dessavre"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import cudf\n",
    "\n",
    "# Turn on cuDF's pooled allocator, starting with a 3 GB pool.\n",
    "cudf.set_allocator(pool=True, initial_pool_size=3000000000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "from sklearn.ensemble import AdaBoostRegressor\n",
    "from sklearn.datasets import make_regression\n",
    "from sklearn.metrics import r2_score\n",
    "from sklearn.svm import SVR\n",
    "\n",
    "import cuml\n",
    "\n",
    "\n",
    "NFEATURES = 10\n",
    "\n",
    "# Synthetic regression data: 20,000 rows in which every feature is informative.\n",
    "X, y = make_regression(\n",
    "    n_samples=20000, n_features=NFEATURES, n_informative=NFEATURES,\n",
    "    noise=200, random_state=12,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Timer:\n",
    "    \"\"\"Context manager that measures how long its ``with`` block takes.\n",
    "\n",
    "    After the block exits, ``elapsed`` holds the duration in seconds\n",
    "    (``tock - tick``).\n",
    "    \"\"\"\n",
    "\n",
    "    def __enter__(self):\n",
    "        # perf_counter() is monotonic, so the measurement cannot be skewed\n",
    "        # by system clock adjustments the way time.time() can.\n",
    "        self.tick = time.perf_counter()\n",
    "        return self\n",
    "\n",
    "    def __exit__(self, *args, **kwargs):\n",
    "        self.tock = time.perf_counter()\n",
    "        self.elapsed = self.tock - self.tick\n",
    "        # Returning None (falsy) lets any exception from the block propagate."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Sklearn\n",
    "\n",
    "**Note:** This block is commented out to save you from waiting an hour for it to complete. The output shown below is from an earlier run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fit time (seconds): 239.5093755722046\n",
      "Score time (seconds): 98.0283591747284\n"
     ]
    }
   ],
   "source": [
    "# boosted_regr = AdaBoostRegressor(\n",
    "#     SVR(),\n",
    "#     n_estimators=10,\n",
    "#     random_state=12,\n",
    "# )\n",
    "\n",
    "# with Timer() as sk_fit_time:\n",
    "#     boosted_regr.fit(X, y)\n",
    "    \n",
    "# with Timer() as sk_score_time:\n",
    "#     boosted_regr.score(X, y)\n",
    "    \n",
    "# print(f\"Fit time (seconds): {sk_fit_time.elapsed}\")\n",
    "# print(f\"Score time (seconds): {sk_score_time.elapsed}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### cuML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fit time (seconds): 8.22884464263916\n",
      "Score time (seconds): 2.031411647796631\n"
     ]
    }
   ],
   "source": [
    "# AdaBoost meta-estimator from scikit-learn wrapped around cuML's SVR.\n",
    "boosted_regr = AdaBoostRegressor(cuml.svm.SVR(), n_estimators=10, random_state=12)\n",
    "\n",
    "# Time training and scoring separately.\n",
    "with Timer() as cuml_fit_time:\n",
    "    boosted_regr.fit(X, y)\n",
    "\n",
    "with Timer() as cuml_score_time:\n",
    "    boosted_regr.score(X, y)\n",
    "\n",
    "print(f\"Fit time (seconds): {cuml_fit_time.elapsed}\")\n",
    "print(f\"Score time (seconds): {cuml_score_time.elapsed}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Note:** This block is commented out because `sk_fit_time` and `sk_score_time` are not defined: the Sklearn example above is not run, in the interest of time. The output shown below is from an earlier run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "cuML Boosted SVR Fit Speedup with 20K rows: 139.8x\n",
      "cuML Boosted SVR Score Speedup  with 20K row: 431.4x\n"
     ]
    }
   ],
   "source": [
    "# print(f\"cuML Boosted SVR Fit Speedup with 20K rows: {round(sk_fit_time.elapsed / cuml_fit_time.elapsed, 1)}x\")\n",
    "# print(f\"cuML Boosted SVR Score Speedup  with 20K row: {round(sk_score_time.elapsed / cuml_score_time.elapsed, 1)}x\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Speedups will vary by algorithm, but may be even larger if using more than 20,000 rows."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "RAPIDS Nightly",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Boosted Regression\n",
	"\n",
	"Source: [From Hours to Seconds: 100x Faster Boosting, Bagging, and Stacking with RAPIDS cuML and Scikit-learn Machine Learning Model Ensembling](https://medium.com/rapids-ai/100x-faster-machine-learning-model-ensembling-with-rapids-cuml-and-scikit-learn-meta-estimators-d869788ee6b1) by Nick Becker and Dante Gama Dessavre"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import cudf\n",
	"\n",
	"# Turn on cuDF's pooled allocator, starting with a 3 GB pool.\n",
	"cudf.set_allocator(pool=True, initial_pool_size=3000000000)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"import time\n",
	"\n",
	"from sklearn.ensemble import AdaBoostRegressor\n",
	"from sklearn.datasets import make_regression\n",
	"from sklearn.metrics import r2_score\n",
	"from sklearn.svm import SVR\n",
	"\n",
	"import cuml\n",
	"\n",
	"\n",
	"NFEATURES = 10\n",
	"\n",
	"# Synthetic regression data: 20,000 rows in which every feature is informative.\n",
	"X, y = make_regression(\n",
	"    n_samples=20000, n_features=NFEATURES, n_informative=NFEATURES,\n",
	"    noise=200, random_state=12,\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [],
	"source": [
	"class Timer:\n",
	"    \"\"\"Context manager that measures how long its ``with`` block takes.\n",
	"\n",
	"    After the block exits, ``elapsed`` holds the duration in seconds\n",
	"    (``tock - tick``).\n",
	"    \"\"\"\n",
	"\n",
	"    def __enter__(self):\n",
	"        # perf_counter() is monotonic, so the measurement cannot be skewed\n",
	"        # by system clock adjustments the way time.time() can.\n",
	"        self.tick = time.perf_counter()\n",
	"        return self\n",
	"\n",
	"    def __exit__(self, *args, **kwargs):\n",
	"        self.tock = time.perf_counter()\n",
	"        self.elapsed = self.tock - self.tick\n",
	"        # Returning None (falsy) lets any exception from the block propagate."
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Sklearn\n",
	"\n",
	"**Note:** This block is commented out to save you from waiting an hour for it to complete. The output shown below is from an earlier run."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Fit time (seconds): 239.5093755722046\n",
	"Score time (seconds): 98.0283591747284\n"
	]
	}
	],
	"source": [
	"# boosted_regr = AdaBoostRegressor(\n",
	"# SVR(),\n",
	"# n_estimators=10,\n",
	"# random_state=12,\n",
	"# )\n",
	"\n",
	"# with Timer() as sk_fit_time:\n",
	"# boosted_regr.fit(X, y)\n",
	" \n",
	"# with Timer() as sk_score_time:\n",
	"# boosted_regr.score(X, y)\n",
	" \n",
	"# print(f\"Fit time (seconds): {sk_fit_time.elapsed}\")\n",
	"# print(f\"Score time (seconds): {sk_score_time.elapsed}\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### cuML"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Fit time (seconds): 8.22884464263916\n",
	"Score time (seconds): 2.031411647796631\n"
	]
	}
	],
	"source": [
	"# AdaBoost meta-estimator from scikit-learn wrapped around cuML's SVR.\n",
	"boosted_regr = AdaBoostRegressor(cuml.svm.SVR(), n_estimators=10, random_state=12)\n",
	"\n",
	"# Time training and scoring separately.\n",
	"with Timer() as cuml_fit_time:\n",
	"    boosted_regr.fit(X, y)\n",
	"\n",
	"with Timer() as cuml_score_time:\n",
	"    boosted_regr.score(X, y)\n",
	"\n",
	"print(f\"Fit time (seconds): {cuml_fit_time.elapsed}\")\n",
	"print(f\"Score time (seconds): {cuml_score_time.elapsed}\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"**Note:** This block is commented out because `sk_fit_time` and `sk_score_time` are not defined: the Sklearn example above is not run, in the interest of time. The output shown below is from an earlier run."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"cuML Boosted SVR Fit Speedup with 20K rows: 139.8x\n",
	"cuML Boosted SVR Score Speedup with 20K row: 431.4x\n"
	]
	}
	],
	"source": [
	"# print(f\"cuML Boosted SVR Fit Speedup with 20K rows: {round(sk_fit_time.elapsed / cuml_fit_time.elapsed, 1)}x\")\n",
	"# print(f\"cuML Boosted SVR Score Speedup with 20K row: {round(sk_score_time.elapsed / cuml_score_time.elapsed, 1)}x\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Speedups will vary by algorithm, but may be even larger if using more than 20,000 rows."
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "RAPIDS Nightly",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.6"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}