Skip to content

Instantly share code, notes, and snippets.

@buswedg
Created August 28, 2021 20:36
Show Gist options
  • Save buswedg/6fbcd28e9c5d626cd5a19756a2cdbc14 to your computer and use it in GitHub Desktop.
Save buswedg/6fbcd28e9c5d626cd5a19756a2cdbc14 to your computer and use it in GitHub Desktop.
building_feature_engineering_pipelines\numeric_prediction_using_pipelines
import numpy as np
import sklearn.base
from sklearn import metrics
class transform_predict(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    """Adapter that lets an estimator's ``predict`` output act as a transform.

    ``fit`` delegates to the wrapped estimator and ``transform`` returns that
    estimator's predictions, so the wrapper can be placed in a position that
    expects a transformer (for example as an intermediate pipeline step).
    """

    def __init__(self, clf: sklearn.base.BaseEstimator):
        # Stored under the same name as the constructor argument.
        self.clf = clf

    def fit(self, *args, **kwargs):
        """Fit the wrapped estimator with the given arguments and return self."""
        self.clf.fit(*args, **kwargs)
        return self

    def transform(self, X: np.ndarray, **transform_params):
        """Return the wrapped estimator's predictions for ``X``.

        One-dimensional predictions are reshaped into a single column
        (``(-1, 1)``); predictions of any other dimensionality are returned
        unchanged. ``transform_params`` is accepted but not used.
        """
        predictions = self.clf.predict(X)
        if len(predictions.shape) != 1:
            return predictions
        return predictions.reshape(-1, 1)
class transform_predict_proba(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    """Adapter that lets a classifier's ``predict_proba`` output act as a transform.

    ``fit`` delegates to the wrapped classifier and ``transform`` returns its
    ``predict_proba`` output, optionally without the first column.
    """

    def __init__(self, clf: sklearn.base.ClassifierMixin, drop: bool = True):
        # Attribute names mirror the constructor arguments.
        self.clf = clf
        self.drop = drop

    def fit(self, *args, **kwargs):
        """Fit the wrapped classifier with the given arguments and return self."""
        self.clf.fit(*args, **kwargs)
        return self

    def transform(self, X: np.ndarray, **transform_params):
        """Return ``clf.predict_proba(X)``, minus column 0 when ``drop`` is truthy.

        ``transform_params`` is accepted but not used.
        """
        probabilities = self.clf.predict_proba(X)
        if self.drop:
            # Keep every column except the first one.
            return probabilities[:, 1:]
        return probabilities
def get_regression_metrics(y_true, y_pred):
    """Print a summary of regression metrics, one per line, rounded to 4 decimals.

    Each line has the form ``<metric name> <value>``. The previous version
    printed ``mean_squared_error`` twice; the repeated line now reports the
    root mean squared error (the square root of the MSE) instead, so every
    line carries distinct information and the MSE is computed only once.

    Args:
        y_true: Ground-truth target values, passed straight to
            ``sklearn.metrics``.
        y_pred: Predicted target values, passed straight to
            ``sklearn.metrics``.

    Returns:
        None. The metrics are written to standard output.
    """
    # Computed once and reused for the RMSE line below.
    mse = metrics.mean_squared_error(y_true, y_pred)

    report = (
        ('mean_squared_error', mse),
        ('explained_variance_score', metrics.explained_variance_score(y_true, y_pred)),
        ('mean_absolute_error', metrics.mean_absolute_error(y_true, y_pred)),
        ('root_mean_squared_error', np.sqrt(mse)),
        ('median_absolute_error', metrics.median_absolute_error(y_true, y_pred)),
        ('r2_score', metrics.r2_score(y_true, y_pred)),
    )
    for label, value in report:
        print(label, np.round(value, 4))
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "64c31424-bcca-49c4-9932-78216c734ab7",
"metadata": {},
"source": [
"# Numerical Prediction using Pipelines"
]
},
{
"cell_type": "markdown",
"id": "6629109c-4437-4570-be70-f83766eb860a",
"metadata": {},
"source": [
"#### Description:"
]
},
{
"cell_type": "markdown",
"id": "ec759365-4c71-4846-80d7-3550f0b1a154",
"metadata": {},
"source": [
"This codebook covers how to use pipelines with pre-processing steps to make a numerical prediction."
]
},
{
"cell_type": "markdown",
"id": "70a24692-576c-4684-a5ec-58afdd179a3a",
"metadata": {},
"source": [
"#### Skill level:"
]
},
{
"cell_type": "markdown",
"id": "83b78e91-61f3-4636-9dab-c8fc2b1dde76",
"metadata": {},
"source": [
"- Advanced"
]
},
{
"cell_type": "markdown",
"id": "12b5d8a4-4295-4377-b667-417b2d7f967b",
"metadata": {},
"source": [
"-------------------------\n",
"### Import the required libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "68fe0efd-33b4-492c-a271-af6a1b958adc",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"\n",
"platform_path = os.path.abspath(os.path.join(os.path.abspath(''), '../../../'))\n",
"sys.path.append(platform_path)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5bbb75ed-bc12-455c-bcdb-7c4e967d189f",
"metadata": {},
"outputs": [],
"source": [
"import HELPERS.machine_learning.model_development as md\n",
"import HELPERS.machine_learning.model_evaluation as me\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.neural_network import MLPRegressor\n",
"from sklearn.pipeline import Pipeline, FeatureUnion\n",
"from sklearn import metrics\n",
"from IPython.display import display, HTML"
]
},
{
"cell_type": "markdown",
"id": "d39d190c-950e-4bc2-a482-bd558e1e01c6",
"metadata": {},
"source": [
"-------------------------\n",
"### Read data into a dataframe"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "6c740939-dfba-499c-adf1-d5582bf8bdbe",
"metadata": {},
"outputs": [],
"source": [
"df_raw = pd.read_csv(os.path.join(platform_path, 'DATA/boston.txt'))"
]
},
{
"cell_type": "markdown",
"id": "52c08cdb-917d-4815-ab1e-484e7a7b5745",
"metadata": {},
"source": [
"-------------------------\n",
"### Check shape and head of the dataframe"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c0a1cab4-32a7-4fb9-ab66-e1e8dbfa48e8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(506, 14)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_raw.shape"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e2e82666-63f2-4c17-90f7-25cf10fbdb2c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>CRIM</th>\n",
" <th>ZN</th>\n",
" <th>INDUS</th>\n",
" <th>CHAS</th>\n",
" <th>NOX</th>\n",
" <th>RM</th>\n",
" <th>AGE</th>\n",
" <th>DIS</th>\n",
" <th>RAD</th>\n",
" <th>TAX</th>\n",
" <th>PTRATIO</th>\n",
" <th>B</th>\n",
" <th>LSTAT</th>\n",
" <th>MDEV</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.00632</td>\n",
" <td>18.0</td>\n",
" <td>2.31</td>\n",
" <td>0.0</td>\n",
" <td>0.538</td>\n",
" <td>6.575</td>\n",
" <td>65.2</td>\n",
" <td>4.0900</td>\n",
" <td>1.0</td>\n",
" <td>296.0</td>\n",
" <td>15.3</td>\n",
" <td>396.90</td>\n",
" <td>4.98</td>\n",
" <td>24.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.02731</td>\n",
" <td>0.0</td>\n",
" <td>7.07</td>\n",
" <td>0.0</td>\n",
" <td>0.469</td>\n",
" <td>6.421</td>\n",
" <td>78.9</td>\n",
" <td>4.9671</td>\n",
" <td>2.0</td>\n",
" <td>242.0</td>\n",
" <td>17.8</td>\n",
" <td>396.90</td>\n",
" <td>9.14</td>\n",
" <td>21.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0.02729</td>\n",
" <td>0.0</td>\n",
" <td>7.07</td>\n",
" <td>0.0</td>\n",
" <td>0.469</td>\n",
" <td>7.185</td>\n",
" <td>61.1</td>\n",
" <td>4.9671</td>\n",
" <td>2.0</td>\n",
" <td>242.0</td>\n",
" <td>17.8</td>\n",
" <td>392.83</td>\n",
" <td>4.03</td>\n",
" <td>34.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.03237</td>\n",
" <td>0.0</td>\n",
" <td>2.18</td>\n",
" <td>0.0</td>\n",
" <td>0.458</td>\n",
" <td>6.998</td>\n",
" <td>45.8</td>\n",
" <td>6.0622</td>\n",
" <td>3.0</td>\n",
" <td>222.0</td>\n",
" <td>18.7</td>\n",
" <td>394.63</td>\n",
" <td>2.94</td>\n",
" <td>33.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.06905</td>\n",
" <td>0.0</td>\n",
" <td>2.18</td>\n",
" <td>0.0</td>\n",
" <td>0.458</td>\n",
" <td>7.147</td>\n",
" <td>54.2</td>\n",
" <td>6.0622</td>\n",
" <td>3.0</td>\n",
" <td>222.0</td>\n",
" <td>18.7</td>\n",
" <td>396.90</td>\n",
" <td>5.33</td>\n",
" <td>36.2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n",
"0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n",
"1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n",
"2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 \n",
"3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 \n",
"4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 \n",
"\n",
" PTRATIO B LSTAT MDEV \n",
"0 15.3 396.90 4.98 24.0 \n",
"1 17.8 396.90 9.14 21.6 \n",
"2 17.8 392.83 4.03 34.7 \n",
"3 18.7 394.63 2.94 33.4 \n",
"4 18.7 396.90 5.33 36.2 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_raw.head()"
]
},
{
"cell_type": "markdown",
"id": "f05909f4-a544-4fd7-bf79-9e6631657767",
"metadata": {},
"source": [
"-------------------------\n",
"### Create a dataframe and definition to store results"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "03de3165-94ef-427f-841f-f39975b4a982",
"metadata": {},
"outputs": [],
"source": [
"df_results = pd.DataFrame([], columns=['clf', 'train', 'test'])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "9cceebb5-ab4a-4377-8903-2cdc753623df",
"metadata": {},
"outputs": [],
"source": [
"def append_results(clf, y_true_train, y_pred_train, y_true_test, y_pred_test):\n",
" r2_score_train = np.round(metrics.r2_score(y_true_train, y_pred_train), 4)\n",
" r2_score_test = np.round(metrics.r2_score(y_true_test, y_pred_test), 4)\n",
"\n",
" df_results.loc[len(df_results)] = [clf, r2_score_train, r2_score_test]\n",
" \n",
" display(HTML(df_results.to_html()))"
]
},
{
"cell_type": "markdown",
"id": "b98b4e1e-7cd7-4891-9f85-761a5c5020eb",
"metadata": {},
"source": [
"-------------------------\n",
"### Separate features from the label"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "4d0c8404-04e2-4ae9-a196-7cc962d4497d",
"metadata": {},
"outputs": [],
"source": [
"X_all = df_raw.drop('MDEV', axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "9555d9f9-ffb5-46dd-879f-4ce51e5b75d2",
"metadata": {},
"outputs": [],
"source": [
"y_true_all = df_raw[['MDEV']].values.ravel()"
]
},
{
"cell_type": "markdown",
"id": "c2b1f5f5-8ff4-4f73-9860-060c6eba8027",
"metadata": {},
"source": [
"-------------------------\n",
"### Make a split between training and test sets of data"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "95d0c2f0-aa10-4ab1-a152-383e23591a3f",
"metadata": {},
"outputs": [],
"source": [
"np.random.seed(47)\n",
"\n",
"X_train, X_test, y_true_train, y_true_test = train_test_split(X_all, y_true_all, test_size=0.3)"
]
},
{
"cell_type": "markdown",
"id": "3fb55b29-5d8c-4f7b-82d2-936f90a7af03",
"metadata": {},
"source": [
"-------------------------\n",
"### Fit a neural network regressor"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "35ba47a8-f503-4d32-8430-c830d03e74aa",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\darry\\anaconda3\\envs\\datakick\\lib\\site-packages\\sklearn\\neural_network\\_multilayer_perceptron.py:614: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"MLPRegressor()"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.random.seed(47)\n",
"\n",
"clf = MLPRegressor()\n",
"\n",
"clf.fit(X_train, y_true_train)"
]
},
{
"cell_type": "markdown",
"id": "d4651008-c747-4f2f-8cd3-adaf3f825d4a",
"metadata": {},
"source": [
"-------------------------\n",
"### Generate predictions using the fitted model"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "ec8402de-9cbd-4fd4-9cf6-e474694b7963",
"metadata": {},
"outputs": [],
"source": [
"y_pred_train = clf.predict(X_train.values)\n",
"y_pred_test = clf.predict(X_test.values)"
]
},
{
"cell_type": "markdown",
"id": "fa030a3d-d8e3-4b83-9fee-27fdcc1bc0cb",
"metadata": {},
"source": [
"-------------------------\n",
"### Get model performance metrics"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "94498991-898b-4819-8428-c20d87cb7efc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"mean_squared_error 44.2158\n",
"explained_variance_score 0.4864\n",
"mean_absolute_error 4.7126\n",
"mean_squared_error 44.2158\n",
"median_absolute_error 3.7444\n",
"r2_score 0.4862\n"
]
}
],
"source": [
"me.get_regression_metrics(y_true_train, y_pred_train)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "39854c12-3af0-451f-815f-b03a107b9ed8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"mean_squared_error 40.2137\n",
"explained_variance_score 0.5025\n",
"mean_absolute_error 4.4555\n",
"mean_squared_error 40.2137\n",
"median_absolute_error 3.4952\n",
"r2_score 0.501\n"
]
}
],
"source": [
"me.get_regression_metrics(y_true_test, y_pred_test)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "6c15fc40-80c2-4af6-88e7-6b4f8b3c6460",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>clf</th>\n",
" <th>train</th>\n",
" <th>test</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>MLPRegressor</td>\n",
" <td>0.4862</td>\n",
" <td>0.501</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"append_results('MLPRegressor', y_true_train, y_pred_train, y_true_test, y_pred_test)"
]
},
{
"cell_type": "markdown",
"id": "47eb5140-b24e-4f59-90b2-1222d8fbdc69",
"metadata": {},
"source": [
"-------------------------\n",
"### Fit a neural network regressor w feature scaling"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "59c6dd4d-d033-468c-b003-5c4c31c9a453",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\darry\\anaconda3\\envs\\datakick\\lib\\site-packages\\sklearn\\neural_network\\_multilayer_perceptron.py:614: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"Pipeline(steps=[('scaler', StandardScaler()), ('mlp', MLPRegressor())])"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.random.seed(47)\n",
"\n",
"pipe = Pipeline([\n",
" ('scaler', StandardScaler()), \n",
" ('mlp', MLPRegressor())\n",
"])\n",
"\n",
"pipe.fit(X_train, y_true_train)"
]
},
{
"cell_type": "markdown",
"id": "41b17011-7e1c-467f-929e-919a46a33c5a",
"metadata": {},
"source": [
"-------------------------\n",
"### Generate predictions using the fitted model"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "82889e37-0b0f-4a9d-be2f-245c6c0ad39c",
"metadata": {},
"outputs": [],
"source": [
"y_pred_train = pipe.predict(X_train.values)\n",
"y_pred_test = pipe.predict(X_test.values)"
]
},
{
"cell_type": "markdown",
"id": "7ea7f393-042e-4775-9acb-46d26a7fb9f0",
"metadata": {},
"source": [
"-------------------------\n",
"### Get model performance metrics"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "35b5db05-0983-4587-8e4c-5de3f8e567dd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"mean_squared_error 25.5842\n",
"explained_variance_score 0.7097\n",
"mean_absolute_error 3.7858\n",
"mean_squared_error 25.5842\n",
"median_absolute_error 3.2032\n",
"r2_score 0.7027\n"
]
}
],
"source": [
"me.get_regression_metrics(y_true_train, y_pred_train)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "7122dbdb-92fe-4989-9005-e6b4ae1e85cd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"mean_squared_error 38.5706\n",
"explained_variance_score 0.5617\n",
"mean_absolute_error 4.4245\n",
"mean_squared_error 38.5706\n",
"median_absolute_error 3.5699\n",
"r2_score 0.5214\n"
]
}
],
"source": [
"me.get_regression_metrics(y_true_test, y_pred_test)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "7391ab80-2ffd-4dd4-9dde-f7116502794d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>clf</th>\n",
" <th>train</th>\n",
" <th>test</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>MLPRegressor</td>\n",
" <td>0.4862</td>\n",
" <td>0.5010</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>MLPRegressor w Scaler</td>\n",
" <td>0.7027</td>\n",
" <td>0.5214</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"append_results('MLPRegressor w Scaler', y_true_train, y_pred_train, y_true_test, y_pred_test)"
]
},
{
"cell_type": "markdown",
"id": "eef0af50-f27d-42ff-bf96-fb9e23a7e5c2",
"metadata": {},
"source": [
"-------------------------\n",
"### Fit a neural network regressor w feature scaling and appended PCA components"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "1fd42e70-db38-4397-9e79-c16c30511543",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\darry\\anaconda3\\envs\\datakick\\lib\\site-packages\\sklearn\\neural_network\\_multilayer_perceptron.py:614: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"Pipeline(steps=[('feat',\n",
" FeatureUnion(transformer_list=[('scaler', StandardScaler()),\n",
" ('pca', PCA())])),\n",
" ('mlpr', MLPRegressor())])"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.random.seed(47)\n",
"\n",
"pipe = Pipeline(steps=[\n",
" ('feat', FeatureUnion(transformer_list=[\n",
" ('scaler', StandardScaler()),\n",
" ('pca', PCA()),\n",
" ])),\n",
" ('mlpr', MLPRegressor())\n",
"])\n",
"\n",
"pipe.fit(X_train, y_true_train)"
]
},
{
"cell_type": "markdown",
"id": "0dd082d9-52f3-4532-afa7-18028c399bfd",
"metadata": {},
"source": [
"-------------------------\n",
"### Generate predictions using the fitted model"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "b4b87d68-ad5a-4d75-aa54-14444d3405f4",
"metadata": {},
"outputs": [],
"source": [
"y_pred_train = pipe.predict(X_train.values)\n",
"y_pred_test = pipe.predict(X_test.values)"
]
},
{
"cell_type": "markdown",
"id": "a8c884b3-fd50-4d8a-a3c1-3cb0485d8b03",
"metadata": {},
"source": [
"-------------------------\n",
"### Get model performance metrics"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "b2acc7e6-f9a5-4f4d-945c-de4d370e8369",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"mean_squared_error 19.5491\n",
"explained_variance_score 0.7754\n",
"mean_absolute_error 3.3399\n",
"mean_squared_error 19.5491\n",
"median_absolute_error 2.694\n",
"r2_score 0.7728\n"
]
}
],
"source": [
"me.get_regression_metrics(y_true_train, y_pred_train)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "c17b3dc1-daf8-4aff-9e05-daaa1111b51f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"mean_squared_error 24.8428\n",
"explained_variance_score 0.712\n",
"mean_absolute_error 3.6282\n",
"mean_squared_error 24.8428\n",
"median_absolute_error 2.9865\n",
"r2_score 0.6918\n"
]
}
],
"source": [
"me.get_regression_metrics(y_true_test, y_pred_test)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "44dda43b-48c3-4975-9365-e8fedf29bcde",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>clf</th>\n",
" <th>train</th>\n",
" <th>test</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>MLPRegressor</td>\n",
" <td>0.4862</td>\n",
" <td>0.5010</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>MLPRegressor w Scaler</td>\n",
" <td>0.7027</td>\n",
" <td>0.5214</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>MLPRegressor w Scaler &amp; PCA features</td>\n",
" <td>0.7728</td>\n",
" <td>0.6918</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"append_results('MLPRegressor w Scaler & PCA features', y_true_train, y_pred_train, y_true_test, y_pred_test)"
]
},
{
"cell_type": "markdown",
"id": "ace0bb95-78ae-471f-bbe6-b473d0d2cc87",
"metadata": {},
"source": [
"-------------------------\n",
"### Fit a neural network regressor w feature scaling, appended PCA components & appended encoded clusters"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "1226b462-e27d-412e-abab-94442d5e0127",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\darry\\anaconda3\\envs\\datakick\\lib\\site-packages\\sklearn\\neural_network\\_multilayer_perceptron.py:614: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"Pipeline(steps=[('feat',\n",
" FeatureUnion(transformer_list=[('onehot',\n",
" Pipeline(steps=[('kmeans',\n",
" transform_predict(clf=KMeans(n_clusters=6))),\n",
" ('onehot',\n",
" OneHotEncoder())])),\n",
" ('scaler', StandardScaler()),\n",
" ('pca', PCA())])),\n",
" ('mlpr', MLPRegressor())])"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.random.seed(47)\n",
"\n",
"pipe = Pipeline(steps=[\n",
" ('feat', FeatureUnion(transformer_list=[\n",
" ('onehot', Pipeline(steps=[\n",
" ('kmeans', md.transform_predict(KMeans(n_clusters=6))),\n",
" ('onehot', OneHotEncoder(categories='auto')) \n",
" ])),\n",
" ('scaler', StandardScaler()),\n",
" ('pca', PCA())\n",
" ])),\n",
" ('mlpr', MLPRegressor())\n",
"])\n",
"\n",
"pipe.fit(X_train, y_true_train)"
]
},
{
"cell_type": "markdown",
"id": "07c4a038-0204-4c50-91bf-ff938190286b",
"metadata": {},
"source": [
"-------------------------\n",
"### Generate predictions using the fitted model"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "e7560a07-8ed1-48c9-a750-518fefd2edc3",
"metadata": {},
"outputs": [],
"source": [
"y_pred_train = pipe.predict(X_train.values)\n",
"y_pred_test = pipe.predict(X_test.values)"
]
},
{
"cell_type": "markdown",
"id": "d0d434eb-4e00-4d35-a08a-f319516553ee",
"metadata": {},
"source": [
"-------------------------\n",
"### Get model performance metrics"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "3506c06a-b174-4a79-9e80-fc52250a51c7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"mean_squared_error 12.8069\n",
"explained_variance_score 0.8521\n",
"mean_absolute_error 2.6122\n",
"mean_squared_error 12.8069\n",
"median_absolute_error 1.9178\n",
"r2_score 0.8512\n"
]
}
],
"source": [
"me.get_regression_metrics(y_true_train, y_pred_train)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "13b554cb-579a-4c88-a0c4-efd192e956ea",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"mean_squared_error 20.7287\n",
"explained_variance_score 0.7501\n",
"mean_absolute_error 3.244\n",
"mean_squared_error 20.7287\n",
"median_absolute_error 2.5522\n",
"r2_score 0.7428\n"
]
}
],
"source": [
"me.get_regression_metrics(y_true_test, y_pred_test)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "f346a93c-b32d-466c-ac00-af4c5347917b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>clf</th>\n",
" <th>train</th>\n",
" <th>test</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>MLPRegressor</td>\n",
" <td>0.4862</td>\n",
" <td>0.5010</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>MLPRegressor w Scaler</td>\n",
" <td>0.7027</td>\n",
" <td>0.5214</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>MLPRegressor w Scaler &amp; PCA features</td>\n",
" <td>0.7728</td>\n",
" <td>0.6918</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>MLPRegressor w Scaler, PCA &amp; Encoded cluster features</td>\n",
" <td>0.8512</td>\n",
" <td>0.7428</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"append_results('MLPRegressor w Scaler, PCA & Encoded cluster features', y_true_train, y_pred_train, y_true_test, y_pred_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bf4e854b-d650-4bf5-8a10-247a3bc4b01b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment