buswedg/helpers.py

## helpers.py
import numpy as np
import sklearn.base
from sklearn import metrics


class transform_predict(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    """Adapter that exposes a wrapped estimator's ``predict`` output as a transform.

    ``fit`` trains the wrapped estimator and ``transform`` returns its
    predictions, so the predictions can be used as the output of a
    transformer step (for example inside a pipeline).
    """

    def __init__(self, clf: sklearn.base.BaseEstimator):
        """Store the estimator to wrap.

        Args:
            clf: Object providing ``fit`` and ``predict``.
        """
        self.clf = clf

    def fit(self, *args, **kwargs):
        """Fit the wrapped estimator, forwarding every argument unchanged.

        Returns:
            This transformer, so calls can be chained.
        """
        self.clf.fit(*args, **kwargs)
        return self

    def transform(self, X: np.ndarray, **transform_params):
        """Return the wrapped estimator's predictions for ``X``.

        Args:
            X: Samples handed straight to ``clf.predict``.
            **transform_params: Accepted but not used.

        Returns:
            The predictions. A one-dimensional result is reshaped into a
            single column (``reshape(-1, 1)``); any other shape is returned
            as is.
        """
        predictions = self.clf.predict(X)

        # Anything that is not a flat vector is passed through untouched.
        if len(predictions.shape) != 1:
            return predictions
        return predictions.reshape(-1, 1)


class transform_predict_proba(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    """Adapter that exposes a wrapped classifier's ``predict_proba`` output as a transform.

    ``fit`` trains the wrapped classifier and ``transform`` returns its
    probability output, optionally without the first column.
    """

    def __init__(self, clf: sklearn.base.ClassifierMixin, drop: bool = True):
        """Store the classifier to wrap and the column-dropping flag.

        Args:
            clf: Object providing ``fit`` and ``predict_proba``.
            drop: When true, ``transform`` omits the first column of the
                probability output.
        """
        self.clf = clf
        self.drop = drop

    def fit(self, *args, **kwargs):
        """Fit the wrapped classifier, forwarding every argument unchanged.

        Returns:
            This transformer, so calls can be chained.
        """
        self.clf.fit(*args, **kwargs)
        return self

    def transform(self, X: np.ndarray, **transform_params):
        """Return the wrapped classifier's probability output for ``X``.

        Args:
            X: Samples handed straight to ``clf.predict_proba``.
            **transform_params: Accepted but not used.

        Returns:
            The ``predict_proba`` result, with its first column removed when
            ``self.drop`` is true.
        """
        probabilities = self.clf.predict_proba(X)

        if self.drop:
            # NOTE(review): presumably the first column is dropped because it
            # is redundant with the remaining ones — confirm with the author.
            return probabilities[:, 1:]
        return probabilities


def get_regression_metrics(y_true, y_pred):
    """Print a summary of regression metrics, each rounded to 4 decimals.

    One line is written to stdout per metric, formatted as
    ``<metric name> <value>``. Nothing is returned.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, passed together with ``y_true`` to
            each ``sklearn.metrics`` function.
    """
    # Each metric appears exactly once, so no score is computed or printed twice.
    reported_metrics = (
        ('mean_squared_error', metrics.mean_squared_error),
        ('explained_variance_score', metrics.explained_variance_score),
        ('mean_absolute_error', metrics.mean_absolute_error),
        ('median_absolute_error', metrics.median_absolute_error),
        ('r2_score', metrics.r2_score),
    )

    for label, score_fn in reported_metrics:
        print(label, np.round(score_fn(y_true, y_pred), 4))

## numeric_prediction_using_pipelines.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "64c31424-bcca-49c4-9932-78216c734ab7",
   "metadata": {},
   "source": [
    "# Numerical Prediction using Pipelines"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6629109c-4437-4570-be70-f83766eb860a",
   "metadata": {},
   "source": [
    "#### Description:"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ec759365-4c71-4846-80d7-3550f0b1a154",
   "metadata": {},
   "source": [
    "This codebook covers how to use pipelines with pre-processing steps to make a numerical prediction."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "70a24692-576c-4684-a5ec-58afdd179a3a",
   "metadata": {},
   "source": [
    "#### Skill level:"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "83b78e91-61f3-4636-9dab-c8fc2b1dde76",
   "metadata": {},
   "source": [
    "- Advanced"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "12b5d8a4-4295-4377-b667-417b2d7f967b",
   "metadata": {},
   "source": [
    "-------------------------\n",
    "### Import the required libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "68fe0efd-33b4-492c-a271-af6a1b958adc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "platform_path = os.path.abspath(os.path.join(os.path.abspath(''), '../../../'))\n",
    "sys.path.append(platform_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5bbb75ed-bc12-455c-bcdb-7c4e967d189f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import HELPERS.machine_learning.model_development as md\n",
    "import HELPERS.machine_learning.model_evaluation as me\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.neural_network import MLPRegressor\n",
    "from sklearn.pipeline import Pipeline, FeatureUnion\n",
    "from sklearn import metrics\n",
    "from IPython.display import display, HTML"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d39d190c-950e-4bc2-a482-bd558e1e01c6",
   "metadata": {},
   "source": [
    "-------------------------\n",
    "### Read data into a dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "6c740939-dfba-499c-adf1-d5582bf8bdbe",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_raw = pd.read_csv(os.path.join(platform_path, 'DATA/boston.txt'))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "52c08cdb-917d-4815-ab1e-484e7a7b5745",
   "metadata": {},
   "source": [
    "-------------------------\n",
    "### Check shape and head of the dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c0a1cab4-32a7-4fb9-ab66-e1e8dbfa48e8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(506, 14)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_raw.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e2e82666-63f2-4c17-90f7-25cf10fbdb2c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>CRIM</th>\n",
       "      <th>ZN</th>\n",
       "      <th>INDUS</th>\n",
       "      <th>CHAS</th>\n",
       "      <th>NOX</th>\n",
       "      <th>RM</th>\n",
       "      <th>AGE</th>\n",
       "      <th>DIS</th>\n",
       "      <th>RAD</th>\n",
       "      <th>TAX</th>\n",
       "      <th>PTRATIO</th>\n",
       "      <th>B</th>\n",
       "      <th>LSTAT</th>\n",
       "      <th>MDEV</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.00632</td>\n",
       "      <td>18.0</td>\n",
       "      <td>2.31</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.538</td>\n",
       "      <td>6.575</td>\n",
       "      <td>65.2</td>\n",
       "      <td>4.0900</td>\n",
       "      <td>1.0</td>\n",
       "      <td>296.0</td>\n",
       "      <td>15.3</td>\n",
       "      <td>396.90</td>\n",
       "      <td>4.98</td>\n",
       "      <td>24.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.02731</td>\n",
       "      <td>0.0</td>\n",
       "      <td>7.07</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.469</td>\n",
       "      <td>6.421</td>\n",
       "      <td>78.9</td>\n",
       "      <td>4.9671</td>\n",
       "      <td>2.0</td>\n",
       "      <td>242.0</td>\n",
       "      <td>17.8</td>\n",
       "      <td>396.90</td>\n",
       "      <td>9.14</td>\n",
       "      <td>21.6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.02729</td>\n",
       "      <td>0.0</td>\n",
       "      <td>7.07</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.469</td>\n",
       "      <td>7.185</td>\n",
       "      <td>61.1</td>\n",
       "      <td>4.9671</td>\n",
       "      <td>2.0</td>\n",
       "      <td>242.0</td>\n",
       "      <td>17.8</td>\n",
       "      <td>392.83</td>\n",
       "      <td>4.03</td>\n",
       "      <td>34.7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.03237</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.18</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.458</td>\n",
       "      <td>6.998</td>\n",
       "      <td>45.8</td>\n",
       "      <td>6.0622</td>\n",
       "      <td>3.0</td>\n",
       "      <td>222.0</td>\n",
       "      <td>18.7</td>\n",
       "      <td>394.63</td>\n",
       "      <td>2.94</td>\n",
       "      <td>33.4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.06905</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.18</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.458</td>\n",
       "      <td>7.147</td>\n",
       "      <td>54.2</td>\n",
       "      <td>6.0622</td>\n",
       "      <td>3.0</td>\n",
       "      <td>222.0</td>\n",
       "      <td>18.7</td>\n",
       "      <td>396.90</td>\n",
       "      <td>5.33</td>\n",
       "      <td>36.2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \\\n",
       "0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   \n",
       "1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   \n",
       "2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   \n",
       "3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   \n",
       "4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   \n",
       "\n",
       "   PTRATIO       B  LSTAT  MDEV  \n",
       "0     15.3  396.90   4.98  24.0  \n",
       "1     17.8  396.90   9.14  21.6  \n",
       "2     17.8  392.83   4.03  34.7  \n",
       "3     18.7  394.63   2.94  33.4  \n",
       "4     18.7  396.90   5.33  36.2  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_raw.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f05909f4-a544-4fd7-bf79-9e6631657767",
   "metadata": {},
   "source": [
    "-------------------------\n",
    "### Create a dataframe and a helper function to store results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "03de3165-94ef-427f-841f-f39975b4a982",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_results = pd.DataFrame([], columns=['clf', 'train', 'test'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "9cceebb5-ab4a-4377-8903-2cdc753623df",
   "metadata": {},
   "outputs": [],
   "source": [
    "def append_results(clf, y_true_train, y_pred_train, y_true_test, y_pred_test):\n",
    "    r2_score_train = np.round(metrics.r2_score(y_true_train, y_pred_train), 4)\n",
    "    r2_score_test = np.round(metrics.r2_score(y_true_test, y_pred_test), 4)\n",
    "\n",
    "    df_results.loc[len(df_results)] = [clf, r2_score_train, r2_score_test]\n",
    "    \n",
    "    display(HTML(df_results.to_html()))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b98b4e1e-7cd7-4891-9f85-761a5c5020eb",
   "metadata": {},
   "source": [
    "-------------------------\n",
    "### Separate features from the label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "4d0c8404-04e2-4ae9-a196-7cc962d4497d",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_all = df_raw.drop('MDEV', axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "9555d9f9-ffb5-46dd-879f-4ce51e5b75d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_true_all = df_raw[['MDEV']].values.ravel()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c2b1f5f5-8ff4-4f73-9860-060c6eba8027",
   "metadata": {},
   "source": [
    "-------------------------\n",
    "### Make a split between training and test sets of data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "95d0c2f0-aa10-4ab1-a152-383e23591a3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(47)\n",
    "\n",
    "X_train, X_test, y_true_train, y_true_test = train_test_split(X_all, y_true_all, test_size=0.3)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3fb55b29-5d8c-4f7b-82d2-936f90a7af03",
   "metadata": {},
   "source": [
    "-------------------------\n",
    "### Fit a neural network regressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "35ba47a8-f503-4d32-8430-c830d03e74aa",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\darry\\anaconda3\\envs\\datakick\\lib\\site-packages\\sklearn\\neural_network\\_multilayer_perceptron.py:614: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "MLPRegressor()"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.random.seed(47)\n",
    "\n",
    "clf = MLPRegressor()\n",
    "\n",
    "clf.fit(X_train, y_true_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d4651008-c747-4f2f-8cd3-adaf3f825d4a",
   "metadata": {},
   "source": [
    "-------------------------\n",
    "### Generate predictions using the fitted model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "ec8402de-9cbd-4fd4-9cf6-e474694b7963",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred_train = clf.predict(X_train.values)\n",
    "y_pred_test = clf.predict(X_test.values)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fa030a3d-d8e3-4b83-9fee-27fdcc1bc0cb",
   "metadata": {},
   "source": [
    "-------------------------\n",
    "### Get model performance metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "94498991-898b-4819-8428-c20d87cb7efc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mean_squared_error 44.2158\n",
      "explained_variance_score 0.4864\n",
      "mean_absolute_error 4.7126\n",
      "mean_squared_error 44.2158\n",
      "median_absolute_error 3.7444\n",
      "r2_score 0.4862\n"
     ]
    }
   ],
   "source": [
    "me.get_regression_metrics(y_true_train, y_pred_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "39854c12-3af0-451f-815f-b03a107b9ed8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mean_squared_error 40.2137\n",
      "explained_variance_score 0.5025\n",
      "mean_absolute_error 4.4555\n",
      "mean_squared_error 40.2137\n",
      "median_absolute_error 3.4952\n",
      "r2_score 0.501\n"
     ]
    }
   ],
   "source": [
    "me.get_regression_metrics(y_true_test, y_pred_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "6c15fc40-80c2-4af6-88e7-6b4f8b3c6460",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>clf</th>\n",
       "      <th>train</th>\n",
       "      <th>test</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>MLPRegressor</td>\n",
       "      <td>0.4862</td>\n",
       "      <td>0.501</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "append_results('MLPRegressor', y_true_train, y_pred_train, y_true_test, y_pred_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "47eb5140-b24e-4f59-90b2-1222d8fbdc69",
   "metadata": {},
   "source": [
    "-------------------------\n",
    "### Fit a neural network regressor with feature scaling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "59c6dd4d-d033-468c-b003-5c4c31c9a453",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\darry\\anaconda3\\envs\\datakick\\lib\\site-packages\\sklearn\\neural_network\\_multilayer_perceptron.py:614: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Pipeline(steps=[('scaler', StandardScaler()), ('mlp', MLPRegressor())])"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.random.seed(47)\n",
    "\n",
    "pipe = Pipeline([\n",
    "    ('scaler', StandardScaler()), \n",
    "    ('mlp', MLPRegressor())\n",
    "])\n",
    "\n",
    "pipe.fit(X_train, y_true_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "41b17011-7e1c-467f-929e-919a46a33c5a",
   "metadata": {},
   "source": [
    "-------------------------\n",
    "### Generate predictions using the fitted model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "82889e37-0b0f-4a9d-be2f-245c6c0ad39c",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred_train = pipe.predict(X_train.values)\n",
    "y_pred_test = pipe.predict(X_test.values)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7ea7f393-042e-4775-9acb-46d26a7fb9f0",
   "metadata": {},
   "source": [
    "-------------------------\n",
    "### Get model performance metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "35b5db05-0983-4587-8e4c-5de3f8e567dd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mean_squared_error 25.5842\n",
      "explained_variance_score 0.7097\n",
      "mean_absolute_error 3.7858\n",
      "mean_squared_error 25.5842\n",
      "median_absolute_error 3.2032\n",
      "r2_score 0.7027\n"
     ]
    }
   ],
   "source": [
    "me.get_regression_metrics(y_true_train, y_pred_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "7122dbdb-92fe-4989-9005-e6b4ae1e85cd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mean_squared_error 38.5706\n",
      "explained_variance_score 0.5617\n",
      "mean_absolute_error 4.4245\n",
      "mean_squared_error 38.5706\n",
      "median_absolute_error 3.5699\n",
      "r2_score 0.5214\n"
     ]
    }
   ],
   "source": [
    "me.get_regression_metrics(y_true_test, y_pred_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "7391ab80-2ffd-4dd4-9dde-f7116502794d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>clf</th>\n",
       "      <th>train</th>\n",
       "      <th>test</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>MLPRegressor</td>\n",
       "      <td>0.4862</td>\n",
       "      <td>0.5010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>MLPRegressor w Scaler</td>\n",
       "      <td>0.7027</td>\n",
       "      <td>0.5214</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "append_results('MLPRegressor w Scaler', y_true_train, y_pred_train, y_true_test, y_pred_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eef0af50-f27d-42ff-bf96-fb9e23a7e5c2",
   "metadata": {},
   "source": [
    "-------------------------\n",
    "### Fit a neural network regressor with feature scaling and appended PCA components"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "1fd42e70-db38-4397-9e79-c16c30511543",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\darry\\anaconda3\\envs\\datakick\\lib\\site-packages\\sklearn\\neural_network\\_multilayer_perceptron.py:614: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Pipeline(steps=[('feat',\n",
       "                 FeatureUnion(transformer_list=[('scaler', StandardScaler()),\n",
       "                                                ('pca', PCA())])),\n",
       "                ('mlpr', MLPRegressor())])"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.random.seed(47)\n",
    "\n",
    "pipe = Pipeline(steps=[\n",
    "    ('feat', FeatureUnion(transformer_list=[\n",
    "        ('scaler', StandardScaler()),\n",
    "        ('pca', PCA()),\n",
    "    ])),\n",
    "    ('mlpr', MLPRegressor())\n",
    "])\n",
    "\n",
    "pipe.fit(X_train, y_true_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0dd082d9-52f3-4532-afa7-18028c399bfd",
   "metadata": {},
   "source": [
    "-------------------------\n",
    "### Generate predictions using the fitted model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "b4b87d68-ad5a-4d75-aa54-14444d3405f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred_train = pipe.predict(X_train.values)\n",
    "y_pred_test = pipe.predict(X_test.values)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a8c884b3-fd50-4d8a-a3c1-3cb0485d8b03",
   "metadata": {},
   "source": [
    "-------------------------\n",
    "### Get model performance metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "b2acc7e6-f9a5-4f4d-945c-de4d370e8369",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mean_squared_error 19.5491\n",
      "explained_variance_score 0.7754\n",
      "mean_absolute_error 3.3399\n",
      "mean_squared_error 19.5491\n",
      "median_absolute_error 2.694\n",
      "r2_score 0.7728\n"
     ]
    }
   ],
   "source": [
    "me.get_regression_metrics(y_true_train, y_pred_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "c17b3dc1-daf8-4aff-9e05-daaa1111b51f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mean_squared_error 24.8428\n",
      "explained_variance_score 0.712\n",
      "mean_absolute_error 3.6282\n",
      "mean_squared_error 24.8428\n",
      "median_absolute_error 2.9865\n",
      "r2_score 0.6918\n"
     ]
    }
   ],
   "source": [
    "me.get_regression_metrics(y_true_test, y_pred_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "44dda43b-48c3-4975-9365-e8fedf29bcde",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>clf</th>\n",
       "      <th>train</th>\n",
       "      <th>test</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>MLPRegressor</td>\n",
       "      <td>0.4862</td>\n",
       "      <td>0.5010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>MLPRegressor w Scaler</td>\n",
       "      <td>0.7027</td>\n",
       "      <td>0.5214</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>MLPRegressor w Scaler &amp; PCA features</td>\n",
       "      <td>0.7728</td>\n",
       "      <td>0.6918</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "append_results('MLPRegressor w Scaler & PCA features', y_true_train, y_pred_train, y_true_test, y_pred_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ace0bb95-78ae-471f-bbe6-b473d0d2cc87",
   "metadata": {},
   "source": [
    "-------------------------\n",
    "### Fit a neural network regressor with feature scaling, appended PCA components & appended encoded clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "1226b462-e27d-412e-abab-94442d5e0127",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\darry\\anaconda3\\envs\\datakick\\lib\\site-packages\\sklearn\\neural_network\\_multilayer_perceptron.py:614: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Pipeline(steps=[('feat',\n",
       "                 FeatureUnion(transformer_list=[('onehot',\n",
       "                                                 Pipeline(steps=[('kmeans',\n",
       "                                                                  transform_predict(clf=KMeans(n_clusters=6))),\n",
       "                                                                 ('onehot',\n",
       "                                                                  OneHotEncoder())])),\n",
       "                                                ('scaler', StandardScaler()),\n",
       "                                                ('pca', PCA())])),\n",
       "                ('mlpr', MLPRegressor())])"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.random.seed(47)\n",
    "\n",
    "pipe = Pipeline(steps=[\n",
    "    ('feat', FeatureUnion(transformer_list=[\n",
    "        ('onehot', Pipeline(steps=[\n",
    "            ('kmeans', md.transform_predict(KMeans(n_clusters=6))),\n",
    "            ('onehot', OneHotEncoder(categories='auto'))         \n",
    "        ])),\n",
    "        ('scaler', StandardScaler()),\n",
    "        ('pca', PCA())\n",
    "    ])),\n",
    "    ('mlpr', MLPRegressor())\n",
    "])\n",
    "\n",
    "pipe.fit(X_train, y_true_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "07c4a038-0204-4c50-91bf-ff938190286b",
   "metadata": {},
   "source": [
    "-------------------------\n",
    "### Generate predictions using the fitted model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "e7560a07-8ed1-48c9-a750-518fefd2edc3",
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred_train = pipe.predict(X_train.values)\n",
    "y_pred_test = pipe.predict(X_test.values)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d0d434eb-4e00-4d35-a08a-f319516553ee",
   "metadata": {},
   "source": [
    "-------------------------\n",
    "### Get model performance metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "3506c06a-b174-4a79-9e80-fc52250a51c7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mean_squared_error 12.8069\n",
      "explained_variance_score 0.8521\n",
      "mean_absolute_error 2.6122\n",
      "mean_squared_error 12.8069\n",
      "median_absolute_error 1.9178\n",
      "r2_score 0.8512\n"
     ]
    }
   ],
   "source": [
    "me.get_regression_metrics(y_true_train, y_pred_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "13b554cb-579a-4c88-a0c4-efd192e956ea",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mean_squared_error 20.7287\n",
      "explained_variance_score 0.7501\n",
      "mean_absolute_error 3.244\n",
      "mean_squared_error 20.7287\n",
      "median_absolute_error 2.5522\n",
      "r2_score 0.7428\n"
     ]
    }
   ],
   "source": [
    "me.get_regression_metrics(y_true_test, y_pred_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "f346a93c-b32d-466c-ac00-af4c5347917b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>clf</th>\n",
       "      <th>train</th>\n",
       "      <th>test</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>MLPRegressor</td>\n",
       "      <td>0.4862</td>\n",
       "      <td>0.5010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>MLPRegressor w Scaler</td>\n",
       "      <td>0.7027</td>\n",
       "      <td>0.5214</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>MLPRegressor w Scaler &amp; PCA features</td>\n",
       "      <td>0.7728</td>\n",
       "      <td>0.6918</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>MLPRegressor w Scaler, PCA &amp; Encoded cluster features</td>\n",
       "      <td>0.8512</td>\n",
       "      <td>0.7428</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "append_results('MLPRegressor w Scaler, PCA & Encoded cluster features', y_true_train, y_pred_train, y_true_test, y_pred_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bf4e854b-d650-4bf5-8a10-247a3bc4b01b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
	import numpy as np
	import sklearn.base
	from sklearn import metrics


	class transform_predict(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
	    """Adapter that exposes a wrapped estimator's ``predict`` output as a transform.

	    ``fit`` trains the wrapped estimator and ``transform`` returns its
	    predictions, so the predictions can be used as the output of a
	    transformer step.
	    """

	    def __init__(self, clf: sklearn.base.BaseEstimator):
	        """Store the estimator to wrap.

	        Args:
	            clf: Object providing ``fit`` and ``predict``.
	        """
	        self.clf = clf

	    def fit(self, *args, **kwargs):
	        """Fit the wrapped estimator, forwarding every argument unchanged.

	        Returns:
	            This transformer, so calls can be chained.
	        """
	        # Variadic pass-through: positional and keyword arguments both
	        # reach ``clf.fit`` exactly as the caller supplied them.
	        self.clf.fit(*args, **kwargs)

	        return self

	    def transform(self, X: np.ndarray, **transform_params):
	        """Return the wrapped estimator's predictions for ``X``.

	        Args:
	            X: Samples handed straight to ``clf.predict``.
	            **transform_params: Accepted but not used.

	        Returns:
	            The predictions. A one-dimensional result is reshaped into a
	            single column (``reshape(-1, 1)``); any other shape is
	            returned as is.
	        """
	        pred = self.clf.predict(X)

	        return pred.reshape(-1, 1) if len(pred.shape) == 1 else pred


	class transform_predict_proba(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
	    """Adapter that exposes a wrapped classifier's ``predict_proba`` output as a transform.

	    ``fit`` trains the wrapped classifier and ``transform`` returns its
	    probability output, optionally without the first column.
	    """

	    def __init__(self, clf: sklearn.base.ClassifierMixin, drop: bool = True):
	        """Store the classifier to wrap and the column-dropping flag.

	        Args:
	            clf: Object providing ``fit`` and ``predict_proba``.
	            drop: When true, ``transform`` omits the first column of the
	                probability output.
	        """
	        self.clf = clf
	        self.drop = drop

	    def fit(self, *args, **kwargs):
	        """Fit the wrapped classifier, forwarding every argument unchanged.

	        Returns:
	            This transformer, so calls can be chained.
	        """
	        # Variadic pass-through: positional and keyword arguments both
	        # reach ``clf.fit`` exactly as the caller supplied them.
	        self.clf.fit(*args, **kwargs)

	        return self

	    def transform(self, X: np.ndarray, **transform_params):
	        """Return the wrapped classifier's probability output for ``X``.

	        Args:
	            X: Samples handed straight to ``clf.predict_proba``.
	            **transform_params: Accepted but not used.

	        Returns:
	            The ``predict_proba`` result, with its first column removed
	            when ``self.drop`` is true.
	        """
	        pred = self.clf.predict_proba(X)

	        return pred[:, 1:] if self.drop else pred


	def get_regression_metrics(y_true, y_pred):
	    """Print a summary of regression metrics, each rounded to 4 decimals.

	    One line is written to stdout per metric, formatted as
	    ``<metric name> <value>``. Nothing is returned.

	    Args:
	        y_true: Ground-truth target values.
	        y_pred: Predicted target values, passed together with ``y_true``
	            to each ``sklearn.metrics`` function.
	    """
	    # Each metric appears exactly once, so no score is computed or printed twice.
	    reported_metrics = (
	        ('mean_squared_error', metrics.mean_squared_error),
	        ('explained_variance_score', metrics.explained_variance_score),
	        ('mean_absolute_error', metrics.mean_absolute_error),
	        ('median_absolute_error', metrics.median_absolute_error),
	        ('r2_score', metrics.r2_score),
	    )

	    for label, score_fn in reported_metrics:
	        print(label, np.round(score_fn(y_true, y_pred), 4))