@zgulde
Last active June 10, 2021 21:16
{
"cells": [
{
"cell_type": "markdown",
"id": "b7f976f8-ba10-40d6-be47-84f941ff7149",
"metadata": {},
"source": [
"Models:\n",
"\n",
"- LinearRegression\n",
"- LassoLars\n",
"- GLM\n",
"\n",
"All of these work the same from a python perspective.\n",
"\n",
"1. Create the object\n",
"2. `.fit`\n",
"3. `.predict`\n",
"\n",
"PolynomialFeatures is a little different"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "fc51e361-2f57-4710-b58f-90c5edd831b9",
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"import wrangle\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"from sklearn.linear_model import LinearRegression, LassoLars\n",
"from sklearn.preprocessing import PolynomialFeatures\n",
"from sklearn.metrics import mean_squared_error\n",
"\n",
"path = 'https://gist.githubusercontent.com/ryanorsinger/55ccfd2f7820af169baea5aad3a9c60d/raw/da6c5a33307ed7ee207bd119d3361062a1d1c07e/student-mat.csv'\n",
"\n",
"(\n",
" df, X_train_exp, X_train, y_train,\n",
" X_validate, y_validate, X_test, y_test,\n",
") = wrangle.wrangle_student_math(path)"
]
},
{
"cell_type": "markdown",
"id": "94e4fd75-e0b6-4057-b1ad-071463888b40",
"metadata": {},
"source": [
"Q: How do we decide between lassolars and polynomial regression?\n",
"\n",
"A: We measure performance"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "6d9a6f2e-02a6-48b3-a080-68a3b4644ad0",
"metadata": {},
"outputs": [],
"source": [
"baseline = y_train.mean()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "bcbd67d3-6ee9-492a-af19-966c7cfeddf6",
"metadata": {},
"outputs": [],
"source": [
"predictions = pd.DataFrame()\n",
"predictions['actual'] = y_train\n",
"predictions['baseline'] = baseline"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "d6c2056c-3736-42e9-aab3-b560fdf5c22d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>actual</th>\n",
" <th>baseline</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>142</th>\n",
" <td>11</td>\n",
" <td>10.524887</td>\n",
" </tr>\n",
" <tr>\n",
" <th>326</th>\n",
" <td>16</td>\n",
" <td>10.524887</td>\n",
" </tr>\n",
" <tr>\n",
" <th>88</th>\n",
" <td>10</td>\n",
" <td>10.524887</td>\n",
" </tr>\n",
" <tr>\n",
" <th>118</th>\n",
" <td>8</td>\n",
" <td>10.524887</td>\n",
" </tr>\n",
" <tr>\n",
" <th>312</th>\n",
" <td>11</td>\n",
" <td>10.524887</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>229</th>\n",
" <td>12</td>\n",
" <td>10.524887</td>\n",
" </tr>\n",
" <tr>\n",
" <th>61</th>\n",
" <td>11</td>\n",
" <td>10.524887</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>11</td>\n",
" <td>10.524887</td>\n",
" </tr>\n",
" <tr>\n",
" <th>243</th>\n",
" <td>12</td>\n",
" <td>10.524887</td>\n",
" </tr>\n",
" <tr>\n",
" <th>166</th>\n",
" <td>10</td>\n",
" <td>10.524887</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>221 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" actual baseline\n",
"142 11 10.524887\n",
"326 16 10.524887\n",
"88 10 10.524887\n",
"118 8 10.524887\n",
"312 11 10.524887\n",
".. ... ...\n",
"229 12 10.524887\n",
"61 11 10.524887\n",
"38 11 10.524887\n",
"243 12 10.524887\n",
"166 10 10.524887\n",
"\n",
"[221 rows x 2 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predictions"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "ceb657c7-608b-4b21-aa56-7ba21cabf1b3",
"metadata": {},
"outputs": [],
"source": [
"model1 = LassoLars(alpha=.1)\n",
"model1.fit(X_train, y_train)\n",
"\n",
"predictions['model1'] = model1.predict(X_train)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "89863e2c-b452-4e36-a107-9eec411f1fe0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>actual</th>\n",
" <th>baseline</th>\n",
" <th>model1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>142</th>\n",
" <td>11</td>\n",
" <td>10.524887</td>\n",
" <td>10.607736</td>\n",
" </tr>\n",
" <tr>\n",
" <th>326</th>\n",
" <td>16</td>\n",
" <td>10.524887</td>\n",
" <td>13.424600</td>\n",
" </tr>\n",
" <tr>\n",
" <th>88</th>\n",
" <td>10</td>\n",
" <td>10.524887</td>\n",
" <td>9.903520</td>\n",
" </tr>\n",
" <tr>\n",
" <th>118</th>\n",
" <td>8</td>\n",
" <td>10.524887</td>\n",
" <td>7.790872</td>\n",
" </tr>\n",
" <tr>\n",
" <th>312</th>\n",
" <td>11</td>\n",
" <td>10.524887</td>\n",
" <td>10.607736</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>229</th>\n",
" <td>12</td>\n",
" <td>10.524887</td>\n",
" <td>9.903520</td>\n",
" </tr>\n",
" <tr>\n",
" <th>61</th>\n",
" <td>11</td>\n",
" <td>10.524887</td>\n",
" <td>8.495088</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>11</td>\n",
" <td>10.524887</td>\n",
" <td>11.311952</td>\n",
" </tr>\n",
" <tr>\n",
" <th>243</th>\n",
" <td>12</td>\n",
" <td>10.524887</td>\n",
" <td>11.311952</td>\n",
" </tr>\n",
" <tr>\n",
" <th>166</th>\n",
" <td>10</td>\n",
" <td>10.524887</td>\n",
" <td>9.903520</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>221 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" actual baseline model1\n",
"142 11 10.524887 10.607736\n",
"326 16 10.524887 13.424600\n",
"88 10 10.524887 9.903520\n",
"118 8 10.524887 7.790872\n",
"312 11 10.524887 10.607736\n",
".. ... ... ...\n",
"229 12 10.524887 9.903520\n",
"61 11 10.524887 8.495088\n",
"38 11 10.524887 11.311952\n",
"243 12 10.524887 11.311952\n",
"166 10 10.524887 9.903520\n",
"\n",
"[221 rows x 3 columns]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predictions"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "82f046c0-d29d-4294-b365-11627fe21722",
"metadata": {},
"outputs": [],
"source": [
"# with polynomial features, we generally want to have a small amount of features\n",
"# before transformation"
]
},
{
"cell_type": "markdown",
"id": "e4a5ba5f-fdc6-4c5d-9e04-6499d2289036",
"metadata": {},
"source": [
"y ~ x1 + x2\n",
"\n",
"y ~ x1^2 + x1*x2 + x2^2 + x1 + x2"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "5513e779-9f5d-4997-92ab-3eae37eee4aa",
"metadata": {},
"outputs": [],
"source": [
"X_train_poly = X_train[['age', 'G1', 'G2']]\n",
"\n",
"poly = PolynomialFeatures()\n",
"poly.fit(X_train_poly)\n",
"X_train_transformed = poly.transform(X_train_poly)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "76ee0fd5-1643-4116-b7b2-e8db6a228f9b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LinearRegression()"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model2 = LinearRegression()\n",
"model2.fit(X_train_transformed, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "651c3ef9-a184-4a74-a198-eebe52d3289b",
"metadata": {},
"outputs": [],
"source": [
"predictions['model2'] = model2.predict(X_train_transformed)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "ee6efff8-43ee-4ace-8391-7acfaa2246e6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>actual</th>\n",
" <th>baseline</th>\n",
" <th>model1</th>\n",
" <th>model2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>142</th>\n",
" <td>11</td>\n",
" <td>10.524887</td>\n",
" <td>10.607736</td>\n",
" <td>10.738603</td>\n",
" </tr>\n",
" <tr>\n",
" <th>326</th>\n",
" <td>16</td>\n",
" <td>10.524887</td>\n",
" <td>13.424600</td>\n",
" <td>15.106548</td>\n",
" </tr>\n",
" <tr>\n",
" <th>88</th>\n",
" <td>10</td>\n",
" <td>10.524887</td>\n",
" <td>9.903520</td>\n",
" <td>9.909767</td>\n",
" </tr>\n",
" <tr>\n",
" <th>118</th>\n",
" <td>8</td>\n",
" <td>10.524887</td>\n",
" <td>7.790872</td>\n",
" <td>6.541034</td>\n",
" </tr>\n",
" <tr>\n",
" <th>312</th>\n",
" <td>11</td>\n",
" <td>10.524887</td>\n",
" <td>10.607736</td>\n",
" <td>10.772394</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" actual baseline model1 model2\n",
"142 11 10.524887 10.607736 10.738603\n",
"326 16 10.524887 13.424600 15.106548\n",
"88 10 10.524887 9.903520 9.909767\n",
"118 8 10.524887 7.790872 6.541034\n",
"312 11 10.524887 10.607736 10.772394"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predictions.head()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "b300c8b3-6e9d-40ae-a217-c1185a92a3b5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train RMSE Baseline\n"
]
},
{
"data": {
"text/plain": [
"4.498925523895268"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('Train RMSE Baseline')\n",
"math.sqrt(mean_squared_error(predictions.actual, predictions.baseline))"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "769da73f-44c9-4597-a5d1-de363aa6f401",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train RMSE model1\n"
]
},
{
"data": {
"text/plain": [
"2.396115747537386"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('Train RMSE model1')\n",
"math.sqrt(mean_squared_error(predictions.actual, predictions.model1))"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "474038cf-8aad-4e6b-b00d-54541b7406c3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train RMSE model2\n"
]
},
{
"data": {
"text/plain": [
"1.8184368425448687"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('Train RMSE model2')\n",
"math.sqrt(mean_squared_error(predictions.actual, predictions.model2))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

Data Science Pipeline

When working on a project, we probably go through the pipeline multiple times.

First pass: our MVP

  1. Acquire: whatever SQL query gives us workable data
  2. Prepare: drop nulls, data split
  3. Explore: visualize the target against independent variables
  4. Model: baseline, LinearRegression, LassoLars; compare performance with RMSE on validate

NB: at this stage we're not worried about scaling or automated feature engineering
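The first-pass Model step can be sketched roughly like this. The data and column names below are synthetic stand-ins, but the shape of the comparison — a mean baseline against a fitted model, scored with RMSE on validate — matches the notebook above.

```python
# Hypothetical sketch of the MVP modeling pass: mean baseline vs.
# LinearRegression, compared with RMSE on a validate split.
# The data here is synthetic; real work would use the wrangled splits.
import math

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
df = pd.DataFrame({'x': rng.normal(size=200)})
df['y'] = 3 * df.x + rng.normal(scale=0.5, size=200)

train, validate = train_test_split(df, random_state=0)

# baseline: predict the training-set mean everywhere
baseline = train.y.mean()
baseline_rmse = math.sqrt(
    mean_squared_error(validate.y, [baseline] * len(validate)))

# model: the same create / .fit / .predict pattern as in the notebook
model = LinearRegression()
model.fit(train[['x']], train.y)
model_rmse = math.sqrt(
    mean_squared_error(validate.y, model.predict(validate[['x']])))

print(f'baseline RMSE: {baseline_rmse:.3f}')
print(f'model RMSE:    {model_rmse:.3f}')
```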

Second pass:

  • focus on modeling: let's scale our data in prepare and then try out the models on the scaled data
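Scaling in the prepare step might look like the sketch below. MinMaxScaler is just one reasonable choice, and the tiny frames are made up; the key point is fitting the scaler on train only and reusing it for validate.

```python
# Sketch of scaling in prepare: fit the scaler on train only, then
# transform train and validate with that same fitted scaler.
# The column names and values here are made up for illustration.
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

X_train = pd.DataFrame({'age': [15, 16, 17, 18], 'G1': [8, 10, 12, 14]})
X_validate = pd.DataFrame({'age': [16, 19], 'G1': [9, 15]})

scaler = MinMaxScaler()
scaler.fit(X_train)  # learn min/max from train only

X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), columns=X_train.columns)
X_validate_scaled = pd.DataFrame(
    scaler.transform(X_validate), columns=X_validate.columns)

# train columns now lie in [0, 1]; validate values can fall outside
print(X_train_scaled.min().tolist(), X_train_scaled.max().tolist())
```

Fitting on validate (or on the whole dataset) would leak information about out-of-sample values into the scaling.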

Third pass:

  • let's look at null values more closely and impute instead of dropping
  • rerun models to see if performance changes
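Imputation in the third pass could be sketched like this; a median SimpleImputer is one common strategy, and the frame below is invented for illustration.

```python
# Sketch of the third pass: fill nulls instead of dropping rows.
# SimpleImputer replaces each NaN with a per-column statistic.
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

df = pd.DataFrame({
    'G1': [8.0, np.nan, 12.0, 10.0],
    'G2': [9.0, 11.0, np.nan, 10.0],
})

imputer = SimpleImputer(strategy='median')
filled = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# every NaN is now the column median, so no rows are lost
print(filled.isna().sum().sum())
```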

Fourth pass:

  • let's do more exploration and visualize more variable interactions
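A quick numeric complement to pairwise plots (e.g. `pandas.plotting.scatter_matrix` or seaborn's `pairplot`) is a correlation matrix. The columns below are made up to stand in for the student-math data.

```python
# Sketch of a fourth-pass exploration step: check pairwise interactions
# numerically with a correlation matrix before picking what to visualize.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({'G1': rng.integers(0, 20, size=100)})
df['G2'] = df.G1 + rng.integers(-2, 3, size=100)  # strongly tied to G1
df['absences'] = rng.integers(0, 30, size=100)    # unrelated noise

corr = df.corr()
print(corr.round(2))
```

Pairs with strong correlations are the natural candidates for closer visualization and statistical testing.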

....

Eventually

  1. Acquire: a fancier SQL query with joins
  2. Prepare: handle nulls, handle outliers, scale data, split data
  3. Explore: multiple visualizations of interactions between the independent variables and of drivers of the target, plus statistical tests
  4. Modeling: try out multiple different model types with different hyperparameters
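The eventual modeling step — multiple model types with different hyperparameters — can be sketched as a loop that keeps whichever candidate scores the lowest validate RMSE. The alpha grid and data below are made up for illustration.

```python
# Sketch of the mature modeling pass: fit several candidates on train,
# score each on validate with RMSE, and keep the best performer.
import math

import numpy as np
from sklearn.linear_model import LassoLars, LinearRegression
from sklearn.metrics import mean_squared_error

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = X @ np.array([2.0, 0.0, -1.0]) + rng.normal(scale=0.3, size=200)
X_train, y_train = X[:150], y[:150]
X_validate, y_validate = X[150:], y[150:]

# candidate models: plain OLS plus LassoLars at a few alphas
candidates = [LinearRegression()] + [
    LassoLars(alpha=a) for a in (0.01, 0.1, 1.0)]

results = []
for model in candidates:
    model.fit(X_train, y_train)
    rmse = math.sqrt(
        mean_squared_error(y_validate, model.predict(X_validate)))
    results.append((rmse, model))

best_rmse, best_model = min(results, key=lambda t: t[0])
print(type(best_model).__name__, round(best_rmse, 3))
```

Only after this comparison would the single chosen model be evaluated once on test.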