Skip to content

Instantly share code, notes, and snippets.

@zgulde
Created June 10, 2021 20:34
Show Gist options
  • Save zgulde/6e5f9269b7d31d4cce5e55f444d2d2ed to your computer and use it in GitHub Desktop.
Save zgulde/6e5f9269b7d31d4cce5e55f444d2d2ed to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "41e5840c-ecab-404e-af46-6e6b1067307f",
"metadata": {},
"source": [
"# Modeling\n",
"\n",
"Regression Models:\n",
"\n",
"- Baseline\n",
"- LinearRegression (OLS)\n",
"- LassoLars\n",
"- GLM: generalized linear models -- fancy math and stats; if the relationship between target and predictors (i.e. features, or independent variables) is non-linear\n",
"- PolynomialFeatures*: non-linear relationship between predictors and target\n",
" - creates interaction terms and squared terms (or even higher degrees)\n",
" - linear model == y ~ x1 + x2 + x3\n",
" - polynomial == y ~ x1^2 + x1*x2 + x2^2 + x2*x3 + x3^2 + x1 + x2 + x3\n",
" - total_charges ~ tenure + monthly_charges\n",
" - total_charges ~ tenure*monthly_charges\n",
"\n",
"What do we do in the modeling stage of the pipeline?\n",
"\n",
"1. Create a model\n",
" - research hyperparameters\n",
" - try out lots of different hyperparameters\n",
" - baseline\n",
" - mean\n",
" - median\n",
"1. Evaluate the model's performance\n",
" - visualize residuals\n",
" - visualize the distribution of model predictions vs actual values\n",
" - fancy dataframes to hold on to predictions\n",
"1. Repeat\n",
"1. Compare model performance\n",
" - fancy dataframes to hold on to predictions\n",
" - separate dataframe to hold regression evaluation metrics\n",
" - visualize different model residuals against each other\n",
"1. Evaluate the best model on test"
]
},
{
"cell_type": "code",
"execution_count": 78,
"id": "bdb1fdf6-b2e0-4ea1-9f8b-8d1577debf69",
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"import wrangle\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"from sklearn.linear_model import LinearRegression, LassoLars\n",
"from sklearn.preprocessing import PolynomialFeatures\n",
"from sklearn.metrics import mean_squared_error"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "1912421b-62f9-4979-91f1-5d6fd0156793",
"metadata": {},
"outputs": [],
"source": [
"path = 'https://gist.githubusercontent.com/ryanorsinger/55ccfd2f7820af169baea5aad3a9c60d/raw/da6c5a33307ed7ee207bd119d3361062a1d1c07e/student-mat.csv'\n",
"\n",
"(\n",
" df, X_train_exp, X_train, y_train,\n",
" X_validate, y_validate, X_test, y_test,\n",
") = wrangle.wrangle_student_math(path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "4a57e8ae-0f87-4a78-bdd5-30cc70cb6b7f",
"metadata": {},
"outputs": [],
"source": [
"baseline = y_train.mean()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0949999c-2b53-4700-afa6-1672b920f99b",
"metadata": {},
"outputs": [],
"source": [
"# Turn y_train into a dataframe so that we can store, \"hold on to\", our models' predictions\n",
"\n",
"y_train = pd.DataFrame({'actual': y_train})"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "1e2e688e-97ad-4b25-9783-f65c3cd15cd2",
"metadata": {},
"outputs": [],
"source": [
"y_train['baseline'] = baseline"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "23432fa6-0140-41d1-a26e-e6b922e61cb9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train RMSE Baseline model:\n"
]
},
{
"data": {
"text/plain": [
"4.498925523895268"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('Train RMSE Baseline model:')\n",
"math.sqrt(mean_squared_error(y_train.actual, y_train.baseline))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7c7121d0-c974-464b-a539-631531b5b58c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LinearRegression()"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# OLS -- ordinary least squares - way the coefficients are calculated\n",
"model2 = LinearRegression()\n",
"# Now that y_train also holds prediction columns, fit against only the actual column\n",
"model2.fit(X_train, y_train.actual)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "60f6c92a-3cdf-43f6-ada0-b35fd685b33d",
"metadata": {},
"outputs": [],
"source": [
"y_train['model2'] = model2.predict(X_train)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "60751519-58ef-43ed-adb7-4b8c534f6421",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train RMSE for model2:\n"
]
},
{
"data": {
"text/plain": [
"1.7503546500121143"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('Train RMSE for model2:')\n",
"math.sqrt(mean_squared_error(y_train.actual, y_train.model2))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "c3e6b99c-1d2e-4b4f-b2cb-665e7a2a83ad",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LassoLars(alpha=0.1)"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model3 = LassoLars(alpha=.1)\n",
"model3.fit(X_train, y_train.actual)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "adbb11ca-520e-4628-b1d2-1640e8ff2d44",
"metadata": {},
"outputs": [],
"source": [
"y_train['model3'] = model3.predict(X_train)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "b35191b7-8ec0-4eac-ade6-d0ebc2a04718",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train RMSE for model3:\n"
]
},
{
"data": {
"text/plain": [
"2.396115747537386"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('Train RMSE for model3:')\n",
"math.sqrt(mean_squared_error(y_train.actual, y_train.model3))"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "32228808-6944-4f7f-9638-8d441bd11f28",
"metadata": {},
"outputs": [],
"source": [
"y_validate = pd.DataFrame({'actual': y_validate})"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "e3597b3a-8053-4237-a8af-44ecb5f987f9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>actual</th>\n",
" <th>baseline</th>\n",
" <th>model2</th>\n",
" <th>model3</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>241</th>\n",
" <td>12</td>\n",
" <td>10.524887</td>\n",
" <td>10.219690</td>\n",
" <td>10.607736</td>\n",
" </tr>\n",
" <tr>\n",
" <th>235</th>\n",
" <td>10</td>\n",
" <td>10.524887</td>\n",
" <td>9.423179</td>\n",
" <td>9.199304</td>\n",
" </tr>\n",
" <tr>\n",
" <th>369</th>\n",
" <td>11</td>\n",
" <td>10.524887</td>\n",
" <td>12.013069</td>\n",
" <td>11.311952</td>\n",
" </tr>\n",
" <tr>\n",
" <th>217</th>\n",
" <td>8</td>\n",
" <td>10.524887</td>\n",
" <td>4.643292</td>\n",
" <td>7.086656</td>\n",
" </tr>\n",
" <tr>\n",
" <th>331</th>\n",
" <td>14</td>\n",
" <td>10.524887</td>\n",
" <td>13.636035</td>\n",
" <td>12.720384</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" actual baseline model2 model3\n",
"241 12 10.524887 10.219690 10.607736\n",
"235 10 10.524887 9.423179 9.199304\n",
"369 11 10.524887 12.013069 11.311952\n",
"217 8 10.524887 4.643292 7.086656\n",
"331 14 10.524887 13.636035 12.720384"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_validate['baseline'] = y_train.actual.mean()\n",
"y_validate['model2'] = model2.predict(X_validate)\n",
"y_validate['model3'] = model3.predict(X_validate)\n",
"\n",
"y_validate.head()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "7713bff9-8e48-45df-bb02-1f6940e069ab",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train RMSE baseline:\n",
"4.498925523895268\n",
"Train RMSE model2:\n",
"1.7503546500121143\n",
"Train RMSE model3:\n",
"2.396115747537386\n"
]
}
],
"source": [
"# Report the train RMSE for every set of predictions stored on y_train\n",
"for name in ['baseline', 'model2', 'model3']:\n",
"    print(f'Train RMSE {name}:')\n",
"    print(math.sqrt(mean_squared_error(y_train.actual, y_train[name])))"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "cc3c4807-562e-461a-9a87-04bf6ba4eb57",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Validate RMSE baseline:\n",
"4.578916932633144\n",
"Validate RMSE model2:\n",
"2.1264081323553436\n",
"Validate RMSE model3:\n",
"2.486601997892303\n"
]
}
],
"source": [
"# Report the validate RMSE for every set of predictions stored on y_validate\n",
"for name in ['baseline', 'model2', 'model3']:\n",
"    print(f'Validate RMSE {name}:')\n",
"    print(math.sqrt(mean_squared_error(y_validate.actual, y_validate[name])))"
]
},
{
"cell_type": "markdown",
"id": "e610cdb9-f161-426d-a8be-7e58de5b9bde",
"metadata": {},
"source": [
"Looks like model 2 is the winner! It has the lowest validate RMSE (2.13, vs. 2.49 for model 3 and 4.58 for the baseline)."
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "404209a5-d67d-4fe2-9da4-3d383f05b62b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.9229879901741005"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# RMSE of the chosen model on the held-out test set; sklearn's signature is (y_true, y_pred)\n",
"math.sqrt(mean_squared_error(y_test, model2.predict(X_test)))"
]
},
{
"cell_type": "markdown",
"id": "92048e1b-6d1d-49c3-9791-d5c35b955997",
"metadata": {},
"source": [
"^ this number tells us how we would expect our model to perform on future data, on future predictions."
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "94afb024-33d6-47f9-86e7-0b1b9af0d55a",
"metadata": {},
"outputs": [],
"source": [
"# data for a single student\n",
"unknown_student = X_test.iloc[0]"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "a5fb59e6-b2ee-44d1-9b53-9949d8cc4192",
"metadata": {},
"outputs": [],
"source": [
"# the data needs to be in the shape that the model expects, i.e.\n",
"# it needs to have the same number of columns as the training X"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "7211aefa-13c5-4a33-81ba-37d0e81d2867",
"metadata": {},
"outputs": [],
"source": [
"# reshape to (rows, cols): 1 row, and -1 tells numpy to infer the number of columns\n",
"student_X = unknown_student.values.reshape(1, -1)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "333434bc-d827-4e65-92dd-92a982544b71",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([7.4726451])"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model2.predict(student_X)"
]
},
{
"cell_type": "markdown",
"id": "ef2eb1d9-8f5a-4d66-9de8-5b416aeafc65",
"metadata": {},
"source": [
"-----------------"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "ca48eda4-1e2a-42a0-aff1-2941935bdfb9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['1', 'age', 'G1', 'G2', 'age^2', 'age G1', 'age G2', 'G1^2', 'G1 G2', 'G2^2']"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# different from our other models in that this just transforms the X\n",
"# *after* the polynomial transformation, run the transformed data through the same model(s)\n",
"X_train_poly = X_train[['age', 'G1', 'G2']]\n",
"\n",
"poly = PolynomialFeatures()\n",
"poly.fit(X_train_poly)\n",
"X_transformed = poly.transform(X_train_poly)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "e8768085-6955-4fb9-b049-3a35407b9619",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>1</th>\n",
" <th>age</th>\n",
" <th>G1</th>\n",
" <th>G2</th>\n",
" <th>age^2</th>\n",
" <th>age G1</th>\n",
" <th>age G2</th>\n",
" <th>G1^2</th>\n",
" <th>G1 G2</th>\n",
" <th>G2^2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.357143</td>\n",
" <td>0.578947</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.127551</td>\n",
" <td>0.206767</td>\n",
" <td>0.335180</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>0.333333</td>\n",
" <td>0.714286</td>\n",
" <td>0.789474</td>\n",
" <td>0.111111</td>\n",
" <td>0.238095</td>\n",
" <td>0.263158</td>\n",
" <td>0.510204</td>\n",
" <td>0.563910</td>\n",
" <td>0.623269</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.0</td>\n",
" <td>0.166667</td>\n",
" <td>0.500000</td>\n",
" <td>0.526316</td>\n",
" <td>0.027778</td>\n",
" <td>0.083333</td>\n",
" <td>0.087719</td>\n",
" <td>0.250000</td>\n",
" <td>0.263158</td>\n",
" <td>0.277008</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 1 age G1 G2 age^2 age G1 age G2 G1^2 \\\n",
"0 1.0 0.000000 0.357143 0.578947 0.000000 0.000000 0.000000 0.127551 \n",
"1 1.0 0.333333 0.714286 0.789474 0.111111 0.238095 0.263158 0.510204 \n",
"2 1.0 0.166667 0.500000 0.526316 0.027778 0.083333 0.087719 0.250000 \n",
"\n",
" G1 G2 G2^2 \n",
"0 0.206767 0.335180 \n",
"1 0.563910 0.623269 \n",
"2 0.263158 0.277008 "
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;\n",
"# use get_feature_names_out() when it exists and fall back on older versions\n",
"get_names = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names\n",
"pd.DataFrame(X_transformed, columns=get_names(X_train_poly.columns)).head(3)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "8a074fc0-7846-4525-bce2-3b33cc4bd870",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LinearRegression()"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model4 = LinearRegression()\n",
"model4.fit(X_transformed, y_train.actual)"
]
},
{
"cell_type": "code",
"execution_count": 74,
"id": "a79e6d55-d233-40a3-8b73-a32ef38ffc90",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.8184368425448687"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_train['model4'] = model4.predict(X_transformed)\n",
"\n",
"math.sqrt(mean_squared_error(y_train.actual, y_train.model4))"
]
},
{
"cell_type": "code",
"execution_count": 75,
"id": "7814dfe0-6b11-43ca-88db-763685e12b8a",
"metadata": {},
"outputs": [],
"source": [
"# when we are making predictions with a model, any predictors (i.e. new data)\n",
"# must go through the same transformations as the training data,\n",
"# including the polynomial transformation\n",
"\n",
"X_validate_poly = X_validate[['age', 'G1', 'G2']]\n",
"X_validate_transformed = poly.transform(X_validate_poly)\n",
"y_validate['model4'] = model4.predict(X_validate_transformed)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"id": "2b66b341-76e0-4945-b21d-98912e5c5aa7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2.231228440521658"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"math.sqrt(mean_squared_error(y_validate.actual, y_validate.model4))"
]
},
{
"cell_type": "markdown",
"id": "b06d8a55-2946-4061-9bc7-1d9f72859b2c",
"metadata": {},
"source": [
"Visualize Residuals\n",
"\n",
"residuals = actual - predicted"
]
},
{
"cell_type": "code",
"execution_count": 84,
"id": "8446adf5-7d2e-47de-9414-4045a6f7308b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot:ylabel='Frequency'>"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAX4AAAD4CAYAAADrRI2NAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAARxElEQVR4nO3de7BdZ13G8e9DW2xBsI1NawRCLHbAylhaD4hWEQg40EpTcIowXjJaiYzgwKgjERjgH2eKDiBeBgiCBixKuZRGLkqJIMOMLaRYoJhiAEOpjUmslwIylJaff+wVOD05J1nJybt3Tt7vZ2bPXutda2X95j07z1nn3euSqkKS1I/7zLoASdJ0GfyS1BmDX5I6Y/BLUmcMfknqzMmzLmCMM888s9atWzfrMiRpRbnxxhv/s6pWL2xfEcG/bt06duzYMesyJGlFSfKlxdod6pGkzhj8ktQZg1+SOmPwS1JnDH5J6ozBL0mdaRr8SU5P8s4ktyTZmeTHk6xKcl2SXcP7GS1rkCTdW+sj/tcCf1dVjwDOB3YCm4HtVXUusH2YlyRNSbPgT/JA4HHAmwCq6q6q+h9gA7B1WG0rcFmrGiRJB2t55e45wH7gL5KcD9wIvAA4u6r2AFTVniRnLbZxkk3AJoC1a9c2LFM6eus2v29m+9595SUz27dWtpZDPScDFwKvq6oLgK9xBMM6VbWlquaqam716oNuNSFJOkotg/824LaqumGYfyeTXwR7k6wBGN73NaxBkrRAs+Cvqv8Avpzk4UPTeuBfgG3AxqFtI3BtqxokSQdrfXfO3wSuSnJf4IvArzD5ZXN1kiuAW4HLG9cgSZqnafBX1U3A3CKL1rfcryRpaV65K0mdMfglqTMGvyR1xuCXpM4Y/JLUGYNfkjpj8EtSZwx+SeqMwS9JnTH4JakzBr8kdcbgl6TOGPyS1BmDX5I6Y/BLUmcMfknqjMEvSZ0x+CWpMwa/JHXG4Jekzhj8ktQZg1+SOmPwS1JnDH5J6ozBL0mdObnlP55kN/AV4B7g7qqaS7IKeDuwDtgNPLOq/rtlHZKk75jGEf8TqupRVTU3zG8GtlfVucD2YV6SNCWzGOrZAGwdprcCl82gBknqVuvgL+CDSW5MsmloO7uq9gAM72cttmGSTUl2JNmxf//+xmVKUj+ajvEDF1XV7UnOAq5LcsvYDatqC7AFYG5urloVKEm9aXrEX1W3D+/7gGuAxwB7k6wBGN73taxBknRvzYI/yf2TPODANPAzwM3ANmDjsNpG4NpWNUiSDtZyqOds4JokB/bztqr6uySfAK5OcgVwK3B5wxokSQs0C/6q+iJw/iLtdwDrW+1XknRoXrkrSZ0x+CWpMwa/JHXG4Jekzhj8ktQZg1+SOmPwS1JnDH5J6ozBL0mdMfglqTMGvyR1xuCXpM4Y/JLUGYNfkjpj8EtSZwx+SeqMwS9JnTH4JakzBr8kdcbgl6TOGPyS1BmDX5I6Y/BLUmcMfknqjMEvSZ0x+CWpM82DP8lJSf45yXuH+VVJrkuya3g/o3UNkqTvmMYR/wuAnfPmNwPbq+pcYPswL0makqbBn+TBwCXAn89r3gBsHaa3Ape1rEGSdG+tj/j/CPhd4Fvz2s6uqj0Aw/tZi22YZFOSHUl27N+/v3GZktSPZsGf5GeBfVV149FsX1VbqmququZWr159jKuTpH6d3PDfvgi4NMnFwKnAA5P8FbA3yZqq2pNkDbCvYQ2SpAWaHfFX1e9V1YOrah3wLOAfquoXgW3AxmG1jcC1rWqQJB1sFufxXwk8Ocku4MnDvCRpSloO9XxbVX0E+MgwfQewfhr7lSQdbNQRf5JHti5EkjQdY4d6Xp/k40l+I8npLQuSJLU1Kvir6ieBXwAeAuxI8rYkT25amSSpidFf7lbVLuClwIuAnwb+OMktSZ7RqjhJ0rE3doz/R5K8hsk9d54IPK2qfmiYfk3D+iRJx9jYs3r+FHgj8OKq+v
qBxqq6PclLm1QmSWpibPBfDHy9qu4BSHIf4NSq+r+qemuz6iRJx9zYMf4PAafNm7/f0CZJWmHGHvGfWlVfPTBTVV9Ncr9GNUk6jq3b/L6Z7Xv3lZfMbN8nkrFH/F9LcuGBmSQ/Cnz9EOtLko5TY4/4Xwi8I8ntw/wa4OebVCRJampU8FfVJ5I8Ang4EOCWqvpm08okSU0cyU3aHg2sG7a5IAlV9ZYmVUmSmhkV/EneCjwMuAm4Z2guwOCXpBVm7BH/HHBeVVXLYqSjNcszTaSVZuxZPTcD39eyEEnSdIw94j8T+JckHwe+caCxqi5tUpUkqZmxwf+KlkVIkqZn7Omc/5jkocC5VfWh4ardk9qWJklqYextmZ8DvBN4w9D0IOA9jWqSJDU09svd5wEXAXfCtx/KclaroiRJ7YwN/m9U1V0HZpKczOQ8fknSCjM2+P8xyYuB04Zn7b4D+Nt2ZUmSWhkb/JuB/cBngF8H3s/k+buSpBVm7Fk932Ly6MU3ti1HktTa2Hv1/BuLjOlX1TmH2OZU4KPAdw37eWdVvTzJKuDtTG74tht4ZlX99xFXLkk6Kkdyr54DTgUuB1YdZptvAE8cntZ1CvCxJB8AngFsr6ork2xmMoz0oiOsW5J0lEaN8VfVHfNe/15VfwQ88TDb1LzHNZ4yvArYAGwd2rcClx1N4ZKkozN2qOfCebP3YfIXwANGbHcScCPwg8CfVdUNSc6uqj0AVbUnyaLXAyTZBGwCWLt27Zgypa54R1IdrbFDPa+aN303w9j84TaqqnuARyU5HbgmySPHFlZVW4AtAHNzc14zIEnHyNizep6wnJ1U1f8k+QjwFGBvkjXD0f4aYN9y/m1J0pEZO9TzW4daXlWvXmSb1cA3h9A/DXgS8EpgG7ARuHJ4v/ZIi5YkHb0jOavn0UxCG+BpTE7V/PIhtlkDbB3G+e8DXF1V703yT8DVSa4AbmVyhpAkaUqO5EEsF1bVVwCSvAJ4R1X92lIbVNWngQsWab8DWH/kpUqSjoWxt2xYC9w1b/4uJhdgSZJWmLFH/G8FPp7kGibn4j8deEuzqiRJzYw9q+f3h6tuf2po+pWq+ud2ZUmSWhk71ANwP+DOqnotcFuSH2hUkySpobGPXnw5k/vp/N7QdArwV62KkiS1M/aI/+nApcDXAKrqdkbcskGSdPwZG/x3VVUx3Jo5yf3blSRJamls8F+d5A3A6UmeA3wIH8oiSSvSYc/qSRImD055BHAn8HDgZVV1XePaJEkNHDb4q6qSvKeqfhQw7CVphRs71HN9kkc3rUSSNBVjr9x9AvDcJLuZnNkTJn8M/EirwiRJbRwy+JOsrapbgadOqR5JUmOHO+J/D5O7cn4pybuq6uemUJMkqaHDjfFn3vQ5LQuRJE3H4YK/lpiWJK1QhxvqOT/JnUyO/E8bpuE7X+4+sGl1kqRj7pDBX1UnTasQSdJ0HMltmSVJJwCDX5I6Y/BLUmcMfknqjMEvSZ0x+CWpMwa/JHXG4JekzjQL/iQPSfLhJDuTfDbJC4b2VUmuS7JreD+jVQ2SpIO1POK/G/jtqvoh4LHA85KcB2wGtlfVucD2YV6SNCXNgr+q9lTVJ4fprwA7gQcBG4Ctw2pbgcta1SBJOtjYJ3AtS5J1wAXADcDZVbUHJr8ckpy1xDabgE0Aa9euPep9r9v8vqPedrl2X3nJzPYtSUtp/uVuku8G3gW8sKruPNz6B1TVlqqaq6q51atXtytQkjrTNPiTnMIk9K+qqncPzXuTrBmWrwH2taxBknRvLc/qCfAmYGdVvXreom3AxmF6I3BtqxokSQdrOcZ/EfBLwGeS3DS0vRi4Erg6yRXArcDlDWuQJC3QLPir6mPc+5m9861vtV9J0qF55a4kdcbgl6TOGPyS1BmDX5I6M5Urd9WHWV4lLWk8j/glqTMGvyR1xuCXpM4Y/JLUGYNfkjpj8EtSZwx+SeqMwS9JnTH4JakzXrkracWY1dXhJ9rzsz
3il6TOGPyS1BmDX5I6Y/BLUmcMfknqjMEvSZ0x+CWpMwa/JHXG4Jekzhj8ktSZZsGf5M1J9iW5eV7bqiTXJdk1vJ/Rav+SpMW1POL/S+ApC9o2A9ur6lxg+zAvSZqiZsFfVR8F/mtB8wZg6zC9Fbis1f4lSYub9hj/2VW1B2B4P2upFZNsSrIjyY79+/dPrUBJOtEdt1/uVtWWqpqrqrnVq1fPuhxJOmFMO/j3JlkDMLzvm/L+Jal70w7+bcDGYXojcO2U9y9J3Wv2BK4kfw08HjgzyW3Ay4ErgauTXAHcClzeav+SdKzM6slf0ObpX82Cv6qevcSi9a32KUk6vOP2y11JUhsGvyR1xuCXpM4Y/JLUGYNfkjpj8EtSZwx+SeqMwS9JnTH4JakzBr8kdcbgl6TOGPyS1BmDX5I6Y/BLUmcMfknqjMEvSZ0x+CWpMwa/JHXG4Jekzhj8ktQZg1+SOmPwS1JnDH5J6ozBL0mdMfglqTMnz7oAHXvrNr9v1iVIOo7N5Ig/yVOSfC7J55NsnkUNktSrqQd/kpOAPwOeCpwHPDvJedOuQ5J6NYsj/scAn6+qL1bVXcDfABtmUIckdWkWY/wPAr48b/424McWrpRkE7BpmP1qks8d5f7OBP7zKLddlrxy1Gozq28k61se61ue470+aFzjyBxZykMXa5xF8GeRtjqooWoLsGXZO0t2VNXccv+dVqxveaxveaxv+VZCjQvNYqjnNuAh8+YfDNw+gzokqUuzCP5PAOcm+YEk9wWeBWybQR2S1KWpD/VU1d1Jng/8PXAS8Oaq+mzDXS57uKgx61se61se61u+lVDjvaTqoOF1SdIJzFs2SFJnDH5J6swJE/yHuw1EJv54WP7pJBdOsbaHJPlwkp1JPpvkBYus8/gk/5vkpuH1smnVN+x/d5LPDPvescjyWfbfw+f1y01J7kzywgXrTLX/krw5yb4kN89rW5XkuiS7hvczlti2+S1LlqjvD5PcMvz8rkly+hLbHvKz0LC+VyT593k/w4uX2HZW/ff2ebXtTnLTEts2779lq6oV/2LyJfEXgHOA+wKfAs5bsM7FwAeYXEfwWOCGKda3BrhwmH4A8K+L1Pd44L0z7MPdwJmHWD6z/lvkZ/0fwENn2X/A44ALgZvntf0BsHmY3gy8con6D/lZbVjfzwAnD9OvXKy+MZ+FhvW9AvidET//mfTfguWvAl42q/5b7utEOeIfcxuIDcBbauJ64PQka6ZRXFXtqapPDtNfAXYyuYJ5JZlZ/y2wHvhCVX1pBvv+tqr6KPBfC5o3AFuH6a3AZYtsOpVblixWX1V9sKruHmavZ3INzUws0X9jzKz/DkgS4JnAXx/r/U7LiRL8i90GYmGwjlmnuSTrgAuAGxZZ/ONJPpXkA0l+eLqVUcAHk9w43C5joeOi/5hc97HUf7hZ9h/A2VW1Bya/7IGzFlnneOnHX2XyF9xiDvdZaOn5w1DUm5cYKjse+u+ngL1VtWuJ5bPsv1FOlOAfcxuIUbeKaCnJdwPvAl5YVXcuWPxJJsMX5wN/ArxnmrUBF1XVhUzumvq8JI9bsPx46L/7ApcC71hk8az7b6zjoR9fAtwNXLXEKof7LLTyOuBhwKOAPUyGUxaaef8Bz+bQR/uz6r/RTpTgH3MbiJneKiLJKUxC/6qqevfC5VV1Z1V9dZh+P3BKkjOnVV9V3T687wOuYfIn9XzHw602ngp8sqr2Llww6/4b7D0w/DW871tknVl/DjcCPwv8Qg0D0guN+Cw0UVV7q+qeqvoW8MYl9jvr/jsZeAbw9qXWmVX/HYkTJfjH3AZiG/DLw9kpjwX+98Cf5a0NY4JvAnZW1auXWOf7hvVI8hgmP5s7plTf/ZM84MA0ky8Bb16w2sz6b54lj7Rm2X/zbAM2DtMbgWsXWWdmtyxJ8hTgRcClVfV/S6wz5rPQqr753xk9fYn9zvqWL08Cbqmq2xZbOMv+OyKz/nb5WL2YnHXyr0y+8X
/J0PZc4LnDdJg8AOYLwGeAuSnW9pNM/hz9NHDT8Lp4QX3PBz7L5CyF64GfmGJ95wz7/dRQw3HVf8P+78ckyL9nXtvM+o/JL6A9wDeZHIVeAXwvsB3YNbyvGtb9fuD9h/qsTqm+zzMZHz/wGXz9wvqW+ixMqb63Dp+tTzMJ8zXHU/8N7X954DM3b92p999yX96yQZI6c6IM9UiSRjL4JakzBr8kdcbgl6TOGPyS1BmDX5I6Y/BLUmf+Hyw192B/Je+zAAAAAElFTkSuQmCC\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"y_train.actual.plot.hist()"
]
},
{
"cell_type": "code",
"execution_count": 90,
"id": "14c9482b-5319-41d3-a5b0-ad1481d95886",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 936x504 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# plt.scatter(y_train.actual, y_train.actual - y_train.baseline, label='baseline')\n",
"plt.figure(figsize=(13, 7))\n",
"plt.scatter(y_train.actual, y_train.actual - y_train.model2, label='model2 (LinearRegression)', alpha=.6)\n",
"plt.scatter(y_train.actual, y_train.actual - y_train.model4, label='model4 (Polynomial)', alpha=.6)\n",
"plt.hlines(0, 0, 20, ls=':', label='perfect prediction', color='black')\n",
"plt.ylabel('Residual')\n",
"plt.xlabel('Actual')\n",
"plt.legend()\n",
"plt.grid()"
]
},
{
"cell_type": "code",
"execution_count": 92,
"id": "bd2a7211-8765-422e-9dbd-7e6d6d3518b8",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 936x504 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# plt.scatter(y_train.actual, y_train.actual - y_train.baseline, label='baseline')\n",
"plt.figure(figsize=(13, 7))\n",
"plt.scatter(y_train.actual, y_train.model2, label='model2 (LinearRegression)', alpha=.6)\n",
"plt.scatter(y_train.actual, y_train.model4, label='model4 (Polynomial)', alpha=.6)\n",
"plt.ylabel('Predicted Value')\n",
"plt.plot([0, 20], [0, 20], ls=':', c='black', label='perfect prediction')\n",
"plt.xlabel('Actual')\n",
"plt.legend()\n",
"plt.grid()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment