MattMcMurray/model_selection.ipynb

## model_selection.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Linear Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 261,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)"
      ]
     },
     "execution_count": 261,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.linear_model import LinearRegression\n",
    "lin_reg = LinearRegression()\n",
    "lin_reg.fit(prepared_data, labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 262,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Predictions: [ 0.26903984  0.19360012  0.19776904  0.23113952  0.17055647]\n",
      "Labels: 44864     1\n",
      "24799     1\n",
      "108547    1\n",
      "24979     1\n",
      "10221     0\n",
      "Name: No-show, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "some_data_pt = data.iloc[:5]\n",
    "some_data_label = labels.iloc[:5]\n",
    "some_data_pt_prepared = full_pipeline.transform(some_data_pt)\n",
    "print('Predictions:', lin_reg.predict(some_data_pt_prepared))\n",
    "print('Labels:', some_data_label)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Not even close...**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 263,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.39229643378648837"
      ]
     },
     "execution_count": 263,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.metrics import mean_squared_error\n",
    "predictions = lin_reg.predict(prepared_data)\n",
    "lin_mse = mean_squared_error(labels, predictions)\n",
    "lin_rmse = np.sqrt(lin_mse)\n",
    "lin_rmse"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DecisionTreeRegressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 264,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,\n",
       "           max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
       "           min_impurity_split=None, min_samples_leaf=1,\n",
       "           min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
       "           presort=False, random_state=42, splitter='best')"
      ]
     },
     "execution_count": 264,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.tree import DecisionTreeRegressor\n",
    "\n",
    "dec_tree = DecisionTreeRegressor(random_state=42)\n",
    "dec_tree.fit(prepared_data, labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 265,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.2968375571401668"
      ]
     },
     "execution_count": 265,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tree_predictions = dec_tree.predict(prepared_data)\n",
    "tree_mse = mean_squared_error(labels, tree_predictions)\n",
    "tree_rmse = np.sqrt(tree_mse)\n",
    "tree_rmse"
   ]
  },
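  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Better, but still not great. Keep in mind that this RMSE is computed on the same data the tree was fit on, so it is an optimistic estimate."
   ]
  },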
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# RandomForestRegressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 266,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.31167539452639936"
      ]
     },
     "execution_count": 266,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestRegressor\n",
    "\n",
    "# Seed the forest (same seed as the decision tree above) so the reported\n",
    "# RMSE is reproducible across kernel restarts.\n",
    "forest_reg = RandomForestRegressor(random_state=42)\n",
    "forest_reg.fit(prepared_data, labels)\n",
    "\n",
    "# RMSE on the data the forest was fit on, i.e. an in-sample error.\n",
    "forest_preds = forest_reg.predict(prepared_data)\n",
    "forest_mse = mean_squared_error(labels, forest_preds)\n",
    "forest_rmse = np.sqrt(forest_mse)\n",
    "forest_rmse"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Worse than the single decision tree on the training data. Let's go back to Decision Trees and see if we can fine-tune that model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 283,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomizedSearchCV(cv=None, error_score='raise',\n",
       "          estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,\n",
       "           max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
       "           min_impurity_split=None, min_samples_leaf=1,\n",
       "           min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
       "           presort=False, random_state=42, splitter='best'),\n",
       "          fit_params=None, iid=True, n_iter=1000, n_jobs=1,\n",
       "          param_distributions={'splitter': ['best', 'random'], 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f8c884dae10>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f8c884da4a8>},\n",
       "          pre_dispatch='2*n_jobs', random_state=None, refit=True,\n",
       "          return_train_score='warn', scoring='neg_mean_squared_error',\n",
       "          verbose=0)"
      ]
     },
     "execution_count": 283,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "from scipy.stats import randint as sp_randint\n",
    "\n",
    "# Search space: sp_randint(low, high) draws integers from [low, high).\n",
    "params = {\n",
    "    'max_depth': sp_randint(10, 100000),\n",
    "    'max_features': sp_randint(1, 10),\n",
    "    'splitter': ['best', 'random']\n",
    "}\n",
    "\n",
    "# NOTE: this rebinds `dec_tree` to a fresh estimator; the tree fitted\n",
    "# earlier is no longer reachable under that name after this cell runs.\n",
    "dec_tree = DecisionTreeRegressor(random_state=42)\n",
    "\n",
    "# random_state seeds the sampling of candidate parameter settings so the\n",
    "# 1000 candidates, and therefore best_params_, are reproducible.\n",
    "rand_search = RandomizedSearchCV(dec_tree, param_distributions=params,\n",
    "                                 n_iter=1000,\n",
    "                                 scoring='neg_mean_squared_error',\n",
    "                                 random_state=42)\n",
    "\n",
    "rand_search.fit(prepared_data, labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 284,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'max_depth': 21, 'max_features': 2, 'splitter': 'best'}"
      ]
     },
     "execution_count": 284,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rand_search.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 285,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_model = rand_search.best_estimator_"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Linear Regression"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 261,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)"
	]
	},
	"execution_count": 261,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from sklearn.linear_model import LinearRegression\n",
	"lin_reg = LinearRegression()\n",
	"lin_reg.fit(prepared_data, labels)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 262,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Predictions: [ 0.26903984 0.19360012 0.19776904 0.23113952 0.17055647]\n",
	"Labels: 44864 1\n",
	"24799 1\n",
	"108547 1\n",
	"24979 1\n",
	"10221 0\n",
	"Name: No-show, dtype: int64\n"
	]
	}
	],
	"source": [
	"some_data_pt = data.iloc[:5]\n",
	"some_data_label = labels.iloc[:5]\n",
	"some_data_pt_prepared = full_pipeline.transform(some_data_pt)\n",
	"print('Predictions:', lin_reg.predict(some_data_pt_prepared))\n",
	"print('Labels:', some_data_label)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"**Not even close...**"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 263,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.39229643378648837"
	]
	},
	"execution_count": 263,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from sklearn.metrics import mean_squared_error\n",
	"predictions = lin_reg.predict(prepared_data)\n",
	"lin_mse = mean_squared_error(labels, predictions)\n",
	"lin_rmse = np.sqrt(lin_mse)\n",
	"lin_rmse"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# DecisionTreeRegressor"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 264,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,\n",
	" max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
	" min_impurity_split=None, min_samples_leaf=1,\n",
	" min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
	" presort=False, random_state=42, splitter='best')"
	]
	},
	"execution_count": 264,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from sklearn.tree import DecisionTreeRegressor\n",
	"\n",
	"dec_tree = DecisionTreeRegressor(random_state=42)\n",
	"dec_tree.fit(prepared_data, labels)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 265,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.2968375571401668"
	]
	},
	"execution_count": 265,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"tree_predictions = dec_tree.predict(prepared_data)\n",
	"tree_mse = mean_squared_error(labels, tree_predictions)\n",
	"tree_rmse = np.sqrt(tree_mse)\n",
	"tree_rmse"
	]
	},
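	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Better, but still not great. Keep in mind that this RMSE is computed on the same data the tree was fit on, so it is an optimistic estimate."
	]
	},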
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# RandomForestRegressor"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 266,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.31167539452639936"
	]
	},
	"execution_count": 266,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from sklearn.ensemble import RandomForestRegressor\n",
	"\n",
	"# Seed the forest (same seed as the decision tree above) so the reported\n",
	"# RMSE is reproducible across kernel restarts.\n",
	"forest_reg = RandomForestRegressor(random_state=42)\n",
	"forest_reg.fit(prepared_data, labels)\n",
	"\n",
	"# RMSE on the data the forest was fit on, i.e. an in-sample error.\n",
	"forest_preds = forest_reg.predict(prepared_data)\n",
	"forest_mse = mean_squared_error(labels, forest_preds)\n",
	"forest_rmse = np.sqrt(forest_mse)\n",
	"forest_rmse"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Worse than the single decision tree on the training data. Let's go back to Decision Trees and see if we can fine-tune that model."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 283,
	"metadata": {
	"scrolled": true
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"RandomizedSearchCV(cv=None, error_score='raise',\n",
	" estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,\n",
	" max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
	" min_impurity_split=None, min_samples_leaf=1,\n",
	" min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
	" presort=False, random_state=42, splitter='best'),\n",
	" fit_params=None, iid=True, n_iter=1000, n_jobs=1,\n",
	" param_distributions={'splitter': ['best', 'random'], 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f8c884dae10>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f8c884da4a8>},\n",
	" pre_dispatch='2*n_jobs', random_state=None, refit=True,\n",
	" return_train_score='warn', scoring='neg_mean_squared_error',\n",
	" verbose=0)"
	]
	},
	"execution_count": 283,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"from sklearn.model_selection import RandomizedSearchCV\n",
	"from scipy.stats import randint as sp_randint\n",
	"\n",
	"# Search space: sp_randint(low, high) draws integers from [low, high).\n",
	"params = {\n",
	"    'max_depth': sp_randint(10, 100000),\n",
	"    'max_features': sp_randint(1, 10),\n",
	"    'splitter': ['best', 'random']\n",
	"}\n",
	"\n",
	"# NOTE: this rebinds `dec_tree` to a fresh estimator; the tree fitted\n",
	"# earlier is no longer reachable under that name after this cell runs.\n",
	"dec_tree = DecisionTreeRegressor(random_state=42)\n",
	"\n",
	"# random_state seeds the sampling of candidate parameter settings so the\n",
	"# 1000 candidates, and therefore best_params_, are reproducible.\n",
	"rand_search = RandomizedSearchCV(dec_tree, param_distributions=params,\n",
	"                                 n_iter=1000,\n",
	"                                 scoring='neg_mean_squared_error',\n",
	"                                 random_state=42)\n",
	"\n",
	"rand_search.fit(prepared_data, labels)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 284,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"{'max_depth': 21, 'max_features': 2, 'splitter': 'best'}"
	]
	},
	"execution_count": 284,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"rand_search.best_params_"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 285,
	"metadata": {},
	"outputs": [],
	"source": [
	"final_model = rand_search.best_estimator_"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}