Last active
November 10, 2017 16:29
-
-
Save MattMcMurray/0809b174c49c4f962fde80d18d1be737 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Linear Regression" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 261, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)" | |
] | |
}, | |
"execution_count": 261, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from sklearn.linear_model import LinearRegression\n", | |
"lin_reg = LinearRegression()\n", | |
"lin_reg.fit(prepared_data, labels)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 262, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Predictions: [ 0.26903984 0.19360012 0.19776904 0.23113952 0.17055647]\n", | |
"Labels: 44864 1\n", | |
"24799 1\n", | |
"108547 1\n", | |
"24979 1\n", | |
"10221 0\n", | |
"Name: No-show, dtype: int64\n" | |
] | |
} | |
], | |
"source": [ | |
"some_data_pt = data.iloc[:5]\n", | |
"some_data_label = labels.iloc[:5]\n", | |
"some_data_pt_prepared = full_pipeline.transform(some_data_pt)\n", | |
"print('Predictions:', lin_reg.predict(some_data_pt_prepared))\n", | |
"print('Labels:', some_data_label)" | |
] | |
}, | |
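{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"**Not even close...** The predictions are continuous values between roughly 0.17 and 0.27, while the labels are binary (0 or 1)." | |
] | |
}, | |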
{ | |
"cell_type": "code", | |
"execution_count": 263, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.39229643378648837" | |
] | |
}, | |
"execution_count": 263, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from sklearn.metrics import mean_squared_error\n", | |
"predictions = lin_reg.predict(prepared_data)\n", | |
"lin_mse = mean_squared_error(labels, predictions)\n", | |
"lin_rmse = np.sqrt(lin_mse)\n", | |
"lin_rmse" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# DecisionTreeRegressor" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 264, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,\n", | |
" max_leaf_nodes=None, min_impurity_decrease=0.0,\n", | |
" min_impurity_split=None, min_samples_leaf=1,\n", | |
" min_samples_split=2, min_weight_fraction_leaf=0.0,\n", | |
" presort=False, random_state=42, splitter='best')" | |
] | |
}, | |
"execution_count": 264, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from sklearn.tree import DecisionTreeRegressor\n", | |
"\n", | |
"dec_tree = DecisionTreeRegressor(random_state=42)\n", | |
"dec_tree.fit(prepared_data, labels)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 265, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.2968375571401668" | |
] | |
}, | |
"execution_count": 265, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"tree_predictions = dec_tree.predict(prepared_data)\n", | |
"tree_mse = mean_squared_error(labels, tree_predictions)\n", | |
"tree_rmse = np.sqrt(tree_mse)\n", | |
"tree_rmse" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Better, but still not great. Note that this RMSE is measured on the same data the tree was fitted on." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# RandomForestRegressor" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 266, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.31167539452639936" | |
] | |
}, | |
"execution_count": 266, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from sklearn.ensemble import RandomForestRegressor\n", | |
"\n", | |
"forest_reg = RandomForestRegressor()\n", | |
"forest_reg.fit(prepared_data, labels)\n", | |
"forest_preds = forest_reg.predict(prepared_data)\n", | |
"forest_mse = mean_squared_error(labels, forest_preds)\n", | |
"forest_rmse = np.sqrt(forest_mse)\n", | |
"forest_rmse" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Worse (by training-set RMSE). Let's go back to decision trees and see if we can fine-tune that model." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 283, | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"RandomizedSearchCV(cv=None, error_score='raise',\n", | |
" estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,\n", | |
" max_leaf_nodes=None, min_impurity_decrease=0.0,\n", | |
" min_impurity_split=None, min_samples_leaf=1,\n", | |
" min_samples_split=2, min_weight_fraction_leaf=0.0,\n", | |
" presort=False, random_state=42, splitter='best'),\n", | |
" fit_params=None, iid=True, n_iter=1000, n_jobs=1,\n", | |
" param_distributions={'splitter': ['best', 'random'], 'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f8c884dae10>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f8c884da4a8>},\n", | |
" pre_dispatch='2*n_jobs', random_state=None, refit=True,\n", | |
" return_train_score='warn', scoring='neg_mean_squared_error',\n", | |
" verbose=0)" | |
] | |
}, | |
"execution_count": 283, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from sklearn.model_selection import RandomizedSearchCV\n", | |
"from scipy.stats import randint as sp_randint\n", | |
"\n", | |
"params = {\n", | |
" 'max_depth': sp_randint(10, 100000),\n", | |
" 'max_features': sp_randint(1, 10),\n", | |
" 'splitter': ['best', 'random']\n", | |
"}\n", | |
"\n", | |
"dec_tree = DecisionTreeRegressor(random_state=42)\n", | |
"rand_search = RandomizedSearchCV(dec_tree, param_distributions=params,\n", | |
" n_iter=1000, \n", | |
" scoring='neg_mean_squared_error')\n", | |
"\n", | |
"rand_search.fit(prepared_data, labels)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 284, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"{'max_depth': 21, 'max_features': 2, 'splitter': 'best'}" | |
] | |
}, | |
"execution_count": 284, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"rand_search.best_params_" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 285, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"final_model = rand_search.best_estimator_" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment