Regression Tree: CV RMSE vs Training RMSE
-----
# Import cross_val_score
from sklearn.model_selection import cross_val_score

# Compute the array containing the 10-fold CV MSEs
# (dt, X_train, y_train are assumed defined earlier: a DecisionTreeRegressor and the training data)
MSE_CV_scores = - cross_val_score(dt, X_train, y_train, cv=10,
                                  scoring='neg_mean_squared_error',
                                  n_jobs=-1)
# Compute the 10-fold CV RMSE
RMSE_CV = (MSE_CV_scores.mean())**(1/2)
# Print RMSE_CV
print('CV RMSE: {:.2f}'.format(RMSE_CV))
--------------
# Import mean_squared_error from sklearn.metrics as MSE
from sklearn.metrics import mean_squared_error as MSE
# Fit dt to the training set
dt.fit(X_train, y_train)
# Predict the labels of the training set
y_pred_train = dt.predict(X_train)
# Evaluate the training set RMSE of dt
RMSE_train = (MSE(y_train, y_pred_train))**(1/2)
# Print RMSE_train
print('Train RMSE: {:.2f}'.format(RMSE_train))
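These two numbers exist to be compared: if the CV RMSE is noticeably larger than the training RMSE, dt overfits (high variance); if both are large, it underfits (high bias). A minimal sketch of that check (the 1.2 threshold is illustrative, not from the original):

# Compare CV error to training error to diagnose variance
if RMSE_CV > 1.2 * RMSE_train:  # illustrative threshold
    print('dt likely overfits the training set (high variance)')
else:
    print('no strong sign of overfitting')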
----------------------------------
Ensemble Learning
------------------------
# Import the models and accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Set seed for reproducibility
SEED = 1
# Instantiate lr
lr = LogisticRegression(random_state=SEED)
# Instantiate knn
knn = KNN(n_neighbors=27)
# Instantiate dt
dt = DecisionTreeClassifier(min_samples_leaf=0.13, random_state=SEED)
# Define the list classifiers
classifiers = [('Logistic Regression', lr), ('K Nearest Neighbours', knn), ('Classification Tree', dt)]
# Iterate over the pre-defined list of classifiers
for clf_name, clf in classifiers:
    # Fit clf to the training set
    clf.fit(X_train, y_train)
    # Predict test set labels
    y_pred = clf.predict(X_test)
    # Evaluate clf's accuracy on the test set
    accuracy = accuracy_score(y_test, y_pred)
    print('{:s} : {:.3f}'.format(clf_name, accuracy))
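The natural next step with this list is to combine the three models into one ensemble. A minimal sketch using scikit-learn's VotingClassifier (hard majority voting; the combination step is an assumption, not shown in the original snippet):

# Import VotingClassifier
from sklearn.ensemble import VotingClassifier
# Instantiate vc with the same (name, estimator) tuples
vc = VotingClassifier(estimators=classifiers)
# Fit vc to the training set and evaluate it like the individual models
vc.fit(X_train, y_train)
y_pred = vc.predict(X_test)
print('Voting Classifier: {:.3f}'.format(accuracy_score(y_test, y_pred)))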
---------
Bagging
---------
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
# Import BaggingClassifier
from sklearn.ensemble import BaggingClassifier
# Instantiate dt
dt = DecisionTreeClassifier(random_state=1)
# Instantiate bc
bc = BaggingClassifier(base_estimator=dt, n_estimators=50, random_state=1)
# Fit bc to the training set
bc.fit(X_train, y_train)
# Predict test set labels
y_pred = bc.predict(X_test)
# Evaluate acc_test
acc_test = accuracy_score(y_test, y_pred)
print('Test set accuracy of bc: {:.2f}'.format(acc_test))
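For context, a quick comparison against the single tree that bc bags (this comparison is not part of the original snippet; BaggingClassifier clones dt internally, so fitting dt here is safe):

# Fit the single tree and evaluate it for comparison
dt.fit(X_train, y_train)
acc_dt = accuracy_score(y_test, dt.predict(X_test))
print('Test set accuracy of dt: {:.2f}'.format(acc_dt))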
-----
OOB (Out of Bag) Evaluation
----
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
# Import BaggingClassifier
from sklearn.ensemble import BaggingClassifier
# Instantiate dt
dt = DecisionTreeClassifier(min_samples_leaf=8, random_state=1)
# Instantiate bc
bc = BaggingClassifier(base_estimator=dt,
                       n_estimators=50,
                       oob_score=True,
                       random_state=1)
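The snippet stops at instantiation; to get the OOB estimate you fit bc and read its oob_score_ attribute, then compare it with the held-out test accuracy (this completion follows the pattern of the Bagging section above):

# Fit bc to the training set
bc.fit(X_train, y_train)
# Predict test set labels
y_pred = bc.predict(X_test)
# Evaluate test set accuracy
acc_test = accuracy_score(y_test, y_pred)
# Extract the OOB accuracy from bc
acc_oob = bc.oob_score_
# Print acc_test and acc_oob
print('Test set accuracy: {:.3f}, OOB accuracy: {:.3f}'.format(acc_test, acc_oob))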
-----------
Random Forest Regressor
-----
# Import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
# Instantiate rf
rf = RandomForestRegressor(n_estimators=25,
                           random_state=2)
# Fit rf to the training set
rf.fit(X_train, y_train)
# Import mean_squared_error as MSE
from sklearn.metrics import mean_squared_error as MSE
# Predict the test set labels
y_pred = rf.predict(X_test)
# Evaluate the test set RMSE
rmse_test = MSE(y_test, y_pred)**(1/2)
# Print rmse_test
print('Test set RMSE of rf: {:.2f}'.format(rmse_test))
# Import pandas and matplotlib for the importances plot
import pandas as pd
import matplotlib.pyplot as plt

# Create a pd.Series of feature importances
importances = pd.Series(data=rf.feature_importances_,
                        index=X_train.columns)
# Sort importances
importances_sorted = importances.sort_values()
# Draw a horizontal barplot of importances_sorted
importances_sorted.plot(kind='barh', color='lightgreen')
plt.title('Feature Importances')
plt.show()
----------
Boosting
---------
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
# Import AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
# Instantiate dt
dt = DecisionTreeClassifier(max_depth=2, random_state=1)
# Instantiate ada
ada = AdaBoostClassifier(base_estimator=dt, n_estimators=180, random_state=1)
# Fit ada to the training set
ada.fit(X_train, y_train)
# Compute the probabilities of obtaining the positive class
y_pred_proba = ada.predict_proba(X_test)[:,1]
# Import roc_auc_score
from sklearn.metrics import roc_auc_score
# Evaluate test-set roc_auc_score
ada_roc_auc = roc_auc_score(y_test, y_pred_proba)
# Print roc_auc_score
print('ROC AUC score: {:.2f}'.format(ada_roc_auc))
-----
Gradient Boosting
---
# Import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
# Instantiate gb
gb = GradientBoostingRegressor(max_depth=4,
                               n_estimators=200,
                               random_state=2)
# Fit gb to the training set
gb.fit(X_train, y_train)
# Predict test set labels
y_pred = gb.predict(X_test)
# Import mean_squared_error as MSE
from sklearn.metrics import mean_squared_error as MSE
# Compute MSE
mse_test = MSE(y_test, y_pred)
# Compute RMSE
rmse_test = mse_test**(1/2)
# Print RMSE
print('Test set RMSE of gb: {:.3f}'.format(rmse_test))
-------
Stochastic Gradient Boosting
-------
# Import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
# Instantiate sgbr
sgbr = GradientBoostingRegressor(max_depth=4,
                                 subsample=0.9,      # each tree trains on 90% of the rows
                                 max_features=0.75,  # 75% of features considered per split
                                 n_estimators=200,
                                 random_state=2)
# Fit sgbr to the training set
sgbr.fit(X_train, y_train)
# Predict test set labels
y_pred = sgbr.predict(X_test)
# Import mean_squared_error as MSE
from sklearn.metrics import mean_squared_error as MSE
# Compute test set MSE
mse_test = MSE(y_test, y_pred)
# Compute test set RMSE
rmse_test = mse_test**(1/2)
# Print rmse_test
print('Test set RMSE of sgbr: {:.3f}'.format(rmse_test))
-----
Random Forest Hyperparameter Tuning
----
rf.get_params() ---> returns a dictionary of rf's hyperparameters and their current values
# Define the dictionary 'params_rf'
params_rf = {
    'n_estimators': [100, 350, 500],
    'max_features': ['log2', 'auto', 'sqrt'],
    'min_samples_leaf': [2, 10, 30],
}

# Import GridSearchCV
from sklearn.model_selection import GridSearchCV
# Instantiate grid_rf
grid_rf = GridSearchCV(estimator=rf,
                       param_grid=params_rf,
                       scoring='neg_mean_squared_error',
                       cv=3,
                       verbose=1,
                       n_jobs=-1)
# Fit grid_rf to the training set (required before best_estimator_ is available)
grid_rf.fit(X_train, y_train)

# Import mean_squared_error from sklearn.metrics as MSE
from sklearn.metrics import mean_squared_error as MSE
# Extract the best estimator
best_model = grid_rf.best_estimator_
# Predict test set labels
y_pred = best_model.predict(X_test)
# Compute rmse_test
rmse_test = MSE(y_test, y_pred)**(1/2)
# Print rmse_test
print('Test RMSE of best model: {:.3f}'.format(rmse_test))
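It is also worth inspecting which hyperparameter combination won; after fitting, grid_rf exposes it via best_params_:

# Print the best hyperparameters found by the grid search
print('Best hyperparameters:\n', grid_rf.best_params_)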