----
Getting Started: XGBoost Classification
----
# Import xgboost and supporting libraries
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Create arrays for the features and the target: X, y
X, y = churn_data.iloc[:,:-1], churn_data.iloc[:,-1]
# Create the training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
# Instantiate the XGBClassifier: xg_cl
xg_cl = xgb.XGBClassifier(objective='binary:logistic', n_estimators=10, seed=123)
# Fit the classifier to the training set
xg_cl.fit(X_train,y_train)
# Predict the labels of the test set: preds
preds = xg_cl.predict(X_test)
# Compute the accuracy: accuracy
accuracy = float(np.sum(preds==y_test))/y_test.shape[0]
print("accuracy: %f" % (accuracy))
----
XGBOOST Cross Validation
-----
# Create arrays for the features and the target: X, y
X, y = churn_data.iloc[:,:-1], churn_data.iloc[:,-1]
# Create the DMatrix from X and y: churn_dmatrix
churn_dmatrix = xgb.DMatrix(data=X, label=y)
# Create the parameter dictionary: params
params = {"objective":"reg:logistic", "max_depth":3}
# Perform cross-validation: cv_results
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params,
                    nfold=3, num_boost_round=5,
                    metrics="error", as_pandas=True, seed=123)
# Print cv_results
print(cv_results)
# Print the accuracy
print(((1-cv_results["test-error-mean"]).iloc[-1]))
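Since "error" is the fraction of misclassified examples, 1 - test-error-mean in the final boosting round is the cross-validated accuracy.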
-----
AUC CURVE
-----
# Perform cross-validation: cv_results
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params,
                    nfold=3, num_boost_round=5,
                    metrics="auc", as_pandas=True, seed=123)
# Print cv_results
print(cv_results)
# Print the AUC
print((cv_results["test-auc-mean"]).iloc[-1])
--------
XGBoost Regression
-------
# Import the evaluation metric
from sklearn.metrics import mean_squared_error
# Create the training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
# Instantiate the XGBRegressor: xg_reg
# (note: newer XGBoost releases name this objective "reg:squarederror")
xg_reg = xgb.XGBRegressor(objective="reg:linear", n_estimators=10, seed=123)
# Fit the regressor to the training set
xg_reg.fit(X_train, y_train)
# Predict the values of the test set: preds
preds = xg_reg.predict(X_test)
# Compute the rmse: rmse
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))
----
XGBoost Linear Base Learner (gblinear): an uncommon booster, so use the learning API with DMatrix rather than the scikit-learn wrapper
----
# Convert the training and testing sets into DMatrixes: DM_train, DM_test
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)
# Create the parameter dictionary: params
params = {"booster":"gblinear", "objective":"reg:linear"}
# Train the model: xg_reg
xg_reg = xgb.train(params = params, dtrain=DM_train, num_boost_round=5)
# Predict the values of the test set: preds
preds = xg_reg.predict(DM_test)
# Compute and print the RMSE
rmse = np.sqrt(mean_squared_error(y_test,preds))
print("RMSE: %f" % (rmse))
-----
Cross-Validation Folds with XGBoost
----
# Create the DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X,label=y)
# Create the parameter dictionary: params
params = {"objective":"reg:linear", "max_depth":4}
# Perform cross-validation: cv_results
cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=4, num_boost_round=5, metrics="mae", as_pandas=True, seed=123)
# Print cv_results
print(cv_results)
# Extract and print the final boosting round metric
print((cv_results["test-mae-mean"]).tail(1))
----
Regularization
-----
# Create the DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)
reg_params = [1, 10, 100]
# Create the initial parameter dictionary for varying l2 strength: params
params = {"objective":"reg:linear","max_depth":3}
# Create an empty list for storing rmses as a function of l2 complexity
rmses_l2 = []
# Iterate over reg_params
for reg in reg_params:
    # Update l2 strength
    params["lambda"] = reg
    # Pass this updated param dictionary into cv
    cv_results_rmse = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=2,
                             num_boost_round=5, metrics="rmse", as_pandas=True, seed=123)
    # Append best rmse (final round) to rmses_l2
    rmses_l2.append(cv_results_rmse["test-rmse-mean"].tail(1).values[0])
# Look at best rmse per l2 param
print("Best rmse as a function of l2:")
print(pd.DataFrame(list(zip(reg_params, rmses_l2)), columns=["l2","rmse"]))
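Here "lambda" is the L2 penalty on leaf weights; XGBoost also exposes "alpha" (the L1 counterpart) and "gamma" (the minimum loss reduction required to make a split), which can be swept in exactly the same way.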
-----
Visualizing Feature Importances
-----
# Create the DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)
# Create the parameter dictionary: params
params = {"objective":"reg:linear", "max_depth":4}
# Train the model: xg_reg
xg_reg = xgb.train(params=params, dtrain=housing_dmatrix, num_boost_round=10)
# Plot the feature importances
import matplotlib.pyplot as plt
xgb.plot_importance(xg_reg)
plt.show()
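By default plot_importance ranks features by "weight" (how many times each feature is used to split); passing a different importance_type changes the ranking criterion:
xgb.plot_importance(xg_reg, importance_type="gain")  # rank by average gain of splits using the feature
plt.show()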
-----
Tuning the Number of Boosting Rounds
------
# Create the DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)
# Create the parameter dictionary for each tree: params
params = {"objective":"reg:linear", "max_depth":3}
# Create list of number of boosting rounds
num_rounds = [5, 10, 15]
# Empty list to store final round rmse per XGBoost model
final_rmse_per_round = []
# Iterate over num_rounds and build one model per num_boost_round parameter
for curr_num_rounds in num_rounds:
    # Perform cross-validation: cv_results
    cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=3,
                        num_boost_round=curr_num_rounds, metrics="rmse", as_pandas=True, seed=123)
    # Append final round RMSE
    final_rmse_per_round.append(cv_results["test-rmse-mean"].tail(1).values[0])
# Print the resultant DataFrame
num_rounds_rmses = list(zip(num_rounds, final_rmse_per_round))
print(pd.DataFrame(num_rounds_rmses, columns=["num_boosting_rounds", "rmse"]))
-------
Automated boosting round selection using early_stopping
-------
# Create your housing DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X,label=y)
# Create the parameter dictionary for each tree: params
params = {"objective":"reg:linear", "max_depth":4}
# Perform cross-validation with early stopping: cv_results
cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=3, num_boost_round=50, early_stopping_rounds=10, metrics="rmse", as_pandas=True, seed=123)
# Print cv_results
print(cv_results)
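With early_stopping_rounds=10, xgb.cv keeps boosting only while the held-out RMSE keeps improving; if it fails to improve for 10 consecutive rounds, boosting stops early, so cv_results can contain fewer than 50 rows.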
-------
Tuning eta
-----
# Create your housing DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)
# Create the parameter dictionary for each tree (boosting round)
params = {"objective":"reg:linear", "max_depth":3}
# Create list of eta values and empty list to store final round rmse per xgboost model
eta_vals = [0.001, 0.01, 0.1]
best_rmse = []
# Systematically vary the eta
for curr_val in eta_vals:
    params["eta"] = curr_val
    # Perform cross-validation: cv_results
    cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=3,
                        num_boost_round=10, early_stopping_rounds=5,
                        metrics="rmse", as_pandas=True, seed=123)
    # Append the final round rmse to best_rmse
    best_rmse.append(cv_results["test-rmse-mean"].tail(1).values[0])
# Print the resultant DataFrame
print(pd.DataFrame(list(zip(eta_vals, best_rmse)), columns=["eta","best_rmse"]))
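eta (the learning rate) shrinks each tree's contribution; smaller values make boosting more conservative but typically need more boosting rounds to reach the same error.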
---
Tuning max_depth
-----
# Create your housing DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X,label=y)
# Create the parameter dictionary
params = {"objective":"reg:linear"}
# Create list of max_depth values
max_depths = [2, 5, 10, 20]
best_rmse = []
# Systematically vary the max_depth
for curr_val in max_depths:
    params["max_depth"] = curr_val
    # Perform cross-validation
    cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=2,
                        num_boost_round=10, early_stopping_rounds=5,
                        metrics="rmse", as_pandas=True, seed=123)
    # Append the final round rmse to best_rmse
    best_rmse.append(cv_results["test-rmse-mean"].tail(1).values[0])
# Print the resultant DataFrame
print(pd.DataFrame(list(zip(max_depths, best_rmse)),columns=["max_depth","best_rmse"]))
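Deeper trees can capture more complex feature interactions but overfit more easily, so the cross-validated RMSE will not necessarily keep improving as max_depth grows.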
-----
Tuning colsample_bytree
----
# Create your housing DMatrix
housing_dmatrix = xgb.DMatrix(data=X,label=y)
# Create the parameter dictionary
params={"objective":"reg:linear","max_depth":3}
# Create list of hyperparameter values
colsample_bytree_vals = [0.1, 0.5, 0.8, 1]
best_rmse = []
# Systematically vary the hyperparameter value
for curr_val in colsample_bytree_vals:
    params["colsample_bytree"] = curr_val
    # Perform cross-validation
    cv_results = xgb.cv(dtrain=housing_dmatrix, params=params, nfold=2,
                        num_boost_round=10, early_stopping_rounds=5,
                        metrics="rmse", as_pandas=True, seed=123)
    # Append the final round rmse to best_rmse
    best_rmse.append(cv_results["test-rmse-mean"].tail(1).values[0])
# Print the resultant DataFrame
print(pd.DataFrame(list(zip(colsample_bytree_vals, best_rmse)), columns=["colsample_bytree","best_rmse"]))
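colsample_bytree is the fraction of features sampled for each tree; values well below 1 inject randomness in the spirit of random forests and can act as an additional regularizer.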
------
Grid search with XGBoost
------
# Create the parameter grid: gbm_param_grid
gbm_param_grid = {
    'colsample_bytree': [0.3, 0.7],
    'n_estimators': [50],
    'max_depth': [2, 5]
}
# Instantiate the regressor: gbm
gbm = xgb.XGBRegressor()
# Perform grid search: grid_mse
from sklearn.model_selection import GridSearchCV
grid_mse = GridSearchCV(estimator=gbm, param_grid=gbm_param_grid,
                        scoring='neg_mean_squared_error', cv=4, verbose=1)
grid_mse.fit(X, y)
# Print the best parameters and lowest RMSE
print("Best parameters found: ", grid_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))
---------
Random search with XGBoost
----------
# Create the parameter grid: gbm_param_grid
gbm_param_grid = {
    'n_estimators': [25],
    'max_depth': range(2, 12)
}
# Instantiate the regressor: gbm
gbm = xgb.XGBRegressor(n_estimators=10)
# Perform random search: randomized_mse
from sklearn.model_selection import RandomizedSearchCV
randomized_mse = RandomizedSearchCV(estimator=gbm, param_distributions=gbm_param_grid,
                                    n_iter=5, scoring='neg_mean_squared_error', cv=4, verbose=1)
randomized_mse.fit(X, y)
# Print the best parameters and lowest RMSE
print("Best parameters found: ",randomized_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))
--------
Encoding categorical columns I: LabelEncoder
---------
# Import LabelEncoder
from sklearn.preprocessing import LabelEncoder
# Fill missing values with 0
df.LotFrontage = df.LotFrontage.fillna(0)
# Create a boolean mask for categorical columns
categorical_mask = (df.dtypes == object)
# Get list of categorical column names
categorical_columns = df.columns[categorical_mask].tolist()
# Print the head of the categorical columns
print(df[categorical_columns].head())
# Create LabelEncoder object: le
le = LabelEncoder()
# Apply LabelEncoder to categorical columns
df[categorical_columns] = df[categorical_columns].apply(lambda x: le.fit_transform(x))
# Print the head of the LabelEncoded categorical columns
print(df[categorical_columns].head())
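Note that LabelEncoder maps each category to an arbitrary integer, which imposes an ordering the data may not actually have; tree-based models largely tolerate this, but one-hot encoding (next section) avoids it entirely.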
------
Encoding categorical columns II: OneHotEncoder
-------
# Import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
# Create OneHotEncoder: ohe
# (note: categorical_features was removed in scikit-learn 0.22 -- in modern
# scikit-learn, select the categorical columns with a ColumnTransformer instead)
ohe = OneHotEncoder(categorical_features=categorical_mask, sparse=False)
# Apply OneHotEncoder to categorical columns - output is no longer a dataframe: df_encoded
df_encoded = ohe.fit_transform(df)
# Print first 5 rows of the resulting dataset - again, this will no longer be a pandas dataframe
print(df_encoded[:5, :])
# Print the shape of the original DataFrame
print(df.shape)
# Print the shape of the transformed array
print(df_encoded.shape)
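The column count grows because each categorical column is expanded into one binary indicator column per distinct category.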
-------
Encoding categorical columns III: DictVectorizer (the LabelEncoder-then-OneHotEncoder sequence can be simplified into a single DictVectorizer)
------
# Import DictVectorizer
from sklearn.feature_extraction import DictVectorizer
# Convert df into a dictionary: df_dict
df_dict = df.to_dict("records")
# Create the DictVectorizer object: dv
dv = DictVectorizer(sparse=False)
# Apply dv on df: df_encoded
df_encoded = dv.fit_transform(df_dict)
# Print the resulting first five rows
print(df_encoded[:5,:])
# Print the vocabulary
print(dv.vocabulary_)
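DictVectorizer one-hot encodes string-valued features and passes numeric features through unchanged, so it collapses the LabelEncoder-plus-OneHotEncoder steps into one transform; vocabulary_ maps each (possibly expanded) feature name to its column index.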
-----
Preprocessing within a pipeline
-------
# Import necessary modules
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
# Fill LotFrontage missing values with 0
X.LotFrontage = X.LotFrontage.fillna(0)
# Setup the pipeline steps: steps
steps = [("ohe_onestep", DictVectorizer(sparse=False)),
("xgb_model", xgb.XGBRegressor())]
# Create the pipeline: xgb_pipeline
xgb_pipeline = Pipeline(steps)
# Fit the pipeline
xgb_pipeline.fit(X.to_dict("records"), y)
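Once fitted, the whole pipeline can be used for prediction in one call (a usage sketch; the records conversion mirrors the fit call above):
preds = xgb_pipeline.predict(X.to_dict("records"))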
-----
IMPORTANT STUFF
-----
Cross-validating your XGBoost model
-----
# Import necessary modules
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
# Fill LotFrontage missing values with 0
X.LotFrontage = X.LotFrontage.fillna(0)
# Setup the pipeline steps: steps
steps = [("ohe_onestep", DictVectorizer(sparse=False)),
("xgb_model", xgb.XGBRegressor(max_depth=2, objective="reg:linear"))]
# Create the pipeline: xgb_pipeline
xgb_pipeline = Pipeline(steps)
# Cross-validate the model
cross_val_scores = cross_val_score(xgb_pipeline, X.to_dict("records"), y, cv=10, scoring="neg_mean_squared_error")
# Print the 10-fold RMSE
print("10-fold RMSE: ", np.mean(np.sqrt(np.abs(cross_val_scores))))
---
KIDNEY CASE STUDY
(continued in the next gist)