# ivopbernardo/h2o_example.r

## h2o_example.r
# Load h2o
library(h2o)
library(ggplot2)

# Read the London bike-sharing data from the working directory.
london_bike <- read.csv("./london_merged.csv")

# Weather code and season are category labels, so both are stored as factors.
london_bike[c("weather_code", "season")] <- lapply(
  london_bike[c("weather_code", "season")],
  as.factor
)

# Initialise H2O before any H2O frame is created.
h2o.init()

# Hand the data frame over to H2O.
london_bike.h2o <- as.h2o(london_bike)

# Split with ratio 0.8 and a fixed seed; the first piece is used for training,
# the second is held out for testing.
london_bike_split <- h2o.splitFrame(
  london_bike.h2o,
  ratios = 0.8,
  seed = 1234
)
training_data <- london_bike_split[[1]]
test_data <- london_bike_split[[2]]

# Training Linear regression

# Feature columns and target column shared by every model in this script.
predictors <- c(
  "t1", "t2", "hum", "wind_speed",
  "weather_code", "is_holiday", "is_weekend", "season"
)
response <- "cnt"

# Model 1: GLM fitted on the training split with h2o.glm's default settings.
london_bike_model <- h2o.glm(
  training_frame = training_data,
  x = predictors,
  y = response
)

# Score the held-out split.
test_predict <- h2o.predict(london_bike_model, newdata = test_data)

# Bring the actual counts and the predictions back into R, side by side.
predictions_x_real <- cbind(
  as.data.frame(test_data$cnt),
  as.data.frame(test_predict)
)

# Scatter plot of actual counts (x) against predicted counts (y).
ggplot(predictions_x_real, aes(x = cnt, y = predict)) +
  geom_point(color = "darkgreen") +
  labs(x = "Actual Label", y = "Predictions")

# Training Linear Regression using Regularization
# Same inputs as Model 1, now with alpha = 1 passed explicitly (presumably the
# pure-L1 / lasso end of H2O's elastic-net mix - confirm in the h2o.glm docs).
london_bike_model_regularized <- h2o.glm(
  training_frame = training_data,
  x = predictors,
  y = response,
  alpha = 1
)

# Score the held-out split with the regularized model.
test_predict_regularized <- h2o.predict(
  london_bike_model_regularized,
  newdata = test_data
)

# Actual counts next to the regularized model's predictions, as an R data frame.
predictions_x_real_regularized <- cbind(
  as.data.frame(test_data$cnt),
  as.data.frame(test_predict_regularized)
)

# Scatter plot of actual counts (x) against predicted counts (y).
ggplot(predictions_x_real_regularized, aes(x = cnt, y = predict)) +
  geom_point(color = "darkgreen") +
  labs(x = "Actual Label", y = "Predictions")

# Evaluating models - Using validation_frame
# Refit the GLM, this time also passing the held-out split as the validation
# frame. This overwrites the `london_bike_model` fitted earlier.
london_bike_model <- h2o.glm(
  x = predictors,
  y = response,
  validation_frame = test_data,
  training_frame = training_data
)

# RMSE requested for both the training and the validation frame.
h2o.rmse(london_bike_model, train = TRUE, valid = TRUE)

# Random Forest Example
# 25 trees with max_depth 5, validated on the held-out split. A fixed seed is
# supplied so the metrics below can be repeated on the same setup, in line with
# the seeded split, neural network, grid search and AutoML calls in this script.
london_bike_rf <- h2o.randomForest(
  x = predictors,
  y = response,
  training_frame = training_data,
  validation_frame = test_data,
  ntrees = 25,
  max_depth = 5,
  seed = 1234
)

# Retrieving metrics for randomforest (training and validation frames)
h2o.rmse(london_bike_rf, train = TRUE, valid = TRUE)
h2o.r2(london_bike_rf, train = TRUE, valid = TRUE)

# Training Neural Network
# Deep learning model on the same predictors/response, validated on the
# held-out split. `seed` and `reproducible = TRUE` are both set explicitly.
nn_model <- h2o.deeplearning(
  x = predictors,
  y = response,
  training_frame = training_data,
  validation_frame = test_data,
  hidden = c(6, 6, 4, 7),
  activation = "Rectifier",
  epochs = 1000,
  train_samples_per_iteration = -1,
  seed = 23123,
  reproducible = TRUE
)

# Neural Network Evaluation (training and validation frames)
h2o.rmse(nn_model, train = TRUE, valid = TRUE)
h2o.r2(nn_model, train = TRUE, valid = TRUE)

# Grid Search
# Candidate values for three random-forest hyperparameters
# (4 x 3 x 3 = 36 combinations in total).
rf_params <- list(
  ntrees = c(2, 5, 10, 15),
  max_depth = c(3, 5, 9),
  min_rows = c(5, 10, 100)
)

# Train and validate a grid of randomForests, registered as "rf_grid".
rf_grid <- h2o.grid(
  "randomForest",
  grid_id = "rf_grid",
  hyper_params = rf_params,
  x = predictors,
  y = response,
  training_frame = training_data,
  validation_frame = test_data,
  seed = 1
)

# Fetch the grid by id, sorted by R2 in decreasing order.
h2o.getGrid("rf_grid", sort_by = "r2", decreasing = TRUE)

# Auto ML Routine
# Train up to 15 AutoML models with a fixed seed. The held-out split is passed
# as `leaderboard_frame` rather than `validation_frame`: per the H2O AutoML
# documentation, `validation_frame` is ignored unless cross-validation is
# switched off (nfolds = 0), whereas `leaderboard_frame` makes the leaderboard
# score the models on the supplied data.
aml <- h2o.automl(
  x = predictors,
  y = response,
  training_frame = training_data,
  leaderboard_frame = test_data,
  max_models = 15,
  seed = 1
)


# Explainability
# Random forest that the plots below explain (25 trees, max_depth 5, validated
# on the held-out split). The seed is fixed so that the row-level explanations
# do not change from one run to the next on the same setup.
london_bike_rf <- h2o.randomForest(
  x = predictors,
  y = response,
  training_frame = training_data,
  validation_frame = test_data,
  ntrees = 25,
  max_depth = 5,
  seed = 1234
)

# Variable importance plot
h2o.varimp_plot(london_bike_rf)

# Shap Summary, computed on the held-out split
h2o.shap_summary_plot(london_bike_rf, test_data)

# Shap Explain Row: row_index 4 of the held-out split
h2o.shap_explain_row_plot(london_bike_rf, test_data, row_index = 4)

# Shap Explain Summer Row: row_index 830 of the held-out split
h2o.shap_explain_row_plot(london_bike_rf, test_data, row_index = 830)
	# NOTE(review): from this line to the end of the file the script repeats the
	# pipeline above a second time (tab-indented copy), so every model is trained
	# again when the file is run top to bottom. Looks like a paste artifact -
	# confirm and consider deleting this second copy.

	# Load h2o and ggplot2
	library(h2o)
	library(ggplot2)

	# Read the London bike-sharing data from the working directory.
	london_bike <- read.csv("./london_merged.csv")

	# Weather code and season are category labels, so both are stored as factors.
	london_bike[c("weather_code", "season")] <- lapply(
	  london_bike[c("weather_code", "season")],
	  as.factor
	)

	# Initialise H2O before any H2O frame is created.
	h2o.init()

	# Hand the data frame over to H2O.
	london_bike.h2o <- as.h2o(london_bike)

	# Split with ratio 0.8 and a fixed seed; the first piece is used for
	# training, the second is held out for testing.
	london_bike_split <- h2o.splitFrame(
	  london_bike.h2o,
	  ratios = 0.8,
	  seed = 1234
	)
	training_data <- london_bike_split[[1]]
	test_data <- london_bike_split[[2]]

	# Training Linear regression

	# Feature columns and target column shared by the models that follow.
	predictors <- c(
	  "t1", "t2", "hum", "wind_speed",
	  "weather_code", "is_holiday", "is_weekend", "season"
	)
	response <- "cnt"

	# Model 1: GLM fitted on the training split with h2o.glm's default settings.
	london_bike_model <- h2o.glm(
	  training_frame = training_data,
	  x = predictors,
	  y = response
	)

	# Score the held-out split.
	test_predict <- h2o.predict(london_bike_model, newdata = test_data)

	# Bring the actual counts and the predictions back into R, side by side.
	predictions_x_real <- cbind(
	  as.data.frame(test_data$cnt),
	  as.data.frame(test_predict)
	)

	# Scatter plot of actual counts (x) against predicted counts (y).
	ggplot(predictions_x_real, aes(x = cnt, y = predict)) +
	  geom_point(color = "darkgreen") +
	  labs(x = "Actual Label", y = "Predictions")

	# Training Linear Regression using Regularization
	# Same inputs as Model 1, now with alpha = 1 passed explicitly (presumably
	# the pure-L1 / lasso end of H2O's elastic-net mix - confirm in the h2o.glm
	# docs).
	london_bike_model_regularized <- h2o.glm(
	  training_frame = training_data,
	  x = predictors,
	  y = response,
	  alpha = 1
	)

	# Score the held-out split with the regularized model.
	test_predict_regularized <- h2o.predict(
	  london_bike_model_regularized,
	  newdata = test_data
	)

	# Actual counts next to the regularized model's predictions.
	predictions_x_real_regularized <- cbind(
	  as.data.frame(test_data$cnt),
	  as.data.frame(test_predict_regularized)
	)

	# Scatter plot of actual counts (x) against predicted counts (y).
	ggplot(predictions_x_real_regularized, aes(x = cnt, y = predict)) +
	  geom_point(color = "darkgreen") +
	  labs(x = "Actual Label", y = "Predictions")

	# Evaluating models - Using validation_frame
	# Refit the GLM, this time also passing the held-out split as the validation
	# frame. This overwrites the `london_bike_model` fitted earlier.
	london_bike_model <- h2o.glm(
	  x = predictors,
	  y = response,
	  validation_frame = test_data,
	  training_frame = training_data
	)

	# RMSE requested for both the training and the validation frame.
	h2o.rmse(london_bike_model, train = TRUE, valid = TRUE)

	# Random Forest Example
	# 25 trees with max_depth 5, validated on the held-out split. A fixed seed
	# is supplied so the metrics below can be repeated on the same setup, in line
	# with the seeded split, neural network, grid search and AutoML calls in
	# this script.
	london_bike_rf <- h2o.randomForest(
	  x = predictors,
	  y = response,
	  training_frame = training_data,
	  validation_frame = test_data,
	  ntrees = 25,
	  max_depth = 5,
	  seed = 1234
	)

	# Retrieving metrics for randomforest (training and validation frames)
	h2o.rmse(london_bike_rf, train = TRUE, valid = TRUE)
	h2o.r2(london_bike_rf, train = TRUE, valid = TRUE)

	# Training Neural Network
	# Deep learning model on the same predictors/response, validated on the
	# held-out split. `seed` and `reproducible = TRUE` are both set explicitly.
	nn_model <- h2o.deeplearning(
	  x = predictors,
	  y = response,
	  training_frame = training_data,
	  validation_frame = test_data,
	  hidden = c(6, 6, 4, 7),
	  activation = "Rectifier",
	  epochs = 1000,
	  train_samples_per_iteration = -1,
	  seed = 23123,
	  reproducible = TRUE
	)

	# Neural Network Evaluation (training and validation frames)
	h2o.rmse(nn_model, train = TRUE, valid = TRUE)
	h2o.r2(nn_model, train = TRUE, valid = TRUE)

	# Grid Search
	# Candidate values for three random-forest hyperparameters
	# (4 x 3 x 3 = 36 combinations in total).
	rf_params <- list(
	  ntrees = c(2, 5, 10, 15),
	  max_depth = c(3, 5, 9),
	  min_rows = c(5, 10, 100)
	)

	# Train and validate a grid of randomForests, registered as "rf_grid".
	rf_grid <- h2o.grid(
	  "randomForest",
	  grid_id = "rf_grid",
	  hyper_params = rf_params,
	  x = predictors,
	  y = response,
	  training_frame = training_data,
	  validation_frame = test_data,
	  seed = 1
	)

	# Fetch the grid by id, sorted by R2 in decreasing order.
	h2o.getGrid("rf_grid", sort_by = "r2", decreasing = TRUE)

	# Auto ML Routine
	# Train up to 15 AutoML models with a fixed seed. The held-out split is
	# passed as `leaderboard_frame` rather than `validation_frame`: per the H2O
	# AutoML documentation, `validation_frame` is ignored unless cross-validation
	# is switched off (nfolds = 0), whereas `leaderboard_frame` makes the
	# leaderboard score the models on the supplied data.
	aml <- h2o.automl(
	  x = predictors,
	  y = response,
	  training_frame = training_data,
	  leaderboard_frame = test_data,
	  max_models = 15,
	  seed = 1
	)


	# Explainability
	# Random forest that the plots below explain (25 trees, max_depth 5,
	# validated on the held-out split). The seed is fixed so that the row-level
	# explanations do not change from one run to the next on the same setup.
	london_bike_rf <- h2o.randomForest(
	  x = predictors,
	  y = response,
	  training_frame = training_data,
	  validation_frame = test_data,
	  ntrees = 25,
	  max_depth = 5,
	  seed = 1234
	)

	# Variable importance plot
	h2o.varimp_plot(london_bike_rf)

	# Shap Summary, computed on the held-out split
	h2o.shap_summary_plot(london_bike_rf, test_data)

	# Shap Explain Row: row_index 4 of the held-out split
	h2o.shap_explain_row_plot(london_bike_rf, test_data, row_index = 4)

	# Shap Explain Summer Row: row_index 830 of the held-out split
	h2o.shap_explain_row_plot(london_bike_rf, test_data, row_index = 830)