Created
February 24, 2017 02:18
-
-
Save ledell/95cdd71f1de715c48ba9293071a6ea22 to your computer and use it in GitHub Desktop.
Python code example from here: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/stacked-ensembles.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NOTE: `from __future__ import ...` must be the first statement in a module
# (only a docstring/comments may precede it); placing it after other imports,
# as the original did, raises a SyntaxError. Hoisted to the top.
from __future__ import print_function

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from h2o.grid.grid_search import H2OGridSearch

# Start (or connect to) a local H2O cluster, using all available CPU cores.
h2o.init(nthreads=-1)
# Import a sample binary outcome train/test set into H2O
train = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
test = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")

# Response column; every remaining column is a predictor.
y = "response"
x = [col for col in train.columns if col != y]

# For binary classification, H2O requires the response to be a factor (enum).
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

# Number of CV folds (used to generate the level-one data for stacking)
nfolds = 5
# There are a few ways to assemble a list of models to stack together:
#   1. Train individual models and put them in a list
#   2. Train a grid of models
#   3. Train several grids of models
# Note: all base models must share the same cross-validation folds, and
# their cross-validated predicted values must be kept.

# 1. Generate a 2-model ensemble (GBM + RF)

# Train and cross-validate a GBM; CV predictions are kept so the
# stacked ensemble can use them as level-one training data.
gbm_params = dict(
    distribution="bernoulli",
    ntrees=10,
    max_depth=3,
    min_rows=2,
    learn_rate=0.2,
    nfolds=nfolds,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
    seed=1,
)
my_gbm = H2OGradientBoostingEstimator(**gbm_params)
my_gbm.train(x=x, y=y, training_frame=train)
# Train and cross-validate a Random Forest with the same fold setup
# ("Modulo" assignment + seed guarantees identical folds across models).
my_rf = H2ORandomForestEstimator(
    ntrees=50,
    nfolds=nfolds,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
    seed=1,
)
my_rf.train(x=x, y=y, training_frame=train)
# Train a stacked ensemble using the GBM and RF above
# (the original comment said "GLM" — the second base model is a Random Forest).
ensemble = H2OStackedEnsembleEstimator(
    model_id="my_ensemble_binomial",
    base_models=[my_gbm.model_id, my_rf.model_id],
)
ensemble.train(x=x, y=y, training_frame=train)

# Eval ensemble performance on the test data
perf_stack_test = ensemble.model_performance(test)

# Compare against the best of the individual base learners on the test set
base_aucs = [model.model_performance(test).auc() for model in (my_gbm, my_rf)]
baselearner_best_auc_test = max(base_aucs)
stack_auc_test = perf_stack_test.auc()
print("Best Base-learner Test AUC: {0}".format(baselearner_best_auc_test))
print("Ensemble Test AUC: {0}".format(stack_auc_test))

# Generate predictions on a test set (if necessary)
pred = ensemble.predict(test)
# 2. Generate a random grid of models and stack them together

# GBM hyperparameter space to search over
hyper_params = {
    "learn_rate": [0.01, 0.03],
    "max_depth": [3, 4, 5, 6, 9],
    "sample_rate": [0.7, 0.8, 0.9, 1.0],
    "col_sample_rate": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
}
# Random search, capped at 3 models to keep the demo fast
search_criteria = {"strategy": "RandomDiscrete", "max_models": 3, "seed": 1}

# Train the grid; every member keeps its CV predictions on the same folds
# so they can all serve as base models for stacking.
base_gbm = H2OGradientBoostingEstimator(
    ntrees=10,
    seed=1,
    nfolds=nfolds,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
)
grid = H2OGridSearch(
    model=base_gbm,
    hyper_params=hyper_params,
    search_criteria=search_criteria,
    grid_id="gbm_grid_binomial",
)
grid.train(x=x, y=y, training_frame=train)
# Train a stacked ensemble using the GBM grid
ensemble = H2OStackedEnsembleEstimator(
    model_id="my_ensemble_gbm_grid_binomial",
    base_models=grid.model_ids,
)
ensemble.train(x=x, y=y, training_frame=train)

# Eval ensemble performance on the test data
perf_stack_test = ensemble.model_performance(test)

# Compare to the best base learner's performance on the test set
baselearner_best_auc_test = max(
    h2o.get_model(model_id).model_performance(test_data=test).auc()
    for model_id in grid.model_ids
)
stack_auc_test = perf_stack_test.auc()
print("Best Base-learner Test AUC: {0}".format(baselearner_best_auc_test))
print("Ensemble Test AUC: {0}".format(stack_auc_test))

# Generate predictions on a test set (if necessary)
pred = ensemble.predict(test)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment