Skip to content

Instantly share code, notes, and snippets.

@ledell
Created February 24, 2017 02:18
Show Gist options
  • Save ledell/95cdd71f1de715c48ba9293071a6ea22 to your computer and use it in GitHub Desktop.
Save ledell/95cdd71f1de715c48ba9293071a6ea22 to your computer and use it in GitHub Desktop.
# __future__ imports must precede every other statement in the module;
# placing this after the h2o imports is a compile-time SyntaxError.
from __future__ import print_function

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from h2o.grid.grid_search import H2OGridSearch

# Connect to (or start) an H2O cluster; nthreads=-1 lets H2O use all cores.
h2o.init(nthreads=-1)
# Load the sample binary-outcome train/test sets into H2O
train = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
test = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")

# Response column; every other column of the training frame is a predictor
y = "response"
x = train.columns
x.remove(y)

# Binary classification needs the response to be a factor in both frames
for frame in (train, test):
    frame[y] = frame[y].asfactor()

# Number of CV folds (used to generate the level-one data for stacking)
nfolds = 5
# A list of models to stack together can be assembled in a few ways:
#   1. Train individual models and put them in a list
#   2. Train a grid of models
#   3. Train several grids of models
# Note: every base model must use the same cross-validation folds, and the
# cross-validated predictions must be kept.

# 1. Generate a 2-model ensemble (GBM + RF)

# Base learner 1: a cross-validated GBM
my_gbm = H2OGradientBoostingEstimator(
    distribution="bernoulli",
    ntrees=10,
    max_depth=3,
    min_rows=2,
    learn_rate=0.2,
    nfolds=nfolds,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
    seed=1,
)
my_gbm.train(x=x, y=y, training_frame=train)

# Base learner 2: a cross-validated random forest, using the same fold
# settings as the GBM
my_rf = H2ORandomForestEstimator(
    ntrees=50,
    nfolds=nfolds,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
    seed=1,
)
my_rf.train(x=x, y=y, training_frame=train)

# Stack the GBM and RF trained above into a single ensemble
base_model_ids = [my_gbm.model_id, my_rf.model_id]
ensemble = H2OStackedEnsembleEstimator(
    model_id="my_ensemble_binomial",
    base_models=base_model_ids,
)
ensemble.train(x=x, y=y, training_frame=train)

# Score the ensemble and each base learner on the held-out test data
perf_stack_test = ensemble.model_performance(test)
perf_gbm_test = my_gbm.model_performance(test)
perf_rf_test = my_rf.model_performance(test)

# Compare the ensemble's test AUC against the best single base learner
baselearner_best_auc_test = max(perf_gbm_test.auc(), perf_rf_test.auc())
stack_auc_test = perf_stack_test.auc()
print("Best Base-learner Test AUC: {0}".format(baselearner_best_auc_test))
print("Ensemble Test AUC: {0}".format(stack_auc_test))

# Generate predictions on a test set (if necessary)
pred = ensemble.predict(test)
# 2. Generate a random grid of models and stack them together

# GBM hyperparameter values the grid search draws from
hyper_params = {
    "learn_rate": [0.01, 0.03],
    "max_depth": [3, 4, 5, 6, 9],
    "sample_rate": [0.7, 0.8, 0.9, 1.0],
    "col_sample_rate": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
}
# Random discrete search, capped at 3 models, seeded for repeatability
search_criteria = {"strategy": "RandomDiscrete", "max_models": 3, "seed": 1}

# Template GBM: every grid member shares these cross-validation settings so
# that the resulting models can be stacked
gbm_template = H2OGradientBoostingEstimator(
    ntrees=10,
    seed=1,
    nfolds=nfolds,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
)

# Train the grid
grid = H2OGridSearch(
    model=gbm_template,
    hyper_params=hyper_params,
    search_criteria=search_criteria,
    grid_id="gbm_grid_binomial",
)
grid.train(x=x, y=y, training_frame=train)

# Stack every model produced by the GBM grid
ensemble = H2OStackedEnsembleEstimator(
    model_id="my_ensemble_gbm_grid_binomial",
    base_models=grid.model_ids,
)
ensemble.train(x=x, y=y, training_frame=train)

# Score the ensemble on the held-out test data
perf_stack_test = ensemble.model_performance(test)

# Test AUC of each grid model, to find the best single base learner
grid_test_aucs = [
    h2o.get_model(model_id).model_performance(test_data=test).auc()
    for model_id in grid.model_ids
]
baselearner_best_auc_test = max(grid_test_aucs)
stack_auc_test = perf_stack_test.auc()
print("Best Base-learner Test AUC: {0}".format(baselearner_best_auc_test))
print("Ensemble Test AUC: {0}".format(stack_auc_test))

# Generate predictions on a test set (if necessary)
pred = ensemble.predict(test)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment