Created
February 24, 2017 02:18
-
-
Save ledell/95cdd71f1de715c48ba9293071a6ea22 to your computer and use it in GitHub Desktop.
Python code example from here: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/stacked-ensembles.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NOTE: `from __future__ import ...` must be the first statement in a module
# (only a docstring/comments may precede it); placing it after other imports,
# as the original did, raises a SyntaxError. Hoisted to the top.
from __future__ import print_function

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from h2o.grid.grid_search import H2OGridSearch

# Start (or connect to) a local H2O cluster, using all available CPU cores.
h2o.init(nthreads=-1)
# Import a sample binary outcome train/test set into H2O
train = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
test = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")

# Response column; every remaining column is a predictor.
y = "response"
x = [col for col in train.columns if col != y]

# For binary classification, H2O requires the response to be a factor (enum).
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

# Number of CV folds (used to generate the level-one data for stacking)
nfolds = 5
# There are a few ways to assemble a list of models to stack together:
#   1. Train individual models and put them in a list
#   2. Train a grid of models
#   3. Train several grids of models
# Note: all base models must share the same cross-validation folds, and
# their cross-validated predicted values must be kept.

# 1. Generate a 2-model ensemble (GBM + RF)

# Train and cross-validate a GBM; CV predictions are kept so the
# stacked ensemble can use them as level-one training data.
gbm_params = dict(
    distribution="bernoulli",
    ntrees=10,
    max_depth=3,
    min_rows=2,
    learn_rate=0.2,
    nfolds=nfolds,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
    seed=1,
)
my_gbm = H2OGradientBoostingEstimator(**gbm_params)
my_gbm.train(x=x, y=y, training_frame=train)
# Train and cross-validate a Random Forest with the same fold setup
# ("Modulo" assignment + seed guarantees identical folds across models).
my_rf = H2ORandomForestEstimator(
    ntrees=50,
    nfolds=nfolds,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
    seed=1,
)
my_rf.train(x=x, y=y, training_frame=train)
# Train a stacked ensemble using the GBM and RF above
# (the original comment said "GLM" — the second base model is a Random Forest).
ensemble = H2OStackedEnsembleEstimator(
    model_id="my_ensemble_binomial",
    base_models=[my_gbm.model_id, my_rf.model_id],
)
ensemble.train(x=x, y=y, training_frame=train)

# Eval ensemble performance on the test data
perf_stack_test = ensemble.model_performance(test)

# Compare against the best of the individual base learners on the test set
base_aucs = [model.model_performance(test).auc() for model in (my_gbm, my_rf)]
baselearner_best_auc_test = max(base_aucs)
stack_auc_test = perf_stack_test.auc()
print("Best Base-learner Test AUC: {0}".format(baselearner_best_auc_test))
print("Ensemble Test AUC: {0}".format(stack_auc_test))

# Generate predictions on a test set (if necessary)
pred = ensemble.predict(test)
# 2. Generate a random grid of models and stack them together

# GBM hyperparameter space to search over
hyper_params = {
    "learn_rate": [0.01, 0.03],
    "max_depth": [3, 4, 5, 6, 9],
    "sample_rate": [0.7, 0.8, 0.9, 1.0],
    "col_sample_rate": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
}
# Random search, capped at 3 models to keep the demo fast
search_criteria = {"strategy": "RandomDiscrete", "max_models": 3, "seed": 1}

# Train the grid; every member keeps its CV predictions on the same folds
# so they can all serve as base models for stacking.
base_gbm = H2OGradientBoostingEstimator(
    ntrees=10,
    seed=1,
    nfolds=nfolds,
    fold_assignment="Modulo",
    keep_cross_validation_predictions=True,
)
grid = H2OGridSearch(
    model=base_gbm,
    hyper_params=hyper_params,
    search_criteria=search_criteria,
    grid_id="gbm_grid_binomial",
)
grid.train(x=x, y=y, training_frame=train)
# Train a stacked ensemble using the GBM grid
ensemble = H2OStackedEnsembleEstimator(
    model_id="my_ensemble_gbm_grid_binomial",
    base_models=grid.model_ids,
)
ensemble.train(x=x, y=y, training_frame=train)

# Eval ensemble performance on the test data
perf_stack_test = ensemble.model_performance(test)

# Compare to the best base learner's performance on the test set
baselearner_best_auc_test = max(
    h2o.get_model(model_id).model_performance(test_data=test).auc()
    for model_id in grid.model_ids
)
stack_auc_test = perf_stack_test.auc()
print("Best Base-learner Test AUC: {0}".format(baselearner_best_auc_test))
print("Ensemble Test AUC: {0}".format(stack_auc_test))

# Generate predictions on a test set (if necessary)
pred = ensemble.predict(test)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment