Last active
June 19, 2020 22:21
-
-
Save ledell/3d99498d4236248f9bfbc8ed2fd424fa to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This example is outdated: H2O now provides a built-in Stacked Ensemble function, which should be used instead:
# http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/stacked-ensembles.html
import h2o | |
from h2o.estimators.gbm import H2OGradientBoostingEstimator | |
from h2o.estimators.deeplearning import H2ODeepLearningEstimator | |
from h2o.estimators.glm import H2OGeneralizedLinearEstimator | |
from h2o.estimators.random_forest import H2ORandomForestEstimator | |
from sklearn import metrics #will be replaced with ensemble_performance later | |
def source_stack_utils(
        url="https://gist.githubusercontent.com/ledell/8ba8d064ae13169a1821faac70d2211b/raw/7d0fa741df619d1a5340e06258a91831951be8a9/stack.py",
        dest="stack.py"):
    """Download the H2O stacking helper module to a local file.

    With the default arguments this fetches the ``stack.py`` gist and saves
    it as ``stack.py`` in the current working directory.

    Args:
        url: Location of the helper module. The default points at one
            specific revision of the gist rather than its moving head.
        dest: Path the downloaded file is written to.

    NOTE(review): the downloaded file is remote code; only import it if the
    source at ``url`` is trusted.
    """
    # urlretrieve lives in urllib.request on Python 3 and directly in urllib
    # on Python 2; support both so the example runs on either interpreter.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve
    urlretrieve(url, dest)
def prep_data_example():
    """Load the example binary-outcome train/test frames into H2O.

    Returns:
        A tuple ``(x, y, train, test, family)``: the predictor column names,
        the response column name, the training frame, the test frame and the
        model family string.
    """
    family = "binomial"
    y = "C1"

    train = h2o.import_file("http://www.stat.berkeley.edu/~ledell/data/higgs_10k.csv")
    test = h2o.import_file("http://www.stat.berkeley.edu/~ledell/data/higgs_test_5k.csv")

    # Predictors are every column after the first one.
    # NOTE(review): presumably the first column is the response "C1" -- confirm.
    x = list(train.columns)[1:]

    # For binary classification the response must be a factor in both frames.
    for frame in (train, test):
        frame[y] = frame[y].asfactor()

    return x, y, train, test, family
def cvtrain_base_models(x, y, train, family):
    """Cross-validate (5-fold) and train a collection of base models.

    This is an example of an ensemble of nine models:
      - 3 GBM
      - 3 DL
      - 2 RF
      - 1 GLM

    Args:
        x: Predictor column names.
        y: Response column name.
        train: Training frame passed to every model's ``train`` call.
        family: GLM-style family name (e.g. ``"binomial"``). It is used
            directly for the GLM and translated to the matching
            ``distribution`` value for the GBM and deep learning models.
            ``None`` falls back to ``"binomial"``, the value that used to be
            hard-coded here.

    Returns:
        The list of trained models, in the order
        ``[gbm1, gbm2, gbm3, dl1, dl2, dl3, rf1, rf2, glm1]``.
    """
    # All models must use exact same CV folds, and the cross-validated
    # predictions are kept on each model.
    nfolds = 5
    fold_assignment = 'Modulo'
    cv_args = dict(nfolds=nfolds,
                   fold_assignment=fold_assignment,
                   keep_cross_validation_predictions=True)

    # Keep family and distribution in sync instead of hard-coding them.
    # GLM calls the binary case "binomial" while GBM/DL call it "bernoulli";
    # other family names are passed through unchanged.
    # NOTE(review): families without a same-named GBM/DL distribution will be
    # rejected by H2O itself -- extend the mapping if such a case is needed.
    if family is None:
        family = 'binomial'
    distribution = {'binomial': 'bernoulli'}.get(family, family)

    gbm1 = H2OGradientBoostingEstimator(distribution=distribution,
                                        ntrees=100,
                                        max_depth=4,
                                        learn_rate=0.1,
                                        **cv_args)
    gbm2 = H2OGradientBoostingEstimator(distribution=distribution,
                                        ntrees=100,
                                        max_depth=4,
                                        learn_rate=0.1,
                                        col_sample_rate=0.7,
                                        **cv_args)
    gbm3 = H2OGradientBoostingEstimator(distribution=distribution,
                                        ntrees=100,
                                        max_depth=2,
                                        learn_rate=0.1,
                                        **cv_args)

    dl1 = H2ODeepLearningEstimator(distribution=distribution,
                                   activation='Rectifier',
                                   hidden=[50, 50, 50],
                                   l1=1e-5,
                                   epochs=10,
                                   **cv_args)
    dl2 = H2ODeepLearningEstimator(distribution=distribution,
                                   activation='RectifierWithDropout',
                                   hidden=[100, 100, 100],
                                   input_dropout_ratio=0.2,
                                   l1=1e-5,
                                   epochs=10,
                                   **cv_args)
    dl3 = H2ODeepLearningEstimator(distribution=distribution,
                                   activation='Rectifier',
                                   hidden=[200, 200],
                                   l1=1e-6,
                                   epochs=10,
                                   **cv_args)

    # The random forests are created without a distribution argument.
    rf1 = H2ORandomForestEstimator(ntrees=300,
                                   **cv_args)
    rf2 = H2ORandomForestEstimator(ntrees=300,
                                   sample_rate=0.7,
                                   mtries=10,
                                   **cv_args)

    glm1 = H2OGeneralizedLinearEstimator(family=family,
                                         **cv_args)

    # Edit this list of base models to make different ensembles
    models = [gbm1, gbm2, gbm3, dl1, dl2, dl3, rf1, rf2, glm1]
    for model in models:
        model.train(x=x, y=y, training_frame=train)
    return models
def main():
    """Run the stacking example end to end and print test-set AUCs.

    Starts H2O, loads the example data, downloads the stacking helpers,
    cross-validates the base models, fits a non-negative GLM metalearner on
    top of them, and prints the test AUC of the ensemble followed by the
    test AUC of every base model.
    """
    h2o.init()

    # Load some example binary response data
    x, y, train, test, family = prep_data_example()

    # Load stacking utils: the helper module is downloaded to stack.py first,
    # so the import has to happen here rather than at the top of the file.
    source_stack_utils()
    from stack import stack, metapredict

    # Cross-validation & training of base models
    # (an arbitrary assortment of base models, defined in cvtrain_base_models)
    models = cvtrain_base_models(x=x, y=y, train=train, family=family)

    # Define a NN-GLM (non-negative GLM) metalearner
    metalearner = H2OGeneralizedLinearEstimator(family='binomial', non_negative=True)

    # Fit the stacked ensemble / Super Learner
    metafit = stack(models=models,
                    metalearner=metalearner,
                    response_frame=train[y],
                    seed=1,
                    keep_levelone_data=True)

    # Generate ensemble prediction on the test set; the base-model
    # predictions returned alongside are not needed here.
    pred, _basepred = metapredict(models=models, metafit=metafit, test_data=test)

    # TO DO: Add metafit.ensemble_performance()
    # Evaluate ensemble test performance
    # NOTE(review): pred[2] is presumably the positive-class probability
    # column of the prediction frame -- confirm against metapredict.
    preds = pred[2].as_data_frame(True)
    labels = test[y].as_data_frame(True)
    fpr, tpr, thresholds = metrics.roc_curve(labels, preds, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(str(auc) + " " + "H2O Ensemble")

    # Evaluate base learner test set performance (for comparison)
    for model in models:
        bperf = model.model_performance(test_data=test)
        print(str(bperf.auc()) + " " + model.model_id)

    # Example output:
    # 0.792100100148 H2O Ensemble
    # 0.781849246474 GBM_model_python_1471654758738_1
    # 0.782052358716 GBM_model_python_1471654758738_816
    # 0.769195957061 GBM_model_python_1471654758738_1837
    # 0.729095165124 DeepLearning_model_python_1471654758738_3028
    # 0.691393671746 DeepLearning_model_python_1471654758738_3057
    # 0.724608757556 DeepLearning_model_python_1471654758738_3086
    # 0.78333120166 DRF_model_python_1471654758738_3115
    # 0.787051172219 DRF_model_python_1471654758738_3896
    # 0.687091955549 GLM_model_python_1471654758738_4639
    # In this example, ensemble test AUC was 0.792 and the top base learner was 0.783
It means that there are three hidden layers, each with 50 neurons. More info here: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/deep-learning.html
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
In the H2ODeepLearningEstimator, what does the parameter hidden=[50,50,50] mean? Can you explain it?