Skip to content

Instantly share code, notes, and snippets.

@ledell
Last active February 15, 2016 19:21
Show Gist options
  • Save ledell/b3b1e0af3d817beed4b4 to your computer and use it in GitHub Desktop.
Save ledell/b3b1e0af3d817beed4b4 to your computer and use it in GitHub Desktop.
library(h2o)

# Start a local H2O cluster; nthreads = -1 asks H2O to use all available cores.
localH2O <- h2o.init(nthreads = -1)

# Import the training data into the cluster under the key "breast_cancer".
data <- h2o.importFile(
  "http://www.stat.berkeley.edu/~ledell/data/wisc-diag-breast-cancer-shuffled.csv",
  destination_frame = "breast_cancer"
)

y <- "diagnosis"                       # response column
x <- setdiff(names(data), c(y, "id"))  # predictors: drop response and 'id'

# Split into train / validation partitions.
# The split is performed inside the H2O cluster, so R's set.seed() has no
# effect on it; the seed has to be handed to h2o.splitFrame() itself for the
# partition to be reproducible. ratios = 0.75 is the function's default,
# spelled out here so the split size is visible.
ss <- h2o.splitFrame(data, ratios = 0.75, seed = 1)
training_frame <- ss[[1]]
validation_frame <- ss[[2]]
# Fit a binomial GLM on the training partition; the validation partition is
# supplied so that hold-out metrics are computed alongside the fit.
h2o_glm <- h2o.glm(
  x = x,
  y = y,
  family = "binomial",
  training_frame = training_frame,
  validation_frame = validation_frame
)

# Validation AUC, read directly from the model's metrics slots.
print(h2o_glm@model$validation_metrics@metrics$AUC)
# Validation AUC reported by the original author: 0.9935432

# Accessor-function route to the validation AUC.
h2o.auc(h2o_glm, valid = TRUE)
# Cross-validated GBM: 5-fold cross-validation over the full data set.
# `family` is a GLM argument and is not part of h2o.gbm()'s signature; GBM
# selects its loss through `distribution`, and "bernoulli" is the setting for
# a two-class response.
h2o_gbm <- h2o.gbm(
  x = x,
  y = y,
  training_frame = data,
  nfolds = 5,
  distribution = "bernoulli",
  seed = 1
)

# Cross-validated AUC via the accessor instead of reaching into model slots.
print(h2o.auc(h2o_gbm, xval = TRUE))
# CV AUC reported by the original author: 0.9894099
# Cross-validated Random Forest: 5-fold cross-validation over the full data set.
# `family` is a GLM argument and is not part of h2o.randomForest()'s
# signature, so it is not passed here.
# NOTE(review): presumably the response column is categorical so that a
# classifier is fit and AUC is available -- confirm against the imported frame.
h2o_rf <- h2o.randomForest(
  x = x,
  y = y,
  training_frame = data,
  nfolds = 5,
  seed = 1
)

# Cross-validated AUC via the accessor instead of reaching into model slots.
print(h2o.auc(h2o_rf, xval = TRUE))
# CV AUC reported by the original author: 0.9902621
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment