# ledell/train_h2o_cluster_stratified_folds.R

## train_h2o_cluster_stratified_folds.R
# Example of how to train an H2O model with folds that are
# stratified both by outcome and a cluster id

library(cvAUC)

# The `adherence` example data includes an ID column (`id`)
data("adherence")
df <- adherence

# Pull in the helper used below to build stratified folds
source("https://gist.githubusercontent.com/ledell/bd4e227d4e5ff426c41d/raw/708eb429fa1954a140d65a6a42ce93847affd67c/CVFolds2.R")

# Settings for 10-fold, stratified, shuffled cross-validation
cvControl <- list(
  V = 10,
  stratifyCV = TRUE,
  shuffle = TRUE
)

# `folds` is a list of length 10; each element holds the row idxs of one fold
folds <- CVFolds2(
  N = nrow(df),
  id = df$id,
  Y = df$Y,
  cvControl = cvControl
)
# Convert a list of folds into a single vector of fold numbers.
#
# Args:
#   folds: A list; element i holds the row indices assigned to fold i.
#
# Returns:
#   A vector with one slot per index found in `folds`, where position j holds
#   the number of the fold that contains row j. Positions not referenced by
#   any fold stay NA. An empty list gives a zero-length vector.
convert_foldlist_to_vec <- function(folds) {
  N <- length(unlist(folds))
  fold_column <- rep(NA, N)
  # seq_along() instead of 1:V: with an empty list, 1:0 would iterate over
  # c(1, 0) and fail on folds[[1]]
  for (i in seq_along(folds)) {
    fold_column[folds[[i]]] <- i
  }
  fold_column
}
# Attach the per-row fold number to the data
fold_column <- convert_foldlist_to_vec(folds)
df$fold_id <- fold_column
df$Y <- as.factor(df$Y)  # convert to factor for binary classification

# Now use this fold designation with H2O
library(h2o)
h2o.init(nthreads = -1)
train <- as.h2o(df)  # if data is too big, write folds to disk and upload file using h2o.importFile

# Refer to columns by name rather than by position, so that the response is
# the factor `Y` prepared above and neither the cluster id nor the fold id
# is ever used as a predictor.
fit <- h2o.deeplearning(
  x = setdiff(names(df), c("id", "Y", "fold_id")),
  y = "Y",
  training_frame = train,
  fold_column = "fold_id"
)

# Get CV metrics
fit@model$cross_validation_metrics
	# Example of how to train an H2O model with folds that are
	# stratified both by outcome and a cluster id

	library(cvAUC)

	# The `adherence` example data includes an ID column (`id`)
	data("adherence")
	df <- adherence

	# Pull in the helper used below to build stratified folds
	source("https://gist.githubusercontent.com/ledell/bd4e227d4e5ff426c41d/raw/708eb429fa1954a140d65a6a42ce93847affd67c/CVFolds2.R")

	# Settings for 10-fold, stratified, shuffled cross-validation
	cvControl <- list(
	  V = 10,
	  stratifyCV = TRUE,
	  shuffle = TRUE
	)

	# `folds` is a list of length 10; each element holds the row idxs of one fold
	folds <- CVFolds2(
	  N = nrow(df),
	  id = df$id,
	  Y = df$Y,
	  cvControl = cvControl
	)
	# Convert a list of folds into a single vector of fold numbers.
	#
	# Args:
	#   folds: A list; element i holds the row indices assigned to fold i.
	#
	# Returns:
	#   A vector with one slot per index found in `folds`, where position j holds
	#   the number of the fold that contains row j. Positions not referenced by
	#   any fold stay NA. An empty list gives a zero-length vector.
	convert_foldlist_to_vec <- function(folds) {
	  N <- length(unlist(folds))
	  fold_column <- rep(NA, N)
	  # seq_along() instead of 1:V: with an empty list, 1:0 would iterate over
	  # c(1, 0) and fail on folds[[1]]
	  for (i in seq_along(folds)) {
	    fold_column[folds[[i]]] <- i
	  }
	  fold_column
	}
	# Attach the per-row fold number to the data
	fold_column <- convert_foldlist_to_vec(folds)
	df$fold_id <- fold_column
	df$Y <- as.factor(df$Y) # convert to factor for binary classification

	# Now use this fold designation with H2O
	library(h2o)
	h2o.init(nthreads = -1)
	train <- as.h2o(df) # if data is too big, write folds to disk and upload file using h2o.importFile

	# Refer to columns by name rather than by position, so that the response is
	# the factor `Y` prepared above and neither the cluster id nor the fold id
	# is ever used as a predictor.
	fit <- h2o.deeplearning(
	  x = setdiff(names(df), c("id", "Y", "fold_id")),
	  y = "Y",
	  training_frame = train,
	  fold_column = "fold_id"
	)

	# Get CV metrics
	fit@model$cross_validation_metrics