ledell/stack.R

## stack.R
# NOTE: This is now part of h2oEnsemble R package and should be used from there instead

# Given a list of H2O models, ensemble the base learners usig a metalearner (Stacking / Super Learning)

# Currently requires:
#source("cvpreds.R")


h2o.stack <- function(models,  #list of H2OModels
                      metalearner = "h2o.glm.wrapper",
                      response_frame,  #must pass the response col as an H2OFrame
                      cvpreds_frame = NULL,
                      # maybe add a family argument (to overrule),
                      seed = 1,
                      keep_levelone_data = TRUE) {

  # models: list of (supervised) H2OModel objects saved using with keep_levelone_data = TRUE and identical fold_column
  # metalearner: the metalearning algorithm
  # response_frame: the response column, passed as an H2OFrame (coltype should be correct)
  # cvpreds_frame: the level-one cross-validated predicted values (# col == # length(model))

  # Check dimensions of response_frame
  if (!(ncol(response_frame) == 1)) {
    stop("response_frame must have 1 column that contains the original response, y, used to train base models")
  }
  # cvpreds_frame = NULL assumes that models[[l]]@model$cross_validated_predictions exists in memory (not null)
  # Add something about supporting a cvpreds_frame

  # Assure that these have the same # of folds
  nfold_sync <- length(unique(sapply(models, function(mm) mm@allparameters$nfolds)))
  if (nfold_sync != 1) {
    stop("All models must have used the same number of cv folds specified by the `nfolds` argument")
  }
  # Assure that these used the same folds (check for modulo; TO DO: add support for arbitrary folds)
  fold_sync <- prod(sapply(models, function(mm) mm@allparameters$fold_assignment == "Modulo"))
  if (fold_sync != 1) {
    stop("All models must have used fold_assingment == Modulo")
  }
  # Assure that these used the same response column (is this too restrictive?)
  y_sync <- length(unique(sapply(models, function(mm) mm@allparameters$y)))
  if (y_sync != 1) {
    stop("All models must have used response variable, y")
  }
  # Collect the superset of x colnames
  # TO DO: Maybe add some more checking here to ensure compatibilty
  x <- unique(unlist(sapply(models, function(mm) mm@allparameters$x, simplify = FALSE)))
  y <- models[[1]]@allparameters$y

  # TO DO: Clean this up
  class_sync1 <- prod(sapply(models, function(mm) class(mm) == "H2OBinomialModel"))
  class_sync2 <- prod(sapply(models, function(mm) class(mm) == "H2OMultinomialModel"))
  class_sync3 <- prod(sapply(models, function(mm) class(mm) == "H2ORegressionModel"))
  class_sync <- class_sync1 + class_sync2 + class_sync3
  if (class_sync != 1) {
    stop("All models must have the same class type; one of: H2OBinomialModel, H2OMultinomialModel, H2ORegressionModel")
  }
  if (class_sync1 == 1) {
   family <- "binomial"
   ylim <- c(min(response_frame[,y]), max(response_frame[,y]))  #Used to enforce bounds
  } else if (class_sync2 == 1) {
   family <- "multinomial"
   stop("Multinomial case not yet supported in H2O Ensemble")
  } else {
   family <- "gaussian"
   ylim <- NULL
  }
#   family <- models[[1]]@model$model_summary$family
#   if (family == "binomial") {
#     ylim <- c(min(response_frame[,y]), max(response_frame[,y]))  #Used to enforce bounds
#   } else if (family == "gaussian") {
#     ylim <- NULL
#   } else if (family == "multinomial") {
#     stop("Multinomial case not yet supported in H2O Ensemble")
#   }
  runtime <- list()

  # Build the levelone data using the cross_validated_predictions of each H2O Model
  if (!is.null(cvpreds_frame)) {
    Z <- cvpreds_frame
    # TO DO: Check dimensions and other properties of cvpreds_frame for correctness
  } else {
    Z <- do.call("h2o.cbind", sapply(models, h2o.cvpreds, simplify = FALSE))
  }
  levelone <- h2o.cbind(Z, response_frame)
  learner <- sapply(models, function(mm) mm@model_id)
  names(models) <- learner
  names(levelone) <- c(learner, "y")
  N <- nrow(levelone)
  V <- models[[1]]@allparameters$nfolds  #Number of internal CV folds

  # Validate metalearner argument
  if (length(metalearner)>1 | !is.character(metalearner) | !exists(metalearner)) {
    stop("The 'metalearner' argument must be a string, specifying the name of a base learner wrapper function.")
  }
  if (!exists(metalearner)) {
    stop("'metalearner' function name not found.")
  }

  # What type of metalearning function do we have?
  # The h2o version is memory-optimized (the N x L level-one matrix, Z, never leaves H2O memory);
  # SuperLearner metalearners provide additional metalearning algos, but has a much bigger memory footprint
  if (grepl("^SL.", metalearner)) {
    metalearner_type <- "SuperLearner"
  } else if (grepl("^h2o.", metalearner)){
    metalearner_type <- "h2o"
  }

  # TO DO: Maybe remove this:
  # Do we want to rename this to `fit`? or overwrite original object...
  # Currently making a copy of the model
  #fit <- object

  # Metalearning: Regress y onto Z to learn optimal combination of base models
  print("Metalearning")
  if (is.numeric(seed)) set.seed(seed)  #If seed given, set seed prior to next step
  if (grepl("^SL.", metalearner)) {
    # SuperLearner package metalearner
    if (is.character(family)) {
      familyFun <- get(family, mode = "function", envir = parent.frame())
      #print(familyFun$family)  #does not work for SL.glmnet
    }
    # SL metalearner functions require pulling levelone data back into R memory (not recommended for big datasets)
    Zdf <- as.data.frame(levelone)[,-ncol(levelone)]  #TO DO: untested; levelone is Z,y
    Y <- as.data.frame(levelone)[,"y"]  # TO DO: untested
    runtime$metalearning <- system.time(metafit <- match.fun(metalearner)(Y = Y,
                                                                          X = Zdf,
                                                                          newX = Zdf,
                                                                          family = familyFun,
                                                                          id = seq(N),
                                                                          obsWeights = rep(1,N)), gcFirst = FALSE)
  } else {
    # H2O metalearner
    runtime$metalearning <- system.time(metafit <- match.fun(metalearner)(x = learner,
                                                                          y = "y",
                                                                          training_frame = levelone,   #levelone := cbind(Z, y)
                                                                          validation_frame = NULL,
                                                                          family = family), gcFirst = FALSE)
  }

  if (!keep_levelone_data) {
    levelone <- NULL
  }

  # Ensemble model
  out <- list(x = x,
              y = y, # should we add y to the input so multiple responses can be stored?
              family = family,
              learner = learner,
              metalearner = metalearner,
              cvControl = list(V = V, shuffle = FALSE),
              folds = rep(1:V, N),  # currently must be modulo V... is this 1:V or 0:(V-1)??
              ylim = ylim,
              seed = seed,
              parallel = "seq",
              basefits = models,
              metafit = metafit,
              levelone = levelone,  #levelone = cbind(Z, y)
              runtime = runtime,
              h2o_version = packageVersion(pkg = "h2o"),
              h2oEnsemble_version = packageVersion(pkg = "h2oEnsemble"))
  class(out) <- "h2o.ensemble"
  return(out)
}
	# NOTE: This is now part of h2oEnsemble R package and should be used from there instead

	# Given a list of H2O models, ensemble the base learners usig a metalearner (Stacking / Super Learning)

	# Currently requires:
	#source("cvpreds.R")


	h2o.stack <- function(models, #list of H2OModels
	metalearner = "h2o.glm.wrapper",
	response_frame, #must pass the response col as an H2OFrame
	cvpreds_frame = NULL,
	# maybe add a family argument (to overrule),
	seed = 1,
	keep_levelone_data = TRUE) {

	# models: list of (supervised) H2OModel objects saved using with keep_levelone_data = TRUE and identical fold_column
	# metalearner: the metalearning algorithm
	# response_frame: the response column, passed as an H2OFrame (coltype should be correct)
	# cvpreds_frame: the level-one cross-validated predicted values (# col == # length(model))

	# Check dimensions of response_frame
	if (!(ncol(response_frame) == 1)) {
	stop("response_frame must have 1 column that contains the original response, y, used to train base models")
	}
	# cvpreds_frame = NULL assumes that models[[l]]@model$cross_validated_predictions exists in memory (not null)
	# Add something about supporting a cvpreds_frame

	# Assure that these have the same # of folds
	nfold_sync <- length(unique(sapply(models, function(mm) mm@allparameters$nfolds)))
	if (nfold_sync != 1) {
	stop("All models must have used the same number of cv folds specified by the `nfolds` argument")
	}
	# Assure that these used the same folds (check for modulo; TO DO: add support for arbitrary folds)
	fold_sync <- prod(sapply(models, function(mm) mm@allparameters$fold_assignment == "Modulo"))
	if (fold_sync != 1) {
	stop("All models must have used fold_assingment == Modulo")
	}
	# Assure that these used the same response column (is this too restrictive?)
	y_sync <- length(unique(sapply(models, function(mm) mm@allparameters$y)))
	if (y_sync != 1) {
	stop("All models must have used response variable, y")
	}
	# Collect the superset of x colnames
	# TO DO: Maybe add some more checking here to ensure compatibilty
	x <- unique(unlist(sapply(models, function(mm) mm@allparameters$x, simplify = FALSE)))
	y <- models[[1]]@allparameters$y

	# TO DO: Clean this up
	class_sync1 <- prod(sapply(models, function(mm) class(mm) == "H2OBinomialModel"))
	class_sync2 <- prod(sapply(models, function(mm) class(mm) == "H2OMultinomialModel"))
	class_sync3 <- prod(sapply(models, function(mm) class(mm) == "H2ORegressionModel"))
	class_sync <- class_sync1 + class_sync2 + class_sync3
	if (class_sync != 1) {
	stop("All models must have the same class type; one of: H2OBinomialModel, H2OMultinomialModel, H2ORegressionModel")
	}
	if (class_sync1 == 1) {
	family <- "binomial"
	ylim <- c(min(response_frame[,y]), max(response_frame[,y])) #Used to enforce bounds
	} else if (class_sync2 == 1) {
	family <- "multinomial"
	stop("Multinomial case not yet supported in H2O Ensemble")
	} else {
	family <- "gaussian"
	ylim <- NULL
	}
	# family <- models[[1]]@model$model_summary$family
	# if (family == "binomial") {
	# ylim <- c(min(response_frame[,y]), max(response_frame[,y])) #Used to enforce bounds
	# } else if (family == "gaussian") {
	# ylim <- NULL
	# } else if (family == "multinomial") {
	# stop("Multinomial case not yet supported in H2O Ensemble")
	# }
	runtime <- list()

	# Build the levelone data using the cross_validated_predictions of each H2O Model
	if (!is.null(cvpreds_frame)) {
	Z <- cvpreds_frame
	# TO DO: Check dimensions and other properties of cvpreds_frame for correctness
	} else {
	Z <- do.call("h2o.cbind", sapply(models, h2o.cvpreds, simplify = FALSE))
	}
	levelone <- h2o.cbind(Z, response_frame)
	learner <- sapply(models, function(mm) mm@model_id)
	names(models) <- learner
	names(levelone) <- c(learner, "y")
	N <- nrow(levelone)
	V <- models[[1]]@allparameters$nfolds #Number of internal CV folds

	# Validate metalearner argument
	if (length(metalearner)>1 \| !is.character(metalearner) \| !exists(metalearner)) {
	stop("The 'metalearner' argument must be a string, specifying the name of a base learner wrapper function.")
	}
	if (!exists(metalearner)) {
	stop("'metalearner' function name not found.")
	}

	# What type of metalearning function do we have?
	# The h2o version is memory-optimized (the N x L level-one matrix, Z, never leaves H2O memory);
	# SuperLearner metalearners provide additional metalearning algos, but has a much bigger memory footprint
	if (grepl("^SL.", metalearner)) {
	metalearner_type <- "SuperLearner"
	} else if (grepl("^h2o.", metalearner)){
	metalearner_type <- "h2o"
	}

	# TO DO: Maybe remove this:
	# Do we want to rename this to `fit`? or overwrite original object...
	# Currently making a copy of the model
	#fit <- object

	# Metalearning: Regress y onto Z to learn optimal combination of base models
	print("Metalearning")
	if (is.numeric(seed)) set.seed(seed) #If seed given, set seed prior to next step
	if (grepl("^SL.", metalearner)) {
	# SuperLearner package metalearner
	if (is.character(family)) {
	familyFun <- get(family, mode = "function", envir = parent.frame())
	#print(familyFun$family) #does not work for SL.glmnet
	}
	# SL metalearner functions require pulling levelone data back into R memory (not recommended for big datasets)
	Zdf <- as.data.frame(levelone)[,-ncol(levelone)] #TO DO: untested; levelone is Z,y
	Y <- as.data.frame(levelone)[,"y"] # TO DO: untested
	runtime$metalearning <- system.time(metafit <- match.fun(metalearner)(Y = Y,
	X = Zdf,
	newX = Zdf,
	family = familyFun,
	id = seq(N),
	obsWeights = rep(1,N)), gcFirst = FALSE)
	} else {
	# H2O metalearner
	runtime$metalearning <- system.time(metafit <- match.fun(metalearner)(x = learner,
	y = "y",
	training_frame = levelone, #levelone := cbind(Z, y)
	validation_frame = NULL,
	family = family), gcFirst = FALSE)
	}

	if (!keep_levelone_data) {
	levelone <- NULL
	}

	# Ensemble model
	out <- list(x = x,
	y = y, # should we add y to the input so multiple responses can be stored?
	family = family,
	learner = learner,
	metalearner = metalearner,
	cvControl = list(V = V, shuffle = FALSE),
	folds = rep(1:V, N), # currently must be modulo V... is this 1:V or 0:(V-1)??
	ylim = ylim,
	seed = seed,
	parallel = "seq",
	basefits = models,
	metafit = metafit,
	levelone = levelone, #levelone = cbind(Z, y)
	runtime = runtime,
	h2o_version = packageVersion(pkg = "h2o"),
	h2oEnsemble_version = packageVersion(pkg = "h2oEnsemble"))
	class(out) <- "h2o.ensemble"
	return(out)
	}