Erin LeDell ledell

## install_h2o_slater.R
# Start from a clean slate: detach h2o if it is currently attached, then
# uninstall it if a copy is present among the installed packages.
if (is.element("package:h2o", search())) {
  detach("package:h2o", unload = TRUE)
}
if (is.element("h2o", rownames(installed.packages()))) {
  remove.packages("h2o")
}

# Install whichever of the packages H2O depends on are missing, in the
# original order. The installed-package list is re-read for every entry.
invisible(lapply(
  c("methods", "statmod", "stats", "graphics", "RCurl"),
  function(dep) {
    if (!is.element(dep, rownames(installed.packages()))) {
      install.packages(dep)
    }
  }
))

## stack.R
# NOTE: This is now part of h2oEnsemble R package and should be used from there instead

# Given a list of H2O models, ensemble the base learners using a metalearner (Stacking / Super Learning)

# Currently requires:
#source("cvpreds.R")


h2o.stack <- function(models,  #list of H2OModels
                      metalearner = "h2o.glm.wrapper",

## cvpreds.R
# Extract cross-validated predicted values (in order of original rows)
h2o.cvpreds <- function(object, single_col = TRUE) {

  # TO DO: Check that object is an H2OModel
  # TO DO: Check that keep_cross_validation_predictions = TRUE in the model
  # TO DO: Need to add support for returning a multiclass prediction and binary (full frame: predict, p0, p1)
  # TO DO: Remove family variable and just check class(object) directly

  # Need to extract family from model object
  if (class(object) == "H2OBinomialModel") family <- "binomial"

## h2o_stacking_example.R
# Train 4 models and ensemble them together with new h2o.stack function.

# Requirements: Models must be same type of model H2OBinomial, etc
# Must have same outcome
# Must have used `fold_assignment = "Modulo"` and same number for `nfolds`,
# or identical `fold_column` must be used to guarantee same folds between base models

# Requires: cvpreds.R and stack.R
# source() is applied to each gist URL in order (cvpreds.R first, then
# stack.R); the return values are discarded.
invisible(lapply(
  c(
    "https://gist.githubusercontent.com/ledell/f3a87bd136ce06e0a5ff/raw/2a82535892ff66694a1a401de46b8b5a92820849/cvpreds.R",
    "https://gist.githubusercontent.com/ledell/f389ac1e9c6e7000b299/raw/6bc1d2c9cfe1a51ffcdcf79cf184e80a40d4828f/stack.R"
  ),
  source
))

## stack.py
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator


def make_Z(models):
	'''
	Takes a list of models and creates level-one data
	'''

## h2o_stacking_example.py
# This example is outdated because we have the H2O Stacked Ensemble function now (so it's better to use that):
# http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/stacked-ensembles.html

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from sklearn import metrics  #will be replaced with ensemble_performance later

## h2o-stacked-ensemble-demo.py
# A `from __future__` import must come before every other statement of the
# script (only comments, blank lines and a docstring may precede it); placed
# after the regular imports it is rejected with a SyntaxError.
from __future__ import print_function

import h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from h2o.grid.grid_search import H2OGridSearch


# Initialise H2O before any frames or models are created.
# NOTE(review): nthreads=-1 presumably means "use all available cores" -- confirm against the h2o.init docs.
h2o.init(nthreads=-1)

## h2o-stacked-ensemble-demo.R
# Demo setup: attach h2o, initialise it, import the train/test CSVs and
# pick the response and predictor column names.
library(h2o)
# NOTE(review): nthreads = -1 presumably means "use all available cores" -- confirm against the h2o.init docs
h2o.init(nthreads = -1)

# Import a sample binary outcome train/test set into H2O
train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")

# Identify predictors and response
# y is the response column name; x is every column name of `train` except y.
y <- "response"
x <- setdiff(names(train), y)

## h2o_rf_sigopt_demo_iris.R
# Set API Key
# The token is stored in the SIGOPT_API_TOKEN environment variable of this R session.
# NOTE(review): "HERE" looks like a placeholder to be replaced with a real token -- confirm.
Sys.setenv(SIGOPT_API_TOKEN="HERE")

# Start a local H2O cluster for training models
# NOTE(review): nthreads = -1 presumably means "use all available cores" -- confirm against the h2o.init docs
library(h2o)
h2o.init(nthreads = -1)

# Load a dataset
# data(iris) makes the `iris` data set available in the workspace.
# y holds the column name "Species"; presumably the response, as in the other
# snippets -- its use is outside this excerpt.
data(iris)
y <- "Species"

## h2o_xgboost_grid_example.R
# Setup: attach h2o, initialise it with default arguments, import the
# train/test CSVs and define the response/predictor names.
library(h2o)
h2o.init()


# Load the HIGGS dataset
train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
# y is the response column name; x is every column name of `train` except y.
y <- "response"
x <- setdiff(names(train), y)
# NOTE(review): `family` is presumably passed to later model-fitting calls -- its use is outside this excerpt.
family <- "binomial"
	# Start from a clean slate: detach h2o if it is currently attached, then
	# uninstall it if a copy is present among the installed packages.
	if (is.element("package:h2o", search())) {
	  detach("package:h2o", unload = TRUE)
	}
	if (is.element("h2o", rownames(installed.packages()))) {
	  remove.packages("h2o")
	}

	# Install whichever of the packages H2O depends on are missing, in the
	# original order. The installed-package list is re-read for every entry.
	invisible(lapply(
	  c("methods", "statmod", "stats", "graphics", "RCurl"),
	  function(dep) {
	    if (!is.element(dep, rownames(installed.packages()))) {
	      install.packages(dep)
	    }
	  }
	))
	# NOTE: This is now part of h2oEnsemble R package and should be used from there instead

	# Given a list of H2O models, ensemble the base learners using a metalearner (Stacking / Super Learning)

	# Currently requires:
	#source("cvpreds.R")


	h2o.stack <- function(models, #list of H2OModels
	metalearner = "h2o.glm.wrapper",
	# Extract cross-validated predicted values (in order of original rows)
	h2o.cvpreds <- function(object, single_col = TRUE) {

	# TO DO: Check that object is an H2OModel
	# TO DO: Check that keep_cross_validation_predictions = TRUE in the model
	# TO DO: Need to add support for returning a multiclass prediction and binary (full frame: predict, p0, p1)
	# TO DO: Remove family variable and just check class(object) directly

	# Need to extract family from model object
	if (class(object) == "H2OBinomialModel") family <- "binomial"
	# Train 4 models and ensemble them together with new h2o.stack function.

	# Requirements: Models must be same type of model H2OBinomial, etc
	# Must have same outcome
	# Must have used `fold_assignment = "Modulo"` and same number for `nfolds`,
	# or identical `fold_column` must be used to guarantee same folds between base models

	# Requires: cvpreds.R and stack.R
	# source() is applied to each gist URL in order (cvpreds.R first, then
	# stack.R); the return values are discarded.
	invisible(lapply(
	  c(
	    "https://gist.githubusercontent.com/ledell/f3a87bd136ce06e0a5ff/raw/2a82535892ff66694a1a401de46b8b5a92820849/cvpreds.R",
	    "https://gist.githubusercontent.com/ledell/f389ac1e9c6e7000b299/raw/6bc1d2c9cfe1a51ffcdcf79cf184e80a40d4828f/stack.R"
	  ),
	  source
	))
	import h2o
	from h2o.estimators.glm import H2OGeneralizedLinearEstimator



	def make_Z(models):
	'''
	Takes a list of models and creates level-one data
	'''
	# This example is outdated because we have the H2O Stacked Ensemble function now (so it's better to use that):
	# http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/stacked-ensembles.html

	import h2o
	from h2o.estimators.gbm import H2OGradientBoostingEstimator
	from h2o.estimators.deeplearning import H2ODeepLearningEstimator
	from h2o.estimators.glm import H2OGeneralizedLinearEstimator
	from h2o.estimators.random_forest import H2ORandomForestEstimator
	from sklearn import metrics #will be replaced with ensemble_performance later
	# Demo setup: attach h2o, initialise it, import the train/test CSVs and
	# pick the response and predictor column names.
	library(h2o)
	# NOTE(review): nthreads = -1 presumably means "use all available cores" -- confirm against the h2o.init docs
	h2o.init(nthreads = -1)

	# Import a sample binary outcome train/test set into H2O
	train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
	test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")

	# Identify predictors and response
	# y is the response column name; x is every column name of `train` except y.
	y <- "response"
	x <- setdiff(names(train), y)
	# Set API Key
	# The token is stored in the SIGOPT_API_TOKEN environment variable of this R session.
	# NOTE(review): "HERE" looks like a placeholder to be replaced with a real token -- confirm.
	Sys.setenv(SIGOPT_API_TOKEN="HERE")

	# Start a local H2O cluster for training models
	# NOTE(review): nthreads = -1 presumably means "use all available cores" -- confirm against the h2o.init docs
	library(h2o)
	h2o.init(nthreads = -1)

	# Load a dataset
	# data(iris) makes the `iris` data set available in the workspace.
	# y holds the column name "Species"; presumably the response, as in the other
	# snippets -- its use is outside this excerpt.
	data(iris)
	y <- "Species"
	# Setup: attach h2o, initialise it with default arguments, import the
	# train/test CSVs and define the response/predictor names.
	library(h2o)
	h2o.init()


	# Load the HIGGS dataset
	train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
	test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
	# y is the response column name; x is every column name of `train` except y.
	y <- "response"
	x <- setdiff(names(train), y)
	# NOTE(review): `family` is presumably passed to later model-fitting calls -- its use is outside this excerpt.
	family <- "binomial"