Erin LeDell ledell

## package_gender.R
library(miniCRAN)
library(gender)
library(stringr)

# Get package description data.
# Building `desc` from CRAN (the two commented-out calls below) took about an
# hour to run, so a previously saved CSV copy is read directly instead.
# pkgs <- available.packages("http://cran.rstudio.com/src/contrib")
# desc <- getCranDescription(pkgs, repos = c(CRAN="http://cran.rstudio.com"))
# NOTE(review): the file name suggests a snapshot dated 2014-12-16 -- confirm.
desc <- read.csv("http://www.stat.berkeley.edu/~ledell/data/RStudioCRAN_pkgDesc_20141216.csv")

## install_h2oEnsemble.R
# Attach devtools so that install_github() is available.
library(devtools)
# Install the h2oEnsemble package from GitHub; the path after "h2oai/h2o-3"
# points at the package's subdirectory inside that repository.
install_github("h2oai/h2o-3/h2o-r/ensemble/h2oEnsemble-package")

## CVFolds2.R
# Update for SuperLearner::CVFolds function that enables stratification by outcome and cluster ID

CVFolds2 <- function (N, id, Y, cvControl) {
    if (!is.null(cvControl$validRows)) {
        return(cvControl$validRows)
    }
    stratifyCV <- cvControl$stratifyCV
    shuffle <- cvControl$shuffle
    V <- cvControl$V
    if (!stratifyCV) {  ### Not Stratified

## train_h2o_cluster_stratified_folds.R
# Example of how to train an H2O model with folds that are
# stratified both by outcome and a cluster id

# cvAUC is attached before the data("adherence") call below.
# NOTE(review): presumably cvAUC is the package that ships `adherence` -- confirm.
library(cvAUC)

data("adherence")  # load a dataset with an ID column
df <- adherence  # work with the dataset under the name `df`

# Load a utility function for creating stratified folds.
# The script is sourced from a remote gist URL, so this step needs network
# access and executes whatever code that URL returns.
# NOTE(review): the long hash in the URL appears to pin a specific gist
# revision -- confirm before changing it.
source("https://gist.githubusercontent.com/ledell/bd4e227d4e5ff426c41d/raw/708eb429fa1954a140d65a6a42ce93847affd67c/CVFolds2.R")  #utility function

## h2o_deeplearning_gridsearch_mnist_example.R
library(h2o)
h2o.init(nthreads = -1)  # This means nthreads = num available cores

# Remote locations of the gzipped MNIST train and test CSV files
train_file <- "https://h2o-public-test-data.s3.amazonaws.com/bigdata/laptop/mnist/train.csv.gz"
test_file <- "https://h2o-public-test-data.s3.amazonaws.com/bigdata/laptop/mnist/test.csv.gz"

# Import each file with h2o.importFile(); results are kept as `train` and `test`
train <- h2o.importFile(train_file)
test <- h2o.importFile(test_file)

# To see a brief summary of the data, run the following command
# NOTE(review): the command this comment refers to is not present in this excerpt.

## install_h2o_slater.R
# Start from a clean slate: if the h2o package is currently attached, detach
# and unload it; if it is installed, remove it.
if ("package:h2o" %in% search()) {
  detach("package:h2o", unload = TRUE)
}
if ("h2o" %in% rownames(installed.packages())) {
  remove.packages("h2o")
}

# Next, install any package that H2O depends on and that is not already
# present. The installed-package list is re-read before every check, and the
# packages are visited in the order listed.
invisible(lapply(
  c("methods", "statmod", "stats", "graphics", "RCurl"),
  function(dep) {
    if (!(dep %in% rownames(installed.packages()))) {
      install.packages(dep)
    }
  }
))

## wisc_diag_breast_cancer_h2o_demo.R
library(h2o)
localH2O <- h2o.init(nthreads = -1)  #Start up H2O cluster using nthreads = ncores


# Get training data:
# The CSV is imported from a remote URL; destination_frame asks for the
# resulting frame to be stored under the key "breast_cancer".
data <- h2o.importFile("http://www.stat.berkeley.edu/~ledell/data/wisc-diag-breast-cancer-shuffled.csv",
  destination_frame = "breast_cancer")
y <- "diagnosis"  #Response column
# Predictors: every column name in `data` except the response and "id"
x <- setdiff(names(data), c(y, "id"))  #remove 'id' and response col

## cvpreds.R
# Extract cross-validated predicted values (in order of original rows)
h2o.cvpreds <- function(object, single_col = TRUE) {

  # TO DO: Check that object is an H2OModel
  # TO DO: Check that keep_cross_validation_predictions = TRUE in the model
  # TO DO: Need to add support for returning a multiclass prediction and binary (full frame: predict, p0, p1)
  # TO DO: Remove family variable and just check class(object) directly

  # Need to extract family from model object
  if (class(object) == "H2OBinomialModel") family <- "binomial"

## stack.R
# NOTE: This is now part of h2oEnsemble R package and should be used from there instead

# Given a list of H2O models, ensemble the base learners using a metalearner (Stacking / Super Learning)

# Currently requires:
#source("cvpreds.R")


h2o.stack <- function(models,  #list of H2OModels
                      metalearner = "h2o.glm.wrapper",

## stack.py
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator


def make_Z(models):
	'''
	Takes a list of models and creates level-one data
	'''
	library(miniCRAN)
	library(gender)
	library(stringr)

	# Get package description data
	# This took about an hour to run, so you can load the data directly below
	# pkgs <- available.packages("http://cran.rstudio.com/src/contrib")
	# desc <- getCranDescription(pkgs, repos = c(CRAN="http://cran.rstudio.com"))
	desc <- read.csv("http://www.stat.berkeley.edu/~ledell/data/RStudioCRAN_pkgDesc_20141216.csv")
	library(devtools)
	install_github("h2oai/h2o-3/h2o-r/ensemble/h2oEnsemble-package")
	# Update for SuperLearner::CVFolds function that enables stratification by outcome and cluster ID

	CVFolds2 <- function (N, id, Y, cvControl) {
	if (!is.null(cvControl$validRows)) {
	return(cvControl$validRows)
	}
	stratifyCV <- cvControl$stratifyCV
	shuffle <- cvControl$shuffle
	V <- cvControl$V
	if (!stratifyCV) { ### Not Stratified
	# Example of how to train an H2O model with folds that are
	# stratified both by outcome and a cluster id

	library(cvAUC)

	data("adherence") #load a dataset with an ID column
	df <- adherence

	# Load a utility function for creating stratified folds
	source("https://gist.githubusercontent.com/ledell/bd4e227d4e5ff426c41d/raw/708eb429fa1954a140d65a6a42ce93847affd67c/CVFolds2.R") #utility function
	library(h2o)
	h2o.init(nthreads = -1) # This means nthreads = num available cores

	train_file <- "https://h2o-public-test-data.s3.amazonaws.com/bigdata/laptop/mnist/train.csv.gz"
	test_file <- "https://h2o-public-test-data.s3.amazonaws.com/bigdata/laptop/mnist/test.csv.gz"

	train <- h2o.importFile(train_file)
	test <- h2o.importFile(test_file)

	# To see a brief summary of the data, run the following command
	# The following two commands remove any previously installed H2O packages for R.
	if ("package:h2o" %in% search()) { detach("package:h2o", unload=TRUE) }
	if ("h2o" %in% rownames(installed.packages())) { remove.packages("h2o") }

	# Next, we download packages that H2O depends on.
	if (! ("methods" %in% rownames(installed.packages()))) { install.packages("methods") }
	if (! ("statmod" %in% rownames(installed.packages()))) { install.packages("statmod") }
	if (! ("stats" %in% rownames(installed.packages()))) { install.packages("stats") }
	if (! ("graphics" %in% rownames(installed.packages()))) { install.packages("graphics") }
	if (! ("RCurl" %in% rownames(installed.packages()))) { install.packages("RCurl") }
	library(h2o)
	localH2O <- h2o.init(nthreads = -1) #Start up H2O cluster using nthreads = ncores


	# Get training data:
	data <- h2o.importFile("http://www.stat.berkeley.edu/~ledell/data/wisc-diag-breast-cancer-shuffled.csv",
	destination_frame = "breast_cancer")
	y <- "diagnosis" #Response column
	x <- setdiff(names(data), c(y, "id")) #remove 'id' and response col
	# Extract cross-validated predicted values (in order of original rows)
	h2o.cvpreds <- function(object, single_col = TRUE) {

	# TO DO: Check that object is an H2OModel
	# TO DO: Check that keep_cross_validation_predictions = TRUE in the model
	# TO DO: Need to add support for returning a multiclass prediction and binary (full frame: predict, p0, p1)
	# TO DO: Remove family variable and just check class(object) directly

	# Need to extract family from model object
	if (class(object) == "H2OBinomialModel") family <- "binomial"
	# NOTE: This is now part of h2oEnsemble R package and should be used from there instead

	# Given a list of H2O models, ensemble the base learners using a metalearner (Stacking / Super Learning)

	# Currently requires:
	#source("cvpreds.R")


	h2o.stack <- function(models, #list of H2OModels
	metalearner = "h2o.glm.wrapper",
	import h2o
	from h2o.estimators.glm import H2OGeneralizedLinearEstimator



	def make_Z(models):
	'''
	Takes a list of models and creates level-one data
	'''