Skip to content

Instantly share code, notes, and snippets.

View ledell's full-sized avatar
💭
Check out H2O AutoML: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html

Erin LeDell ledell

💭
Check out H2O AutoML: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html
View GitHub Profile
@ledell
ledell / package_gender.R
Last active August 29, 2015 14:11
Quick estimate of gender distribution of CRAN package maintainers
library(miniCRAN)
library(gender)
library(stringr)
# Get package description data
# Building the description table took about an hour to run, so the two calls
# that produce it are kept below (commented out) for reference only; a saved
# CSV copy of the result is loaded directly instead.
# pkgs <- available.packages("http://cran.rstudio.com/src/contrib")
# desc <- getCranDescription(pkgs, repos = c(CRAN="http://cran.rstudio.com"))
desc <- read.csv("http://www.stat.berkeley.edu/~ledell/data/RStudioCRAN_pkgDesc_20141216.csv")
library(h2o)
# Start up an H2O cluster; nthreads = -1 means nthreads = ncores
localH2O <- h2o.init(nthreads = -1)

# Get training data: import the CSV into H2O under the key "breast_cancer"
data <- h2o.importFile(
  path = "http://www.stat.berkeley.edu/~ledell/data/wisc-diag-breast-cancer-shuffled.csv",
  destination_frame = "breast_cancer"
)

# Response column
y <- "diagnosis"

# Predictor columns: everything except the response and the 'id' column
x <- setdiff(names(data), c(y, "id"))
@ledell
ledell / install_h2o_simons.R
Created August 18, 2015 00:06
Install h2o R package (Simons release)
# Also available here: http://h2o-release.s3.amazonaws.com/h2o/rel-simons/7/index.html#R
# The following two commands remove any previously installed H2O packages for R.
# Detach first so the package is not attached while it is being removed.
if ("package:h2o" %in% search()) {
  detach("package:h2o", unload = TRUE)
}
# system.file(package = ) returns "" when the package cannot be found. It is
# used instead of installed.packages(), which scans every installed package
# and is documented as unsuitable for checking a single named package.
if (nzchar(system.file(package = "h2o"))) {
  remove.packages("h2o")
}
# Next, we download packages that H2O depends on.
# Each dependency is installed only when it is not already available.
for (pkg in c("methods", "statmod", "stats")) {
  if (!nzchar(system.file(package = pkg))) {
    install.packages(pkg)
  }
}
@ledell
ledell / h2o_ensemble_higgs_demo.R
Last active January 27, 2018 12:09
h2oEnsemble R package demo (from h2o.ensemble docs)
# Small-data binary classification example using H2O Ensemble on a local machine
library(h2oEnsemble)

# Start an H2O cluster with nthreads = num cores on your machine
localH2O <- h2o.init(nthreads = -1)

# Import a sample binary outcome train/test set into R (comma-separated files)
train <- read.table(
  file = "http://www.stat.berkeley.edu/~ledell/data/higgs_10k.csv",
  sep = ","
)
test <- read.table(
  file = "http://www.stat.berkeley.edu/~ledell/data/higgs_test_5k.csv",
  sep = ","
)
@ledell
ledell / install_h2oEnsemble.R
Created August 18, 2015 01:04
Install h2oEnsemble package from GitHub
# Attach devtools, which provides install_github(), then install the
# h2oEnsemble package from the GitHub path given below.
library(devtools)
install_github("h2oai/h2o-3/h2o-r/ensemble/h2oEnsemble-package")
# Update for SuperLearner::CVFolds function that enables stratification by outcome and cluster ID
CVFolds2 <- function (N, id, Y, cvControl) {
if (!is.null(cvControl$validRows)) {
return(cvControl$validRows)
}
stratifyCV <- cvControl$stratifyCV
shuffle <- cvControl$shuffle
V <- cvControl$V
if (!stratifyCV) { ### Not Stratified
# Example of how to train an H2O model with folds that are
# stratified both by outcome and a cluster id
library(cvAUC)
# Load the "adherence" dataset and keep it under the working name `df`
data("adherence") #load a dataset with an ID column
df <- adherence
# Load a utility function for creating stratified folds
# NOTE(review): the raw gist URL contains a revision hash, so presumably a
# fixed version of CVFolds2.R is fetched on every run -- confirm
source("https://gist.githubusercontent.com/ledell/bd4e227d4e5ff426c41d/raw/708eb429fa1954a140d65a6a42ce93847affd67c/CVFolds2.R") #utility function
library(h2o)
# Start H2O before importing any data
h2o.init(nthreads = -1) # This means nthreads = num available cores
# Locations of the gzipped MNIST train and test CSV files
train_file <- "https://h2o-public-test-data.s3.amazonaws.com/bigdata/laptop/mnist/train.csv.gz"
test_file <- "https://h2o-public-test-data.s3.amazonaws.com/bigdata/laptop/mnist/test.csv.gz"
# Import both files into H2O
train <- h2o.importFile(train_file)
test <- h2o.importFile(test_file)
# To see a brief summary of the data, run the following command
### Lending Club example using cleaned up dataset & h2o.ensemble ###
library(h2o)
# Start H2O with nthreads = -1 and max_mem_size set to "8G"
h2o.init(nthreads = -1, max_mem_size = "8G")
# Location of the loan CSV file
loan_csv <- "https://raw.githubusercontent.com/h2oai/app-consumer-loan/master/data/loan.csv"
data <- h2o.importFile(loan_csv) # 163994 x 15
# Convert the bad_loan column to a factor
data$bad_loan <- as.factor(data$bad_loan)
# Seeded call to h2o.runif() on `data`, so the result is reproducible
# NOTE(review): presumably used further down to split the rows -- confirm
rand <- h2o.runif(data, seed = 1)
# Install command for the h2o package, kept commented out for reference:
# install.packages("h2o", type="source", repos=(c("http://h2o-release.s3.amazonaws.com/h2o/rel-slater/5/R")))
library(h2o)
# Connect to H2O at the given ip and port
# NOTE(review): "XX.XX.XX.XX" looks like a placeholder to be replaced with the
# real address before running -- confirm
localH2O <- h2o.init(ip = "XX.XX.XX.XX", port = 54321)
# Alternative import from a local copy of the file, kept commented out:
#higgs <- h2o.importFile("/home/0xdiag/datasets/higgs/HIGGS.csv", destination_frame = "higgs") #Local copy
# Import the gzipped HIGGS CSV from the UCI archive under the key "higgs"
higgs <- h2o.importFile("http://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz", destination_frame = "higgs")
dim(higgs) #11M x 29
higgs$C1 <- as.factor(higgs$C1) #Encode response as categorical