Skip to content

Instantly share code, notes, and snippets.

@szilard
Last active August 29, 2015 14:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save szilard/e5130287ea0ec9330457 to your computer and use it in GitHub Desktop.
Save szilard/e5130287ea0ec9330457 to your computer and use it in GitHub Desktop.
Random Forest: all data vs. subsamples
library(randomForest)
library(parallel)
## Simulate a binary classification data set (cf. Hastie et al., 10.2).
##
## Draws an n x p matrix of independent standard normal features and labels a
## row "+" when the sum of its squared features exceeds the median of a
## chi-squared distribution with p degrees of freedom, and "-" otherwise.
##
## Args:
##   n: number of rows to generate.
##   p: number of feature columns.
## Returns:
##   A data.frame with the p feature columns followed by the factor column
##   `y`, whose levels are always c("-", "+").
genr_data <- function(n, p) {
  X <- matrix(rnorm(n * p), n, p)
  ## rowSums() does the per-row work in one vectorized call instead of an
  ## apply() loop with a scalar ifelse().
  above_median <- rowSums(X^2) > qchisq(0.5, p)
  ## Explicit levels keep factors from different calls comparable even when a
  ## small sample happens to contain a single class, and make the level order
  ## independent of the locale's sort order.
  y <- factor(c("-", "+")[above_median + 1L], levels = c("-", "+"))
  data.frame(X, y)
}
## Misclassification rate of a fitted model on a labelled data set.
##
## Args:
##   md:     fitted model object accepted by predict().
##   d_test: data frame with the predictors and the true labels in column `y`.
## Returns:
##   The fraction of rows in d_test whose predicted label differs from `y`.
err_rate <- function(md, d_test) {
  predicted <- predict(md, newdata = d_test)
  n_wrong <- sum(d_test$y != predicted)
  n_wrong / nrow(d_test)
}
## Experiment settings: forest size, number of features, training rows.
n_trees <- 500
p <- 300
n <- 3000
## Simulate the training set first and the 10000-row test set second, so the
## random number stream is consumed in the same order as before.
d_train <- genr_data(n = n, p = p)
d_test <- genr_data(n = 10000, p = p)
## Train a random forest in parallel and merge the pieces into one forest.
##
## The requested trees are spread as evenly as possible over the worker
## processes; each worker grows its own forest with randomForest() and the
## partial forests are merged with combine().
##
## Args:
##   frm:     model formula passed on to randomForest().
##   data:    training data frame.
##   n_trees: total number of trees to grow (at least 1).
##   nproc:   maximum number of worker processes (default 10).
## Returns:
##   The merged forest, or the single fitted forest when only one worker
##   was needed.
rf_par <- function(frm, data, n_trees, nproc = 10) {
  n_trees <- as.integer(n_trees)
  nproc <- as.integer(nproc)
  stopifnot(length(n_trees) == 1L, !is.na(n_trees), n_trees >= 1L,
            length(nproc) == 1L, !is.na(nproc), nproc >= 1L)
  ## Never start more workers than there are trees, so no worker is asked to
  ## grow an empty forest.
  n_workers <- min(nproc, n_trees)
  ## Hand the remainder out one tree at a time so that exactly n_trees trees
  ## are grown even when n_trees is not a multiple of the worker count.
  trees_per_worker <- rep(n_trees %/% n_workers, n_workers) +
    as.integer(seq_len(n_workers) <= n_trees %% n_workers)
  ## mclapply() relies on forking; on Windows it only accepts mc.cores = 1.
  n_cores <- if (.Platform$OS.type == "windows") 1L else n_workers
  mds <- mclapply(trees_per_worker,
                  function(nn) randomForest(frm, data = data, ntree = nn),
                  mc.cores = n_cores)
  ## A failed worker does not abort mclapply(); its slot holds something other
  ## than a forest. Stop here with a clear message instead of letting
  ## combine() fail later.
  ok <- vapply(mds, inherits, logical(1), what = "randomForest")
  if (!all(ok)) {
    stop(sum(!ok), " of ", length(mds),
         " parallel randomForest() fits failed", call. = FALSE)
  }
  if (length(mds) == 1L) {
    return(mds[[1L]])
  }
  do.call(combine, mds)
}
## Baseline: time the fit of one forest on the complete training set, then
## report its error rate on the test set.
system.time({
  ## Single-process alternative:
  ##md <- randomForest(y~., d_train, ntree=n_trees)
  md <- rf_par(frm = y ~ ., data = d_train, n_trees = n_trees)
})
err_rate(md = md, d_test = d_test)
## Subsample experiment: cut the training rows into m disjoint subsets, fit a
## forest with 1/m of the trees on each subset, then merge the sub-forests and
## report the error rate of the merged forest on the test set.
m <- 10
system.time({
  mds_split <- lapply(seq_len(m) - 1L, function(k) {
    ## Row i goes to subset i %% m. seq_len() stays empty for a 0-row data
    ## frame, whereas 1:nrow() would produce c(1, 0).
    idx <- seq_len(nrow(d_train)) %% m == k
    ## Single-process alternative:
    ##randomForest(y~., d_train[idx,], ntree=n_trees/m)
    ## %/% keeps the per-subset tree count a whole number.
    rf_par(y ~ ., d_train[idx, ], n_trees %/% m)
  })
})
md_split <- do.call(combine, mds_split)
err_rate(md_split, d_test)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment