Skip to content

Instantly share code, notes, and snippets.

@johanez
Created May 22, 2015 08:25
Show Gist options
  • Save johanez/7b227aca3db10309ebad to your computer and use it in GitHub Desktop.
Save johanez/7b227aca3db10309ebad to your computer and use it in GitHub Desktop.
randomForest CV based variable selection (rfcv) wrapper, that returns the best rf_model and important variable names.
# paralell RF with foreach
# careful: no confusion, err.rate, mse and rsq components
# (as well as the corresponding components in the test compnent, if exist)
rf_parallel <- function(x, y, ntree=500, ncore=4, ...){
require(foreach)
require(randomForest)
# optional parallel backend
if (ncore > 1) {
require(doParallel)
registerDoParallel(ncore)
}
rf <- foreach(ntree_i=rep(floor(ntree/ncore), ncore),
.combine=randomForest::combine,
.packages='randomForest') %do% {
randomForest(x, y, ntree=ntree_i, ...)
}
return(rf)
}
# modified rfcv that returns the rf witht he optimal ncov
# todo:
# parallelize cv instead of rf? -> (no double calc of top rf))
source("rf_parallel.R")
rfcv_ncov <- function (trainx, trainy, cv.fold = 5, scale = "log", step = 0.5,
mtry = function(p) max(1, floor(sqrt(p))), recursive = FALSE, ntree=500, ncore=4,
...)
{
classRF <- is.factor(trainy)
n <- nrow(trainx)
p <- ncol(trainx)
if (scale == "log") {
k <- floor(log(p, base = 1/step))
n.var <- round(p * step^(0:(k - 1)))
same <- diff(n.var) == 0
if (any(same))
n.var <- n.var[-which(same)]
if (!1 %in% n.var)
n.var <- c(n.var, 1)
}
else {
n.var <- seq(from = p, to = 1, by = step)
}
k <- length(n.var)
cv.pred <- vector(k, mode = "list")
for (i in 1:k) cv.pred[[i]] <- trainy
if (classRF) {
f <- trainy
}
else {
f <- factor(rep(1:5, length = length(trainy))[order(order(trainy))])
}
nlvl <- table(f)
idx <- numeric(n)
for (i in 1:length(nlvl)) {
idx[which(f == levels(f)[i])] <- sample(rep(1:cv.fold,
length = nlvl[i]))
}
for (i in 1:cv.fold) {
# replaced rf by rf_parallel
all.rf <- rf_parallel(x=trainx[idx != i, , drop = FALSE],
y=trainy[idx != i],
ntree=ntree,
ncore=ncore,
xtest=trainx[idx == i, , drop = FALSE],
ytest=trainy[idx == i], mtry = mtry(p), importance = TRUE,
keep.forest=FALSE,
...)
cv.pred[[1]][idx == i] <- all.rf$test$predicted
impvar <- (1:p)[order(all.rf$importance[, 1], decreasing = TRUE)]
for (j in 2:k) {
imp.idx <- impvar[1:n.var[j]]
# replaced rf by rf_parallel
sub.rf <- rf_parallel(x= trainx[idx != i, imp.idx, drop=F],
y=trainy[idx != i],
ntree=ntree,
ncore=ncore,
xtest= trainx[idx == i, imp.idx, drop = FALSE],
ytest= trainy[idx == i],
mtry = mtry(n.var[j]), importance = recursive,
keep.forest=FALSE,
...)
cv.pred[[j]][idx == i] <- sub.rf$test$predicted
if (recursive) {
impvar <- (1:length(imp.idx))[order(sub.rf$importance[,
1], decreasing = TRUE)]
}
NULL
}
NULL
cat(i, ".")
}
if (classRF) {
error.cv <- sapply(cv.pred, function(x) mean(trainy != x))
}
else {
error.cv <- sapply(cv.pred, function(x) mean((trainy - x)^2))
}
names(error.cv) <- names(cv.pred) <- n.var
n_top <- n.var[which.min(error.cv)]
#use seq. random forest here, to keep OOB error est.
rf_top <- randomForest(trainx[, impvar[1:n_top], drop = FALSE], trainy,
importance = T, ntree=ntree,
...)
rf_all <- randomForest(trainx, trainy,
importance = T, ntree=ntree,
...)
res <- data.frame(row.names=c("All cov.", "Imp. Cov."),
ncov=c(ncol(trainx), n_top),
rsqOOB=c(rf_all$rsq[ntree], rf_top$rsq[ntree]),
mseOOB=c(rf_all$mse[ntree], rf_top$mse[ntree]),
rsqCV=c(1 - (error.cv[1]/var(trainy)), 1 - (error.cv[which.min(error.cv)]/var(trainy))),
mseCV=c(error.cv[1], error.cv[which.min(error.cv)])
)
cat("\n")
print(round(res, 5))
return(list(rf=rf_top, res=res, impvar=names(trainx)[impvar[1:n_top]]))
}
@johanez
Copy link
Author

johanez commented May 22, 2015

See:
Svetnik, V., Liaw, A., Tong, C. and Wang, T., “Application of Breiman's Random Forest to Modeling Structure-Activity Relationships of Pharmaceutical Molecules”, MCS 2004, Roli, F. and Windeatt, T. (Eds.) pp. 334-343.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment