johanez/rf_parallel.R

## rf_parallel.R
# paralell RF with foreach
#       careful: no confusion, err.rate, mse and rsq components
#       (as well as the corresponding components in the test compnent, if exist)
rf_parallel <- function(x, y, ntree=500, ncore=4, ...){
  require(foreach)
  require(randomForest)
  # optional parallel backend
  if (ncore > 1) {
    require(doParallel)
    registerDoParallel(ncore)
  }
  rf <- foreach(ntree_i=rep(floor(ntree/ncore), ncore),
                .combine=randomForest::combine,
                .packages='randomForest') %do% {
                  randomForest(x, y, ntree=ntree_i, ...)
                }
  return(rf)
}

## rfcv_ncov.R
# modified rfcv that returns the rf witht he optimal ncov
# todo:
#   parallelize cv instead of rf? -> (no double calc of top rf))
source("rf_parallel.R")

rfcv_ncov <- function (trainx, trainy, cv.fold = 5, scale = "log", step = 0.5,
          mtry = function(p) max(1, floor(sqrt(p))), recursive = FALSE, ntree=500, ncore=4,
          ...)
{
  classRF <- is.factor(trainy)
  n <- nrow(trainx)
  p <- ncol(trainx)
  if (scale == "log") {
    k <- floor(log(p, base = 1/step))
    n.var <- round(p * step^(0:(k - 1)))
    same <- diff(n.var) == 0
    if (any(same))
      n.var <- n.var[-which(same)]
    if (!1 %in% n.var)
      n.var <- c(n.var, 1)
  }
  else {
    n.var <- seq(from = p, to = 1, by = step)
  }
  k <- length(n.var)
  cv.pred <- vector(k, mode = "list")
  for (i in 1:k) cv.pred[[i]] <- trainy
  if (classRF) {
    f <- trainy
  }
  else {
    f <- factor(rep(1:5, length = length(trainy))[order(order(trainy))])
  }
  nlvl <- table(f)
  idx <- numeric(n)
  for (i in 1:length(nlvl)) {
    idx[which(f == levels(f)[i])] <- sample(rep(1:cv.fold,
                                                length = nlvl[i]))
  }

  for (i in 1:cv.fold) {
    #  replaced rf by rf_parallel
    all.rf <- rf_parallel(x=trainx[idx != i, , drop = FALSE],
                          y=trainy[idx != i],
                          ntree=ntree,
                          ncore=ncore,
                          xtest=trainx[idx == i, , drop = FALSE],
                          ytest=trainy[idx == i], mtry = mtry(p), importance = TRUE,
                          keep.forest=FALSE,
                           ...)
    cv.pred[[1]][idx == i] <- all.rf$test$predicted
    impvar <- (1:p)[order(all.rf$importance[, 1], decreasing = TRUE)]

    for (j in 2:k) {
      imp.idx <- impvar[1:n.var[j]]
      #  replaced rf by rf_parallel
      sub.rf <- rf_parallel(x= trainx[idx != i, imp.idx, drop=F],
                            y=trainy[idx != i],
                            ntree=ntree,
                            ncore=ncore,
                            xtest= trainx[idx == i, imp.idx, drop = FALSE],
                            ytest= trainy[idx == i],
                            mtry = mtry(n.var[j]), importance = recursive,
                            keep.forest=FALSE,
                            ...)

      cv.pred[[j]][idx == i] <- sub.rf$test$predicted
      if (recursive) {
        impvar <- (1:length(imp.idx))[order(sub.rf$importance[,
                                                              1], decreasing = TRUE)]
      }
      NULL
    }
    NULL
    cat(i, ".")
  }
  if (classRF) {
    error.cv <- sapply(cv.pred, function(x) mean(trainy != x))
  }
  else {
    error.cv <- sapply(cv.pred, function(x) mean((trainy - x)^2))
  }
  names(error.cv) <- names(cv.pred) <- n.var


  n_top <- n.var[which.min(error.cv)]

  #use seq. random forest here, to keep OOB error est.
  rf_top <- randomForest(trainx[, impvar[1:n_top], drop = FALSE], trainy,
                       importance = T, ntree=ntree,
                       ...)
  rf_all <- randomForest(trainx, trainy,
                       importance = T, ntree=ntree,
                       ...)
  res <- data.frame(row.names=c("All cov.", "Imp. Cov."),
                    ncov=c(ncol(trainx), n_top),
                    rsqOOB=c(rf_all$rsq[ntree],  rf_top$rsq[ntree]),
                    mseOOB=c(rf_all$mse[ntree],  rf_top$mse[ntree]),
                    rsqCV=c(1 - (error.cv[1]/var(trainy)), 1 - (error.cv[which.min(error.cv)]/var(trainy))),
                    mseCV=c(error.cv[1], error.cv[which.min(error.cv)])
                    )
  cat("\n")
  print(round(res, 5))
  return(list(rf=rf_top, res=res, impvar=names(trainx)[impvar[1:n_top]]))
}
	# paralell RF with foreach
	# careful: no confusion, err.rate, mse and rsq components
	# (as well as the corresponding components in the test compnent, if exist)
	rf_parallel <- function(x, y, ntree=500, ncore=4, ...){
	require(foreach)
	require(randomForest)
	# optional parallel backend
	if (ncore > 1) {
	require(doParallel)
	registerDoParallel(ncore)
	}
	rf <- foreach(ntree_i=rep(floor(ntree/ncore), ncore),
	.combine=randomForest::combine,
	.packages='randomForest') %do% {
	randomForest(x, y, ntree=ntree_i, ...)
	}
	return(rf)
	}
	# modified rfcv that returns the rf witht he optimal ncov
	# todo:
	# parallelize cv instead of rf? -> (no double calc of top rf))
	source("rf_parallel.R")

	rfcv_ncov <- function (trainx, trainy, cv.fold = 5, scale = "log", step = 0.5,
	mtry = function(p) max(1, floor(sqrt(p))), recursive = FALSE, ntree=500, ncore=4,
	...)
	{
	classRF <- is.factor(trainy)
	n <- nrow(trainx)
	p <- ncol(trainx)
	if (scale == "log") {
	k <- floor(log(p, base = 1/step))
	n.var <- round(p * step^(0:(k - 1)))
	same <- diff(n.var) == 0
	if (any(same))
	n.var <- n.var[-which(same)]
	if (!1 %in% n.var)
	n.var <- c(n.var, 1)
	}
	else {
	n.var <- seq(from = p, to = 1, by = step)
	}
	k <- length(n.var)
	cv.pred <- vector(k, mode = "list")
	for (i in 1:k) cv.pred[[i]] <- trainy
	if (classRF) {
	f <- trainy
	}
	else {
	f <- factor(rep(1:5, length = length(trainy))[order(order(trainy))])
	}
	nlvl <- table(f)
	idx <- numeric(n)
	for (i in 1:length(nlvl)) {
	idx[which(f == levels(f)[i])] <- sample(rep(1:cv.fold,
	length = nlvl[i]))
	}

	for (i in 1:cv.fold) {
	# replaced rf by rf_parallel
	all.rf <- rf_parallel(x=trainx[idx != i, , drop = FALSE],
	y=trainy[idx != i],
	ntree=ntree,
	ncore=ncore,
	xtest=trainx[idx == i, , drop = FALSE],
	ytest=trainy[idx == i], mtry = mtry(p), importance = TRUE,
	keep.forest=FALSE,
	...)
	cv.pred[[1]][idx == i] <- all.rf$test$predicted
	impvar <- (1:p)[order(all.rf$importance[, 1], decreasing = TRUE)]

	for (j in 2:k) {
	imp.idx <- impvar[1:n.var[j]]
	# replaced rf by rf_parallel
	sub.rf <- rf_parallel(x= trainx[idx != i, imp.idx, drop=F],
	y=trainy[idx != i],
	ntree=ntree,
	ncore=ncore,
	xtest= trainx[idx == i, imp.idx, drop = FALSE],
	ytest= trainy[idx == i],
	mtry = mtry(n.var[j]), importance = recursive,
	keep.forest=FALSE,
	...)

	cv.pred[[j]][idx == i] <- sub.rf$test$predicted
	if (recursive) {
	impvar <- (1:length(imp.idx))[order(sub.rf$importance[,
	1], decreasing = TRUE)]
	}
	NULL
	}
	NULL
	cat(i, ".")
	}
	if (classRF) {
	error.cv <- sapply(cv.pred, function(x) mean(trainy != x))
	}
	else {
	error.cv <- sapply(cv.pred, function(x) mean((trainy - x)^2))
	}
	names(error.cv) <- names(cv.pred) <- n.var


	n_top <- n.var[which.min(error.cv)]

	#use seq. random forest here, to keep OOB error est.
	rf_top <- randomForest(trainx[, impvar[1:n_top], drop = FALSE], trainy,
	importance = T, ntree=ntree,
	...)
	rf_all <- randomForest(trainx, trainy,
	importance = T, ntree=ntree,
	...)
	res <- data.frame(row.names=c("All cov.", "Imp. Cov."),
	ncov=c(ncol(trainx), n_top),
	rsqOOB=c(rf_all$rsq[ntree], rf_top$rsq[ntree]),
	mseOOB=c(rf_all$mse[ntree], rf_top$mse[ntree]),
	rsqCV=c(1 - (error.cv[1]/var(trainy)), 1 - (error.cv[which.min(error.cv)]/var(trainy))),
	mseCV=c(error.cv[1], error.cv[which.min(error.cv)])
	)
	cat("\n")
	print(round(res, 5))
	return(list(rf=rf_top, res=res, impvar=names(trainx)[impvar[1:n_top]]))
	}