# tobigithub/Random-Forests-are-Random.R

## Random-Forests-are-Random.R
# Random forests are random... indeed
# http://stats.stackexchange.com/questions/35609/why-do-i-need-bag-composition-to-calculate-oob-error-of-combined-random-forest-m
# https://github.com/mlist/IB2014/blob/master/helper_methods.R
# Random Forest combine: http://www.inside-r.org/packages/cran/randomForest/docs/combine
#
# This has implications on parallel random forests using snow, doSNOW, doParallel etc.
#
# Components that are missing from the combined forest (rf.all):
# confusion : NULL
# err.rate : NULL
# OOB : NULL
# Tobias Kind (2015)

# Build a confusion matrix with a per-class error column for a forest.
#
# Cross-tabulates the observed classes stored in `rf$y` against the classes
# returned by `predict(rf)`. Intended for objects whose `confusion` component
# is missing, such as the result of combine().
#
# Rows are observed classes and columns are predicted classes. This is the
# layout the randomForest package documents for its own `confusion`
# component, so the result can be assigned to `rf$confusion` and read the
# same way as a natively computed matrix.
#
# Args:
#   rf: a fitted classification forest; `predict(rf)` must work and `rf$y`
#       must hold the observed class of every training case.
#
# Returns:
#   A numeric matrix with one row per observed class, one column per
#   predicted class, and a final `class.error` column holding the fraction
#   of each row that lies off the diagonal (NaN for a class with no cases).
getConfusionMatrix <- function(rf) {
    # Observed classes go in rows, predictions in columns.
    tbl <- table(rf$y, predict(rf))

    # Vectorised over all classes at once; unlike a `1:nrow(tbl)` loop this
    # does not index out of bounds when the table has no rows.
    row.totals <- rowSums(tbl)
    class.error <- (row.totals - diag(tbl)) / row.totals

    # The argument name supplies the "class.error" column header.
    cbind(tbl, class.error = class.error)
}

# Seed the RNG once. The four forests below are fitted with identical
# arguments but still differ, because each call continues drawing from the
# same random stream.
set.seed(123)
library(randomForest)

# Four forests of 50 trees each on the iris data.
# NOTE(review): norm.votes = FALSE is presumably set so that raw vote counts
# can be added up by combine() - confirm against the randomForest docs.
rf1 <- randomForest(Species ~ ., data = iris, ntree = 50, norm.votes = FALSE)
rf2 <- randomForest(Species ~ ., data = iris, ntree = 50, norm.votes = FALSE)
rf3 <- randomForest(Species ~ ., data = iris, ntree = 50, norm.votes = FALSE)
rf4 <- randomForest(Species ~ ., data = iris, ntree = 50, norm.votes = FALSE)

# Show each individual forest.
rf1
rf2
rf3
rf4

# Merge the four forests into a single one and show it.
rf.all <- combine(rf1, rf2, rf3, rf4)
rf.all

#---

# Recompute the confusion matrix of every individual forest with the helper
# and store it on the object, printing each one.
rf1$confusion <- getConfusionMatrix(rf1)
rf1$confusion

rf2$confusion <- getConfusionMatrix(rf2)
rf2$confusion

rf3$confusion <- getConfusionMatrix(rf3)
rf3$confusion

rf4$confusion <- getConfusionMatrix(rf4)
rf4$confusion

# Reading the component back gives the matrix that was just stored on rf4.
conf <- rf4$confusion
conf

# The combined forest has no confusion matrix yet, so this yields NULL.
conf <- rf.all$confusion
conf

# Computing it with the helper fills the gap.
rf.all$confusion <- getConfusionMatrix(rf.all)
rf.all$confusion

# Per-class probabilities from the combined forest.
predict(rf.all, type = "prob")

# Draw all eight figures on one page: a 4 x 2 grid filled row by row, with
# the plot() output of the four forests first and their variable-importance
# plots after. par() returns the previous settings, which are kept so the
# layout can be put back afterwards instead of leaking into later plots.
old.par <- par(mfrow = c(4, 2))

plot(rf1)
plot(rf2)
plot(rf3)
plot(rf4)

varImpPlot(rf1)
varImpPlot(rf2)
varImpPlot(rf3)
varImpPlot(rf4)

# Restore the layout that was active before this section.
par(old.par)

# str(rf1)    # OOB and err.rate exist // List of 19
# str(rf.all) # OOB and err.rate do not exist // List of 18
# The rf.all error rate cannot be plotted unless the error rates are combined as well
# plot(rf.all)

# END
	# Random forests are random... indeed
	# http://stats.stackexchange.com/questions/35609/why-do-i-need-bag-composition-to-calculate-oob-error-of-combined-random-forest-m
	# https://github.com/mlist/IB2014/blob/master/helper_methods.R
	# Random Forest combine: http://www.inside-r.org/packages/cran/randomForest/docs/combine
	#
	# This has implications on parallel random forests using snow, doSNOW, doParallel etc.
	#
	# Components that are missing from the combined forest (rf.all):
	# confusion : NULL
	# err.rate : NULL
	# OOB : NULL
	# Tobias Kind (2015)

	# Build a confusion matrix with a per-class error column for a forest.
	#
	# Cross-tabulates the observed classes stored in `rf$y` against the classes
	# returned by `predict(rf)`. Intended for objects whose `confusion`
	# component is missing, such as the result of combine().
	#
	# Rows are observed classes and columns are predicted classes. This is the
	# layout the randomForest package documents for its own `confusion`
	# component, so the result can be assigned to `rf$confusion` and read the
	# same way as a natively computed matrix.
	#
	# Args:
	#   rf: a fitted classification forest; `predict(rf)` must work and `rf$y`
	#       must hold the observed class of every training case.
	#
	# Returns:
	#   A numeric matrix with one row per observed class, one column per
	#   predicted class, and a final `class.error` column holding the fraction
	#   of each row that lies off the diagonal (NaN for a class with no cases).
	getConfusionMatrix <- function(rf) {
		# Observed classes go in rows, predictions in columns.
		tbl <- table(rf$y, predict(rf))

		# Vectorised over all classes at once; unlike a `1:nrow(tbl)` loop this
		# does not index out of bounds when the table has no rows.
		row.totals <- rowSums(tbl)
		class.error <- (row.totals - diag(tbl)) / row.totals

		# The argument name supplies the "class.error" column header.
		cbind(tbl, class.error = class.error)
	}

	# Seed the RNG once. The four forests below are fitted with identical
	# arguments but still differ, because each call continues drawing from the
	# same random stream.
	set.seed(123)
	library(randomForest)

	# Four forests of 50 trees each on the iris data.
	# NOTE(review): norm.votes = FALSE is presumably set so that raw vote counts
	# can be added up by combine() - confirm against the randomForest docs.
	rf1 <- randomForest(Species ~ ., data = iris, ntree = 50, norm.votes = FALSE)
	rf2 <- randomForest(Species ~ ., data = iris, ntree = 50, norm.votes = FALSE)
	rf3 <- randomForest(Species ~ ., data = iris, ntree = 50, norm.votes = FALSE)
	rf4 <- randomForest(Species ~ ., data = iris, ntree = 50, norm.votes = FALSE)

	# Show each individual forest.
	rf1
	rf2
	rf3
	rf4

	# Merge the four forests into a single one and show it.
	rf.all <- combine(rf1, rf2, rf3, rf4)
	rf.all

	#---

	# Recompute the confusion matrix of every individual forest with the helper
	# and store it on the object, printing each one.
	rf1$confusion <- getConfusionMatrix(rf1)
	rf1$confusion

	rf2$confusion <- getConfusionMatrix(rf2)
	rf2$confusion

	rf3$confusion <- getConfusionMatrix(rf3)
	rf3$confusion

	rf4$confusion <- getConfusionMatrix(rf4)
	rf4$confusion

	# Reading the component back gives the matrix that was just stored on rf4.
	conf <- rf4$confusion
	conf

	# The combined forest has no confusion matrix yet, so this yields NULL.
	conf <- rf.all$confusion
	conf

	# Computing it with the helper fills the gap.
	rf.all$confusion <- getConfusionMatrix(rf.all)
	rf.all$confusion

	# Per-class probabilities from the combined forest.
	predict(rf.all, type = "prob")

	# Draw all eight figures on one page: a 4 x 2 grid filled row by row, with
	# the plot() output of the four forests first and their variable-importance
	# plots after. par() returns the previous settings, which are kept so the
	# layout can be put back afterwards instead of leaking into later plots.
	old.par <- par(mfrow = c(4, 2))

	plot(rf1)
	plot(rf2)
	plot(rf3)
	plot(rf4)

	varImpPlot(rf1)
	varImpPlot(rf2)
	varImpPlot(rf3)
	varImpPlot(rf4)

	# Restore the layout that was active before this section.
	par(old.par)

	# str(rf1) # OOB and err.rate exist // List of 19
	# str(rf.all) # OOB and err.rate do not exist // List of 18
	# The rf.all error rate cannot be plotted unless the error rates are combined as well
	# plot(rf.all)

	# END