## Gist by @geneorama (last active January 25, 2016)
rm(list=ls())
library(randomForest)
################################################################################
## EXAMPLE 1 (ver_1)
## Two data frames are created (test and train)
## They have their own independent factor levels, and some of the levels in
## `test_ver1` do not appear in `train_ver1`
##
## A model created on `train_ver1` will fail to predict on the previously
## unseen factor levels.
##
################################################################################
train_ver1 <- data.frame(x1 = c("f", "d", "b", "d"),
                         x2 = c("e", "c", "c", "d"),
                         y  = c("yes", "yes", "no", "yes"),
                         stringsAsFactors = TRUE)
test_ver1 <- data.frame(x1 = c("b", "unwanted_char"), ## "unwanted_char" is not in train
                        x2 = c("a", "e"),             ## "a" is not in train
                        y  = c("no", "yes"),
                        stringsAsFactors = TRUE)
forest_ver1 <- randomForest(y ~ x1 + x2,
                            data = train_ver1,
                            ntree = 500)
predict(object = forest_ver1, newdata = test_ver1, type = 'response')
# Error in predict.randomForest(object = forest_ver1, newdata = test_ver1, :
# New factor levels not present in the training data
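## Before predicting, the mismatch can be diagnosed directly with setdiff()
## (a small sketch, not part of the original gist):
setdiff(levels(test_ver1$x1), levels(train_ver1$x1))
# [1] "unwanted_char"
setdiff(levels(test_ver1$x2), levels(train_ver1$x2))
# [1] "a"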
################################################################################
## EXAMPLE 2 (ver_2)
## In most textbooks, papers, and examples there is a single data.frame with
## known factor levels. This single data.frame is split into test / train,
## so the factor levels match in both subsets.
##
## This is equivalent to taking the union of the levels seen in train and
## test and applying it to the corresponding factors in both data frames.
##
################################################################################
(all_levels_x1 <- sort(unique(c(levels(train_ver1$x1), levels(test_ver1$x1)))))
# [1] "b" "d" "f" "unwanted_char"
(all_levels_x2 <- sort(unique(c(levels(train_ver1$x2), levels(test_ver1$x2)))))
# [1] "a" "c" "d" "e"
train_ver2 <- train_ver1
test_ver2 <- test_ver1
## Note: `levels(x) <- value` renames existing levels *positionally*, which
## can silently scramble the data (e.g. it would relabel "unwanted_char" as
## "d" in test_ver2$x1). Re-factoring with `factor(x, levels = ...)` expands
## the level set while preserving the underlying values.
train_ver2$x1 <- factor(train_ver2$x1, levels = all_levels_x1)
train_ver2$x2 <- factor(train_ver2$x2, levels = all_levels_x2)
test_ver2$x1  <- factor(test_ver2$x1,  levels = all_levels_x1)
test_ver2$x2  <- factor(test_ver2$x2,  levels = all_levels_x2)
forest_ver2 <- randomForest(y ~ x1 + x2,
                            data = train_ver2,
                            ntree = 500)
predict(object = forest_ver2, newdata = test_ver2, type = 'response')
#   1   2
#  no yes
# Levels: no yes
## (example output; with so little training data the exact predictions can
## vary from run to run)
## The question is:
## How is the random forest model able to predict on factor levels that are
## not seen in the training data?
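## A hedged observation on the mechanics: what predict() appears to compare
## is the *levels attribute* of each factor, not the values actually
## observed in training. Once the attributes agree, a previously unseen
## value is simply routed down the trees like any other level:
identical(levels(train_ver2$x1), levels(test_ver2$x1))
# [1] TRUE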
################################################################################
## Other related questions
## (This is what I thought you were asking until I re-read the question a few
## times)
################################################################################
## How do you make sure that your factor levels in test and train match?
## e.g.
as.numeric(factor(c("a", "b", "c"))) ## You build a model on "a", "b", and "c"
# [1] 1 2 3
as.numeric(factor(c("b", "d"))) ## But you see "b" and "d" in production
# [1] 1 2
# How do you make sure that "b" is a "2"?
# What should "d" be, "NA" or "4"?
## No matter how you handle "d", what is the implication?
## Suppose that "a", "b", and "c" are highly correlated with a positive
## outcome, and NA is highly correlated with a negative outcome. What if you
## want to treat newly observed levels as NA?
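## One way to pin the encoding and treat newly observed levels as NA
## (a hypothetical snippet, not from the original gist): re-factor the
## production data with the training levels, so "b" keeps its code and
## "d" drops to NA.
x_prod <- factor(c("b", "d"), levels = c("a", "b", "c"))
as.numeric(x_prod) ## "b" is still 2; "d" becomes NA
# [1]  2 NA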
##
## Factors are the best and worst thing about R
##