Skip to content

Instantly share code, notes, and snippets.

@jeffwong
Last active December 17, 2015 10:19
Show Gist options
  • Save jeffwong/5593580 to your computer and use it in GitHub Desktop.
Save jeffwong/5593580 to your computer and use it in GitHub Desktop.
Hide new factor levels in a test set
require(data.table)
#' Hide factor levels in a test set that were not seen in the training set
#'
#' For every factor column under consideration, values of `test.data` whose
#' level does not occur among the levels of the same column in `train.data`
#' are overwritten with `replacement`. The affected column is then re-encoded
#' as a factor using the training levels, plus `replacement` when it is not
#' `NA`. Columns without unseen levels are left untouched.
#'
#' Note that this permanently modifies `test.data`: affected columns are
#' replaced by reference through `data.table::set()`.
#'
#' @param train.data Table whose factor levels define the known levels.
#' @param test.data Table to clean up; modified in place.
#' @param replacement Value substituted for unseen levels (default `NA`).
#' @param colnames.factor Character vector of column names to process. When
#'   `NULL`, the factor columns of `test.data` are found with
#'   `extractFactors()`.
#' @return `test.data`, after the in-place modification.
hideNewLevels <- function(train.data, test.data, replacement = NA,
                          colnames.factor = NULL) {
  if (is.null(colnames.factor)) {
    colnames.factor <- extractFactors(test.data)
  }

  # Fail before touching anything, so a bad column name cannot leave
  # test.data half-modified.
  missing.cols <- setdiff(
    colnames.factor,
    intersect(names(train.data), names(test.data))
  )
  if (length(missing.cols) > 0) {
    stop(
      "columns not present in both train.data and test.data: ",
      paste(missing.cols, collapse = ", "),
      call. = FALSE
    )
  }

  for (col in colnames.factor) {
    # `[[` looks the column up by name; `test.data[, get(col)]` would resolve
    # `col` inside the table first and break on a column literally named "col".
    test.col <- test.data[[col]]
    levels.train <- levels(train.data[[col]])
    badlevels <- setdiff(levels(test.col), levels.train)

    if (length(badlevels) > 0) {
      message(
        "hideNewLevels: column '", col, "' has unseen levels: ",
        paste(badlevels, collapse = ", ")
      )
      x <- as.character(test.col)
      x[x %in% badlevels] <- replacement
      # unique() keeps factor() from failing with a duplicated level when
      # `replacement` is already one of the training levels. An NA replacement
      # is dropped from the level set by factor() itself.
      x <- factor(x, levels = unique(c(levels.train, replacement)))
      data.table::set(test.data, j = col, value = x)
    }
  }

  test.data
}
#' Names of the factor columns of a table
#'
#' @param x A data.table or data.frame.
#' @param pattern Regular expression passed to `grep()`; only factor column
#'   names matching it are returned. The default matches every name.
#' @return Character vector of matching factor column names, in column order;
#'   `character(0)` when there are none (including for a zero-column table).
extractFactors <- function(x, pattern = "*") {
  # Iterating over the columns directly avoids `1:ncol(x)` (which misbehaves
  # on zero columns) and `x[, get(col)]` (which breaks on a column literally
  # named "col").
  is.factor.col <- vapply(x, is.factor, logical(1))
  grep(pattern, names(x)[is.factor.col], value = TRUE)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment