library(data.table) | |
train <- fread('train.csv'); test <- fread('test.csv') | |
# consolidate the 2 data sets after creating a variable indicating train / test | |
train$flag <- 0; test$flag <- 1 | |
dat <- rbind(train,test) | |
# change outcome, var_b and var_e into factor var | |
dat$outcome <- factor(dat$outcome) | |
dat$var_b <- factor(dat$var_b) | |
dat$var_e <- factor(dat$var_e) | |
# check the levels of var_b and var_e in this consolidated, train and test data sets | |
length(levels(dat$var_b)); length(unique(train$var_b)); length(unique(test$var_b)) | |
# get back the train and test data | |
train <- subset(dat, flag == 0); test <- subset(dat, flag == 1) | |
train$flag <- NULL; test$flag <- NULL | |
# Build Logit Model using train data and make predictions | |
logitModel <- glm(outcome ~ ., data = train, family = 'binomial') | |
preds_train <- predict(logitModel, type = 'response') | |
# Model Predictions on test data | |
preds_test <- predict(logitModel, newdata = test, type = 'response') | |
# running the above code gives us the following error: | |
# factor var_b has new levels 16060, 17300, 17980, 19060, 21420, 21820, | |
# 25220, 29340, 30300, 33260, 34100, 38340, 39660, 44300, 45460 | |
# Workaround: | |
source('remove_missing_levels.R') | |
preds_test <- predict(logitModel, | |
newdata = remove_missing_levels(fit = logitModel, test_data = test), | |
type = 'response') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment