Skip to content

Instantly share code, notes, and snippets.

@sebp
Created August 17, 2018 18:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sebp/2b5bf8e9f323cce8454f72d5365abf9f to your computer and use it in GitHub Desktop.
Save sebp/2b5bf8e9f323cce8454f72d5365abf9f to your computer and use it in GitHub Desktop.
Simulation study demonstrating performance of logistic regression in the presence of non-linear effects
# simulation study related to https://stats.stackexchange.com/questions/362194/data-how-smart-are-models-is-my-dummy-redundant
library(ROCR)
gen.features <- function(n.samples) {
age <- runif(n.samples, min=6, max=89)
sex <- factor(rbinom(n.samples, 1, 0.5), c(0, 1), c("male", "female"))
data <- data.frame(age, sex)
return(data)
}
gen.outcome <- function(data) {
# create model
mat <- model.matrix(~ sex + I(age < 18), data=data)
col.mean <- apply(mat, 2, mean)
# subtract the column means
mm <- sweep(mat, 2, col.mean)
# coefficients in terms of odds-ratio of survival
beta <- log(c(1, 1.5, 4))
names(beta) <- colnames(mm)
# add independent noise
noise <- rnorm(nrow(data), sd=0.2)
# linear model
nu <- mm %*% beta + noise
# probabilities
prob <- 1 / (1. + exp(-nu))
y <- prob > 0.5
# flip labels for about 5%
flip <- which(rbinom(nrow(data), 1, 0.05) == 1)
y[flip] <- !y[flip]
return(factor(y, c(FALSE, TRUE), c("dead", "alive")))
}
gen.data <- function(n.samples) {
data <- gen.features(n.samples)
data$outcome <- gen.outcome(data)
return(data)
}
predict.and.evaluate <- function(fit, newdata) {
print(summary(fit))
predictions <- predict(fit, type="response", newdata=newdata)
pred <- prediction(predictions, newdata$outcome, label.ordering=c("dead", "alive"))
perf <- performance(pred, measure = "auc")
auc <- unlist(perf@y.values)
cat(paste("ROC AUC", auc, "\n"))
}
set.seed(62354)
# generate training data
data <- gen.data(200)
cat("\n# Overall distribution of outcome:\n")
print(table(data$outcome))
cat("\n# Distribution of outcome among children:\n")
print(table(data$outcome[data$age < 18]))
cat("\n# Distribution of outcome among adults:\n")
print(table(data$outcome[data$age >= 18]))
# generate testing data
test.data <- gen.data(200)
cat("\n\n##################\n")
cat("## LINEAR MODEL ##\n")
cat("##################\n")
fit <- glm(I(outcome == "alive") ~ ., data=data, family="binomial")
predict.and.evaluate(fit, test.data)
cat("\n\n########################\n")
cat("## DICHOTOMISED MODEL ##\n")
cat("########################\n")
fit.actual <- glm(I(outcome == "alive") ~ I(age < 18) + ., data=data, family="binomial")
predict.and.evaluate(fit.actual, test.data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment