Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save diamonaj/a7672279dccc11d9bf29e9c1bf635cfe to your computer and use it in GitHub Desktop.
Save diamonaj/a7672279dccc11d9bf29e9c1bf635cfe to your computer and use it in GitHub Desktop.
# MASS provides the Pima.tr / Pima.te data sets. Install it only when it is
# missing, so re-running the script does not hit the network every time.
if (!requireNamespace("MASS", quietly = TRUE)) {
  install.packages("MASS")
}
library(MASS)
data(Pima.tr) # used below as the TRAINING SET
data(Pima.te) # loaded alongside it; not referenced in the steps shown here
#############
## STEP 1: Logistic regression, predict diabetes yes or no ##
# Basic model: regress `type` on every other column of the training data.
logistic_reg <- glm(type ~ ., data = Pima.tr, family = binomial)
# No `newdata` is supplied, so predict() returns the fitted probabilities for
# the rows the model was trained on (TRAINING SET).
predict_logistic.tr <- predict(logistic_reg, type = "response")
## STEP 2: Create a function that evaluates the misclassification rate for the TRAINING SET, for any threshold
# Arguments:
#   threshold  Probability cut-off. A fitted probability strictly greater than
#              the cut-off is classified as 1, anything else as 0.
# Returns:
#   The share of training rows whose predicted class disagrees with
#   `Pima.tr$type`.
# Reads `predict_logistic.tr` and `Pima.tr` from the enclosing environment.
evaluate_fn <- function(threshold = NA) {
  predicted_outcomes <- as.numeric(predict_logistic.tr > threshold)
  # Fix the predicted classes to both levels. Otherwise, when every row is
  # predicted the same class, table() collapses to a single column and
  # positional indexing no longer picks out the misclassified cells.
  table_logistic <- table(
    Pima.tr$type,
    factor(predicted_outcomes, levels = c(0, 1))
  )
  # Rows follow the order of the levels of `type`, columns are 0 then 1, so the
  # diagonal holds the correct predictions and everything else is an error.
  misclassified <- sum(table_logistic) - sum(diag(table_logistic))
  misclassified / sum(table_logistic)
}
## STEP 3: Optimize for threshold within TRAINING SET using a hill-climbing algorithm
# optim()'s "Brent" method is its one-dimensional optimiser; it minimises
# evaluate_fn over the interval [lower, upper] = [0, 1]. Only the `par`
# component (the threshold found) is kept from the result.
best_threshold <- optim(
  par = 0.5,
  fn = evaluate_fn,
  method = "Brent",
  lower = 0,
  upper = 1
)[["par"]]
# Report the threshold found and the training error rate it achieves.
cat("\nThe best threshold is:", best_threshold, "\n")
cat(
  "\nThe error rate at this threshold is:",
  evaluate_fn(best_threshold),
  "\n\n"
)
## OPTIONAL STEP 4: Optimize for threshold within TRAINING SET using a genetic algorithm
# Install rgenoud only when it is missing, so re-running the script does not
# trigger a fresh download each time.
if (!requireNamespace("rgenoud", quietly = TRUE)) {
  install.packages("rgenoud")
}
library(rgenoud)
# A threshold is a probability, so the search is confined to [0, 1]. Without
# `Domains`, genoud() falls back to its default box of [-10, 10], where almost
# every candidate classifies all rows the same way. boundary.enforcement = 2
# forbids candidates outside the box.
best_threshold_genetic <- genoud(
  fn = evaluate_fn,
  nvars = 1,
  max = FALSE, # minimise the error rate
  Domains = matrix(c(0, 1), nrow = 1), # one row per parameter: lower, upper
  boundary.enforcement = 2,
  print.level = 1,
  starting.values = best_threshold # seed the population with the STEP 3 result
)$par
cat("\nThe best (genetic algorithm-derived) threshold is:", best_threshold_genetic, "\n\n")
cat("\nThe error rate at this (genetic-derived) threshold is:", evaluate_fn(best_threshold_genetic), "\n\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment