Skip to content

Instantly share code, notes, and snippets.

@djhocking
Created May 21, 2014 14:12
Show Gist options
  • Save djhocking/fee60fabe16e86e46bd7 to your computer and use it in GitHub Desktop.
Save djhocking/fee60fabe16e86e46bd7 to your computer and use it in GitHub Desktop.
Bootstrap mixed effects logistic regression predictions
# Function for getting bootstrapped glmer predictions in parallel
glmmBoot <- function(dat, form, R, nc){
# dat = data for glmer (lme4) logistic regression
# form = formula of glmer equation for fitting
# R = total number of bootstrap draws - should be multiple of nc b/c divided among cores evenly
# nc = number of cores to use in parallel
library(parallel)
cl <- makeCluster(nc) # Request # cores
clusterExport(cl, c("dat", "form", "nc", "R"), envir = environment()) # Make these available to each core
clusterSetRNGStream(cl = cl, 1259)
out <- clusterEvalQ(cl, {
library(lme4)
library(boot) # used for the inv.logit function
b.star <- NULL
pres.star <- NULL
pred.star <- NULL
n <- length(dat$pres)
for(draw in 1:(R/nc)){ # within each core draw R/nc samples
df.star <- dat[sample(1:n, size=n, replace=T), ] # bootstrap data
mod <- glmer(form, family = binomial(link = "logit"), data = df.star, control = glmerControl(optimizer="bobyqa")) # regression on each bootstrapped data set
b.star <- rbind(b.star, coef(mod)) # combine coefficients from each draw (not currently used)
pres.star <- rbind(pres.star, df.star$pres) # combine "observed" presence-absences from each bootstrapped data iteration
pred.star <- rbind(pred.star, inv.logit(predict(mod, df.star, allow.new.levels = TRUE))) # # combine predicted presence probability from each draw
}
# make into lists formatted for the ROCR package
lab <- list(pres.star)
for(i in 1:dim(pres.star)[1]){
lab[[i]] <- as.integer(pres.star[i, ])
}
pred <- list(pred.star)
for(i in 1:dim(pred.star)[1]){
pred[[i]] <- as.numeric(pred.star[i,])
}
pred.all <- list(pred, lab)
names(pred.all) <- c("predictions", "observed")
return(pred.all)
}) # end cluster call
stopCluster(cl)
assign("out", out, envir = .GlobalEnv) # allow access outside function to help debugging
# combine lists from each core to format for ROCR
lab1 <- out[[1]]$observed
for(i in 2:length(out)){
foo <- out[[i]]$observed
lab1 <- c(lab1, foo)
}
pred1 <- out[[1]]$predictions
for(i in 2:length(out)){
foo <- out[[i]]$predictions
pred1 <- c(pred1, foo)
}
pred.mod <- list(pred1, lab1)
names(pred.mod) <- c("predictions", "observed")
return(pred.mod)
} # end function
# Run function for model glmm.M35
system.time(pred.M35 <- glmmBoot(dat=df.fit, form=formula(glmm.M35), R=9, nc=3)) # takes ~2 min with 9 draws on 3 cores
library(ROCR)
p35 <- prediction(pred.M35$predictions, pred.M35$observed)
perf35 <- performance(p35, "tpr", "fpr")
par(mfrow = c(1,1))
plot(perf35, lty=3, col="blue")
abline(0, 1)
plot(perf35, avg="vertical", lwd=2, col="black", spread.estimate="stderror", plotCI.lwd=2, add=TRUE)
# function to summarize AUC across all bootstrap draws
aucSummary <- function(pred.list, Mean=T, CI=c(0.025, 0.975), SE=F){
library(AUC)
auc.out <- NULL
for(i in 1:length(pred.list$predictions)) {
auc.out[i] <- auc(roc(as.numeric(pred.list$predictions[[i]]), as.factor(pred.list$observed[[i]])))
}
auc.CI <- quantile(auc.out, probs=CI)
auc.mean <- NA
auc.se <- NA
if(Mean == T){
auc.mean <- mean(auc.out)
}
if(SE == T) {
auc.se <- sd(auc.out)/sqrt(length(auc.out))
}
auc.summary <- data.frame(auc.mean, auc.CI[1], auc.CI[2], auc.se)
names(auc.summary) <- c('Mean', 'LCI', 'UCI', 'SE')
row.names(auc.summary) <- deparse(substitute(pred.list))
return(auc.summary)
} # end function
auc35 <- aucSummary(pred.M35, SE=T)
pred.fit <- inv.logit(predict(glmm.M35, df.fit, allow.new.levels = TRUE))
auc(roc(pred.fit, as.factor(df.fit$pres))) # 0.875 - why so diff than auc35??????????????????????????????
# alternative ways to get it - still different
auc(roc(as.numeric(pred.list$predictions[[1]]), as.factor(pred.list$observed[[1]])))
mean(unlist(performance(p35, "auc")@y.values))
@AFerrero82
Copy link

system.time(pred.M35 <- glmmBoot(dat=newBD, RIPCV ~ Edat65 + COL + TAB_R + (1|TEMP) +(1|medalla), R=100, nc=2)) # takes ~2 min with 9 draws on 3 cores
Error durante el wrapup: 2 nodes produced errors; first error: Invalid grouping factor specification, TEMP

I don't know why is not working the random effects...

@AFerrero82
Copy link

###########################
####bootMer-> boot AUC#####
###########################


library(lme4)
library(lattice)
data(cbpp)

#fit a model

cbpp$Y<-cbpp$incidence>=1
glmm<-glmer(Y~period + (1|herd), family=binomial, data=cbpp)
glmm

##### change "cbpp$Y" for the interested endpoint in the function

ROCFun <- function(fit) {
  library(pROC)
        pred<-predict(fit, type="response")
   AUC<-as.numeric(auc(cbpp$Y, pred))
   }

###run bootMer

system.time(AUC.boot <- bootMer(glmm,nsim=100,FUN=ROCFun,seed=101, use.u=TRUE,
                    type="parametric", parallel="multicore", ncpus=2))

AUC.boot


Call:
bootMer(x = glmm, FUN = ROCFun, nsim = 100, seed = 101, use.u = TRUE)


Bootstrap Statistics :
     original      bias    std. error
t1* 0.7479947 -0.02104278  0.03489653

#Now it seems more reasonable, bias as "optimism"... but still do not know #if I am just doing a AUC with bootstrap CI 

#test

(ROCFun(glmm))

#...

(boot.ci(AUC.boot, index =c(1,1), type="norm"))

roc(cbpp$Y, predict(glmm, type="response"))

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment