djhocking/glmmBoot

## glmmBoot
# Function for getting bootstrapped glmer predictions in parallel
glmmBoot <- function(dat, form, R, nc){
  # dat = data for glmer (lme4) logistic regression
  # form = formula of glmer equation for fitting
  # R = total number of bootstrap draws - should be multiple of nc b/c divided among cores evenly
  # nc = number of cores to use in parallel

  library(parallel)
  cl <- makeCluster(nc) # Request # cores
  clusterExport(cl, c("dat", "form", "nc", "R"), envir = environment()) # Make these available to each core
  clusterSetRNGStream(cl = cl, 1259)

  out <- clusterEvalQ(cl, {
    library(lme4)
    library(boot) # used for the inv.logit function
    b.star <- NULL
    pres.star <- NULL
    pred.star <- NULL
    n <- length(dat$pres)
    for(draw in 1:(R/nc)){ # within each core draw R/nc samples
      df.star <- dat[sample(1:n, size=n, replace=T), ] # bootstrap data
      mod <- glmer(form, family = binomial(link = "logit"), data = df.star, control = glmerControl(optimizer="bobyqa")) # regression on each bootstrapped data set
      b.star <- rbind(b.star, coef(mod)) # combine coefficients from each draw (not currently used)
      pres.star <- rbind(pres.star, df.star$pres) # combine "observed" presence-absences from each bootstrapped data iteration
      pred.star <- rbind(pred.star, inv.logit(predict(mod, df.star, allow.new.levels = TRUE))) # # combine predicted presence probability from each draw
      }

    # make into lists formatted for the ROCR package
    lab <- list(pres.star)
    for(i in 1:dim(pres.star)[1]){
      lab[[i]] <- as.integer(pres.star[i, ])
      }

    pred <- list(pred.star)
    for(i in 1:dim(pred.star)[1]){
      pred[[i]] <- as.numeric(pred.star[i,])
      }

    pred.all <- list(pred, lab)
    names(pred.all) <- c("predictions", "observed")

    return(pred.all)
    }) # end cluster call
  stopCluster(cl)

  assign("out", out, envir = .GlobalEnv) # allow access outside function to help debugging

  # combine lists from each core to format for ROCR
  lab1 <- out[[1]]$observed
  for(i in 2:length(out)){
    foo <- out[[i]]$observed
    lab1 <- c(lab1, foo)
    }

  pred1 <- out[[1]]$predictions
  for(i in 2:length(out)){
      foo <- out[[i]]$predictions
      pred1 <- c(pred1, foo)
    }

  pred.mod <- list(pred1, lab1)
  names(pred.mod) <- c("predictions", "observed")

  return(pred.mod)
  } # end function

# Run function for model glmm.M35
system.time(pred.M35 <- glmmBoot(dat=df.fit, form=formula(glmm.M35), R=9, nc=3)) # takes ~2 min with 9 draws on 3 cores

library(ROCR)
p35 <- prediction(pred.M35$predictions, pred.M35$observed)
perf35 <- performance(p35, "tpr", "fpr")

par(mfrow = c(1,1))
plot(perf35, lty=3, col="blue")
abline(0, 1)
plot(perf35, avg="vertical", lwd=2, col="black", spread.estimate="stderror", plotCI.lwd=2, add=TRUE)

# function to summarize AUC across all bootstrap draws
aucSummary <- function(pred.list, Mean=T, CI=c(0.025, 0.975), SE=F){
  library(AUC)
  auc.out <- NULL
  for(i in 1:length(pred.list$predictions)) {
    auc.out[i] <- auc(roc(as.numeric(pred.list$predictions[[i]]), as.factor(pred.list$observed[[i]])))
    }
    auc.CI <- quantile(auc.out, probs=CI)
    auc.mean <- NA
    auc.se <- NA
    if(Mean == T){
      auc.mean <- mean(auc.out)
    }
    if(SE == T) {
      auc.se <- sd(auc.out)/sqrt(length(auc.out))
    }
    auc.summary <- data.frame(auc.mean, auc.CI[1], auc.CI[2], auc.se)
    names(auc.summary) <- c('Mean', 'LCI', 'UCI', 'SE')
    row.names(auc.summary) <- deparse(substitute(pred.list))
    return(auc.summary)
  } # end function

auc35 <- aucSummary(pred.M35, SE=T)

pred.fit <- inv.logit(predict(glmm.M35, df.fit, allow.new.levels = TRUE))
auc(roc(pred.fit, as.factor(df.fit$pres)))         # 0.875 - why so diff than auc35??????????????????????????????

# alternative ways to get it - still different
auc(roc(as.numeric(pred.list$predictions[[1]]), as.factor(pred.list$observed[[1]])))
mean(unlist(performance(p35, "auc")@y.values))
	# Function for getting bootstrapped glmer predictions in parallel
	glmmBoot <- function(dat, form, R, nc){
	# dat = data for glmer (lme4) logistic regression
	# form = formula of glmer equation for fitting
	# R = total number of bootstrap draws - should be multiple of nc b/c divided among cores evenly
	# nc = number of cores to use in parallel

	library(parallel)
	cl <- makeCluster(nc) # Request # cores
	clusterExport(cl, c("dat", "form", "nc", "R"), envir = environment()) # Make these available to each core
	clusterSetRNGStream(cl = cl, 1259)

	out <- clusterEvalQ(cl, {
	library(lme4)
	library(boot) # used for the inv.logit function
	b.star <- NULL
	pres.star <- NULL
	pred.star <- NULL
	n <- length(dat$pres)
	for(draw in 1:(R/nc)){ # within each core draw R/nc samples
	df.star <- dat[sample(1:n, size=n, replace=T), ] # bootstrap data
	mod <- glmer(form, family = binomial(link = "logit"), data = df.star, control = glmerControl(optimizer="bobyqa")) # regression on each bootstrapped data set
	b.star <- rbind(b.star, coef(mod)) # combine coefficients from each draw (not currently used)
	pres.star <- rbind(pres.star, df.star$pres) # combine "observed" presence-absences from each bootstrapped data iteration
	pred.star <- rbind(pred.star, inv.logit(predict(mod, df.star, allow.new.levels = TRUE))) # # combine predicted presence probability from each draw
	}

	# make into lists formatted for the ROCR package
	lab <- list(pres.star)
	for(i in 1:dim(pres.star)[1]){
	lab[[i]] <- as.integer(pres.star[i, ])
	}

	pred <- list(pred.star)
	for(i in 1:dim(pred.star)[1]){
	pred[[i]] <- as.numeric(pred.star[i,])
	}

	pred.all <- list(pred, lab)
	names(pred.all) <- c("predictions", "observed")

	return(pred.all)
	}) # end cluster call
	stopCluster(cl)

	assign("out", out, envir = .GlobalEnv) # allow access outside function to help debugging

	# combine lists from each core to format for ROCR
	lab1 <- out[[1]]$observed
	for(i in 2:length(out)){
	foo <- out[[i]]$observed
	lab1 <- c(lab1, foo)
	}

	pred1 <- out[[1]]$predictions
	for(i in 2:length(out)){
	foo <- out[[i]]$predictions
	pred1 <- c(pred1, foo)
	}

	pred.mod <- list(pred1, lab1)
	names(pred.mod) <- c("predictions", "observed")

	return(pred.mod)
	} # end function

	# Run function for model glmm.M35
	system.time(pred.M35 <- glmmBoot(dat=df.fit, form=formula(glmm.M35), R=9, nc=3)) # takes ~2 min with 9 draws on 3 cores

	library(ROCR)
	p35 <- prediction(pred.M35$predictions, pred.M35$observed)
	perf35 <- performance(p35, "tpr", "fpr")

	par(mfrow = c(1,1))
	plot(perf35, lty=3, col="blue")
	abline(0, 1)
	plot(perf35, avg="vertical", lwd=2, col="black", spread.estimate="stderror", plotCI.lwd=2, add=TRUE)

	# function to summarize AUC across all bootstrap draws
	aucSummary <- function(pred.list, Mean=T, CI=c(0.025, 0.975), SE=F){
	library(AUC)
	auc.out <- NULL
	for(i in 1:length(pred.list$predictions)) {
	auc.out[i] <- auc(roc(as.numeric(pred.list$predictions[[i]]), as.factor(pred.list$observed[[i]])))
	}
	auc.CI <- quantile(auc.out, probs=CI)
	auc.mean <- NA
	auc.se <- NA
	if(Mean == T){
	auc.mean <- mean(auc.out)
	}
	if(SE == T) {
	auc.se <- sd(auc.out)/sqrt(length(auc.out))
	}
	auc.summary <- data.frame(auc.mean, auc.CI[1], auc.CI[2], auc.se)
	names(auc.summary) <- c('Mean', 'LCI', 'UCI', 'SE')
	row.names(auc.summary) <- deparse(substitute(pred.list))
	return(auc.summary)
	} # end function

	auc35 <- aucSummary(pred.M35, SE=T)

	pred.fit <- inv.logit(predict(glmm.M35, df.fit, allow.new.levels = TRUE))
	auc(roc(pred.fit, as.factor(df.fit$pres))) # 0.875 - why so diff than auc35??????????????????????????????

	# alternative ways to get it - still different
	auc(roc(as.numeric(pred.list$predictions[[1]]), as.factor(pred.list$observed[[1]])))
	mean(unlist(performance(p35, "auc")@y.values))