# quantile g-computation with a suppressed intercept
# author: @alexpkeil1 (GitHub gist ff169e6933de2bd2f302f1fd6423e74e, last active July 1, 2021)
library(qgcomp)
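# This file defines no-intercept ("noint") variants of the qgcomp fitting functions:
#   msm.fit.noint()       - underlying fit: conditional GLM plus a marginal structural model (MSM) with its intercept suppressed
#   qgcomp.boot.noint()   - bootstrap-based quantile g-computation built on msm.fit.noint()
#   qgcomp.noboot.noint() - non-bootstrap quantile g-computation (psi = sum of exposure coefficients)
#   print.qgcompintfit()  - print method for the returned "qgcompintfit" objects
# Optional dependencies: 'arm' for bayes=TRUE; 'future' and 'future.apply' for parallel=TRUE.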
msm.fit.noint <- function(f,
qdata,
intvals,
expnms,
rr=TRUE,
main=TRUE,
degree=1,
id=NULL,
weights,
bayes=FALSE,
MCsize=nrow(qdata), ...){
newform <- terms(f, data = qdata)
nobs = nrow(qdata)
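# rebuild the call as a stats::model.frame() call so that any user-supplied
# `weights` argument is evaluated against the data and recovered as a numeric vector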
thecall <- match.call(expand.dots = FALSE)
names(thecall) <- gsub("qdata", "data", names(thecall))
names(thecall) <- gsub("f", "formula", names(thecall))
m <- match(c("formula", "data", "weights", "offset"), names(thecall), 0L)
hasweights = ifelse("weights" %in% names(thecall), TRUE, FALSE)
thecall <- thecall[c(1L, m)]
thecall$drop.unused.levels <- TRUE
thecall[[1L]] <- quote(stats::model.frame)
thecalle <- eval(thecall, parent.frame())
if(hasweights){
qdata$weights <- as.vector(model.weights(thecalle))
} else qdata$weights = rep(1, nobs)
if(is.null(id)) {
id <- "id__"
qdata$id__ <- seq_len(dim(qdata)[1])
}
# conditional outcome regression fit
nidx = which(!(names(qdata) %in% id))
if(!bayes) fit <- glm(newform, data = qdata,
weights=weights,
...)
if(bayes){
requireNamespace("arm")
fit <- bayesglm(f, data = qdata[,nidx,drop=FALSE],
weights=weights,
...)
}
if(fit$family$family %in% c("gaussian", "poisson")) rr=FALSE
###
# get predictions (set exposure to 0,1,...,q-1)
if(is.null(intvals)){
intvals <- (seq_len(length(table(qdata[expnms[1]])))) - 1
}
predit <- function(idx, newdata){
#newdata <- qdata
newdata[,expnms] <- idx
suppressWarnings(predict(fit, newdata=newdata, type='response'))
}
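# if MCsize equals the number of rows, use the quantized data as-is; otherwise
# resample ids with replacement to form a Monte Carlo sample of size MCsize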
if(MCsize==nrow(qdata)){
newdata <- qdata
}else{
newids <- data.frame(temp=sort(sample(unique(qdata[,id, drop=TRUE]), MCsize,
#probs=weights, #bootstrap sampling with weights works with fixed weights, but not time-varying weights
replace = TRUE
)))
names(newids) <- id
newdata <- merge(qdata,newids, by=id, all.x=FALSE, all.y=TRUE)[seq_len(MCsize),]
}
predmat = lapply(intvals, predit, newdata=newdata)
# fit MSM using g-computation estimates of expected outcomes under joint
# intervention
#nobs <- dim(qdata)[1]
msmdat <- data.frame(
cbind(
Ya = unlist(predmat),
psi = rep(intvals, each=MCsize),
weights = rep(newdata$weights, times=length(intvals))
#times=length(table(qdata[expnms[1]])))
)
)
# to do: allow functional form variations for the MSM via specifying the model formula
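# note: the `-1` in the MSM formulas below suppresses the intercept, which is the
# change this gist makes relative to the standard qgcomp MSM fit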
if(bayes){
if(!rr) suppressWarnings(msmfit <- bayesglm(Ya ~ -1 + poly(psi, degree=degree, raw=TRUE), data=msmdat,
weights=weights, x=TRUE,
...))
if(rr) suppressWarnings(msmfit <- bayesglm(Ya ~ -1 + poly(psi, degree=degree, raw=TRUE), data=msmdat,
family=binomial(link='log'), start=rep(-0.0001, degree+1),
weights=weights, x=TRUE))
}
if(!bayes){
if(!rr) suppressWarnings(msmfit <- glm(Ya ~ -1 + poly(psi, degree=degree, raw=TRUE), data=msmdat,
weights=weights, x=TRUE,
...))
if(rr) suppressWarnings(msmfit <- glm(Ya ~ -1 + poly(psi, degree=degree, raw=TRUE), data=msmdat,
family=binomial(link='log'), start=rep(-0.0001, degree+1),
weights=weights, x=TRUE))
}
res <- list(fit=fit, msmfit=msmfit)
if(main) {
res$Ya <- msmdat$Ya # expected outcome under joint exposure, by gcomp
res$Yamsm <- predict(msmfit, type='response')
res$Yamsml <- predict(msmfit, type="link")
res$A <- msmdat$psi # joint exposure (0 = all exposures set to the category whose
# upper cut-point is the first quantile)
}
res
}
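# bootstrap-based quantile g-computation with a suppressed MSM intercept;
# analogous to qgcomp::qgcomp.boot(), but it calls msm.fit.noint() and keeps
# every MSM coefficient as a psi parameter (there is no intercept to drop)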
qgcomp.boot.noint <- function(f,
data,
expnms=NULL,
q=4,
breaks=NULL,
id=NULL,
weights,
alpha=0.05,
B=200,
rr=TRUE,
degree=1,
seed=NULL,
bayes=FALSE,
MCsize=nrow(data),
parallel=FALSE, ...){
oldq = NULL
if(is.null(seed)) seed = round(runif(1, min=0, max=1e8))
newform <- terms(f, data = data)
class(newform) <- "formula"
nobs = nrow(data)
origcall <- thecall <- match.call(expand.dots = FALSE)
names(thecall) <- gsub("f", "formula", names(thecall))
m <- match(c("formula", "data", "weights", "offset"), names(thecall), 0L)
hasweights = ifelse("weights" %in% names(thecall), TRUE, FALSE)
thecall <- thecall[c(1L, m)]
thecall$drop.unused.levels <- TRUE
thecall[[1L]] <- quote(stats::model.frame)
thecalle <- eval(thecall, parent.frame())
if(hasweights){
data$weights <- as.vector(model.weights(thecalle))
} else data$weights = rep(1, nobs)
if (is.null(expnms)) {
#expnms <- attr(terms(f, data = data), "term.labels")
expnms <- attr(newform, "term.labels")
message("Including all model terms as exposures of interest\n")
}
lin = qgcomp:::checknames(expnms)
if(!lin) stop("Model appears to be non-linear and I'm having trouble parsing it:
please use `expnms` parameter to define the variables making up the exposure")
if (!is.null(q) & !is.null(breaks)){
# if user specifies breaks, prioritize those
oldq = q
q <- NULL
}
if (!is.null(q) | !is.null(breaks)){
ql <- quantize(data, expnms, q, breaks)
qdata <- ql$data
br <- ql$breaks
if(is.null(q)){
# rare scenario with user specified breaks and q is left at NULL
nvals <- length(br[[1]])-1
} else{
nvals <- q
}
intvals <- (seq_len(nvals))-1
} else {
# if( is.null(breaks) & is.null(q)) # also includes NA
qdata <- data
# if no transformation is made (no quantiles, no breaks given)
# then draw distribution values from quantiles of all the exposures
# pooled together
# to do: allow user specification of this
nvals = length(table(unlist(data[,expnms])))
if(nvals < 10){
message("\nNote: using all possible values of exposure as the
intervention values\n")
p = length(expnms)
intvals <- as.numeric(names(table(unlist(data[,expnms]))))
br <- lapply(seq_len(p), function(x) c(-1e16, intvals[2:nvals]-1e-16, 1e16))
}else{
message("\nNote: using quantiles of all exposures combined in order to set
proposed intervention values for overall effect (25th, 50th, 75th %ile)
You can ensure this is valid by scaling all variables in expnms to have similar ranges.")
intvals = as.numeric(quantile(unlist(data[,expnms]), c(.25, .5, .75)))
br <- NULL
}
}
if(is.null(id)) {
id <- "id__"
qdata$id__ <- seq_len(dim(qdata)[1])
}
###
msmfit <- msm.fit.noint(newform, qdata, intvals, expnms, rr, main=TRUE,degree=degree, id=id,
weights,
bayes,
MCsize=MCsize,
...)
# main estimate
#estb <- as.numeric(msmfit$msmfit$coefficients[-1])
estb <- as.numeric(msmfit$msmfit$coefficients)
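# with the intercept suppressed, every MSM coefficient is a psi parameter, so none
# are dropped here (the commented line above is the with-intercept version)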
#bootstrap to get std. error
nobs <- dim(qdata)[1]
nids <- length(unique(qdata[,id, drop=TRUE]))
starttime = Sys.time()
psi.only <- function(i=1, f=f, qdata=qdata, intvals=intvals, expnms=expnms, rr=rr, degree=degree,
nids=nids, id=id,
weights,MCsize=MCsize,
...){
if(i==2 & !parallel){
timeiter = as.numeric(Sys.time() - starttime)
if((timeiter*B/60)>0.5) message(paste0("Expected time to finish: ", round(B*timeiter/60, 2), " minutes \n"))
}
bootids <- data.frame(temp=sort(sample(unique(qdata[,id, drop=TRUE]), nids,
replace = TRUE
)))
names(bootids) <- id
qdata_ <- merge(qdata,bootids, by=id, all.x=FALSE, all.y=TRUE)
ft = msm.fit.noint(f, qdata_, intvals, expnms, rr, main=FALSE, degree, id, weights=weights, bayes, MCsize=MCsize,
...)
yhatty = data.frame(yhat=predict(ft$msmfit), psi=ft$msmfit$data[,"psi"])
as.numeric(
# the yhat estimates will be identical across individuals due to this being a marginal model
c(ft$msmfit$coefficients, with(yhatty, tapply(yhat, psi, mean)))
)
}
set.seed(seed)
if(parallel){
#Sys.setenv(R_FUTURE_SUPPORTSMULTICORE_UNSTABLE="quiet")
future::plan(strategy = future::multisession)
bootsamps <- future.apply::future_lapply(X=seq_len(B), FUN=psi.only,f=f, qdata=qdata, intvals=intvals,
expnms=expnms, rr=rr, degree=degree, nids=nids, id=id,
weights=qdata$weights,MCsize=MCsize,
future.seed=TRUE,
...)
future::plan(strategy = future::transparent)
}else{
bootsamps <- lapply(X=seq_len(B), FUN=psi.only,f=f, qdata=qdata, intvals=intvals,
expnms=expnms, rr=rr, degree=degree, nids=nids, id=id,
weights=weights, MCsize=MCsize,
...)
}
bootsamps = do.call("cbind", bootsamps)
# these are the linear predictors
hats = t(bootsamps[-c(seq_len(degree)),])
# covariance of the linear predictors
cov.yhat = cov(hats)
bootsamps = bootsamps[seq_len(degree),, drop=FALSE]
seb <- apply(bootsamps, 1, sd)
covmat <- cov(t(bootsamps))
colnames(covmat) <- rownames(covmat) <- names(estb) <- c(paste0("psi", seq_len(nrow(bootsamps))))
tstat <- estb / seb
df <- nobs - length(attr(terms(f, data = data), "term.labels")) - 1 - degree # df based on obs - gcomp terms - msm terms
pval <- 2 - 2 * pt(abs(tstat), df = df)
pvalz <- 2 - 2 * pnorm(abs(tstat))
ci <- cbind(estb + seb * qnorm(alpha / 2), estb + seb * qnorm(1 - alpha / 2))
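# confidence intervals are normal-approximation intervals based on the bootstrap
# standard errors; t- or z-statistics are reported below depending on the outcome family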
# outcome 'weights' not applicable in this setting, generally (i.e. if using this function for non-linearity,
# then weights will vary with level of exposure)
if (!is.null(oldq)){
q = oldq
}
qx <- qdata[, expnms]
res <- list(
qx = qx, fit = msmfit$fit, msmfit = msmfit$msmfit,
psi = estb, var.psi = seb ^ 2, covmat.psi=covmat, ci = ci,
coef = estb, var.coef = seb ^ 2, covmat.coef=covmat, ci.coef = ci,
expnms=expnms, q=q, breaks=br, degree=degree,
pos.psi = NULL, neg.psi = NULL,
pos.weights = NULL, neg.weights = NULL, pos.size = NULL,neg.size = NULL, bootstrap=TRUE,
y.expected=msmfit$Ya, y.expectedmsm=msmfit$Yamsm, index=msmfit$A,
bootsamps = bootsamps,
cov.yhat=cov.yhat,
alpha=alpha,
call=origcall
)
if(msmfit$fit$family$family=='gaussian'){
res$tstat <- tstat
res$df <- df
res$pval <- pval
}
if(msmfit$fit$family$family %in% c('binomial', 'poisson')){
res$zstat <- tstat
res$pval <- pvalz
}
attr(res, "class") <- c("qgcompintfit", "qgcompfit")
res
}
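# non-bootstrap quantile g-computation with a suppressed intercept: psi is the sum
# of the exposure coefficients from a single GLM fit to the quantized data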
qgcomp.noboot.noint <- function(f,
data,
expnms=NULL,
q=4,
breaks=NULL,
id=NULL,
weights,
alpha=0.05,
bayes=FALSE,
...){
newform <- terms(f, data = data)
nobs = nrow(data)
origcall <- thecall <- match.call(expand.dots = FALSE)
names(thecall) <- gsub("f", "formula", names(thecall))
m <- match(c("formula", "data", "weights", "offset"), names(thecall), 0L)
#m <- match(c("f", "data", "weights", "offset"), names(thecall), 0L)
hasweights = ifelse("weights" %in% names(thecall), TRUE, FALSE)
thecall <- thecall[c(1L, m)]
thecall$drop.unused.levels <- TRUE
thecall[[1L]] <- quote(stats::model.frame)
thecalle <- eval(thecall, parent.frame())
if(hasweights){
data$weights <- as.vector(model.weights(thecalle))
} else data$weights = rep(1, nobs)
if (is.null(expnms)) {
#expnms <- attr(terms(f, data = data), "term.labels")
expnms <- attr(newform, "term.labels")
message("Including all model terms as exposures of interest\n")
}
lin = qgcomp:::checknames(expnms)
if(!lin) stop("Model appears to be non-linear: use qgcomp.boot instead")
if (!is.null(q) | !is.null(breaks)){
ql <- quantize(data, expnms, q, breaks)
qdata <- ql$data
br <- ql$breaks
} else{
qdata <- data
br <- breaks
}
if(is.null(id)) {
# not yet implemented
id = "id__"
qdata$id__ = seq_len(dim(qdata)[1])
}
if(!bayes) fit <- glm(newform, data = qdata,
weights=weights,
...)
if(bayes){
requireNamespace("arm")
fit <- bayesglm(newform, data = qdata,
weights=weights,
...)
}
#if(!bayes) fit <- glm(f, data = qdata[,!(names(qdata) %in% id), drop=FALSE],
# #weights=weights,
# ...)
#if(bayes){
# requireNamespace("arm")
# fit <- bayesglm(f, data = qdata[,!(names(qdata) %in% id), drop=FALSE],
# #weights=weights,
# ...)
#}
mod <- summary(fit)
if(length(setdiff(expnms, rownames(mod$coefficients)))>0){
stop("Model aliasing occurred, likely due to perfectly correlated quantized exposures.
Try one of the following:
1) set 'bayes' to TRUE in the qgcomp function (recommended)
2) set 'q' to a higher value in the qgcomp function (recommended)
3) check correlation matrix of exposures, and drop all but one variable in each highly correlated set (not recommended)
")
}
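# psi is the sum of the exposure coefficients; its standard error comes from
# qgcomp:::se_comb(), a delta-method SE for that sum computed from the scaled
# coefficient covariance matrix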
estb <- c(sum(mod$coefficients[expnms,1, drop=TRUE]))
seb <- c(qgcomp:::se_comb(expnms, covmat = mod$cov.scaled))
tstat <- estb / seb
df <- mod$df.null - length(expnms)
pval <- 2 - 2 * pt(abs(tstat), df = df)
pvalz <- 2 - 2 * pnorm(abs(tstat))
ci <- cbind(estb + seb * qnorm(alpha / 2), estb + seb * qnorm(1 - alpha / 2))
# 'weights'
wcoef <- fit$coefficients[expnms]
names(wcoef) <- gsub("_q", "", names(wcoef))
pos.coef <- which(wcoef > 0)
neg.coef <- which(wcoef <= 0)
pos.weights <- abs(wcoef[pos.coef]) / sum(abs(wcoef[pos.coef]))
neg.weights <- abs(wcoef[neg.coef]) / sum(abs(wcoef[neg.coef]))
# 'post-hoc' positive and negative estimators
# similar to constrained gWQS
pos.psi <- sum(wcoef[pos.coef])
neg.psi <- sum(wcoef[neg.coef])
#nmpos <- names(pos.weights)
#nmneg <- names(neg.weights)
#se.pos.psi <- se_comb(nmpos, covmat = mod$cov.scaled)
#se.neg.psi <- se_comb(nmneg, covmat = mod$cov.scaled)
qx <- qdata[, expnms]
names(qx) <- paste0(names(qx), "_q")
names(estb) <- c("psi1")
res <- list(
qx = qx, fit = fit,
psi = estb, var.psi = seb ^ 2, covmat.psi=c('psi1' = seb^2), ci = ci,
coef = estb, var.coef = seb ^ 2,
#covmat.coef=c('(Intercept)' = seb[1]^2, 'psi1' = seb[2]^2),
#covmat.coef=vc_comb(aname="(Intercept)", expnms=expnms, covmat = mod$cov.scaled),
covmat.coef=NULL,
ci.coef = ci,
expnms=expnms, q=q, breaks=br, degree=1,
pos.psi = pos.psi, neg.psi = neg.psi,
pos.weights = sort(pos.weights, decreasing = TRUE),
neg.weights = sort(neg.weights, decreasing = TRUE),
pos.size = sum(abs(wcoef[pos.coef])),
neg.size = sum(abs(wcoef[neg.coef])),
bootstrap=FALSE,
cov.yhat=NULL,
alpha=alpha,
call=origcall
)
if(fit$family$family=='gaussian'){
res$tstat <- tstat
res$df <- df
res$pval <- pval
}
if(fit$family$family %in% c('binomial', 'poisson')){
res$zstat <- tstat
res$pval <- pvalz
}
attr(res, "class") <- c("qgcompintfit", "qgcompfit")
res
}
#coef.qgcompintfit <- coef.qgcompfit
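# print method for "qgcompintfit" objects, adapted from qgcomp's print method so
# that all coefficients are labeled psi1, psi2, ... with no intercept row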
print.qgcompintfit <- function (x, showweights = TRUE, ...)
{
fam <- x$fit$family$family
if (inherits(x, "ziqgcompfit")) {
stop("not implemented")
}
if (!is.null(x$pos.size) & showweights) {
cat(paste0("Scaled effect size (positive direction, sum of positive coefficients = ",
signif(x$pos.size, 3), ")\n"))
if (length(x$pos.weights) > 0) {
print(x$pos.weights, digits = 3)
}
else cat("None\n")
cat("\n")
}
if (!is.null(x$neg.size) & showweights) {
cat(paste0("Scaled effect size (negative direction, sum of negative coefficients = ",
signif(-x$neg.size, 3), ")\n"))
if (length(x$neg.weights) > 0) {
print(x$neg.weights, digits = 3)
}
else cat("None\n")
cat("\n")
}
if (fam == "binomial") {
estimand <- "OR"
if (x$bootstrap && x$msmfit$family$link == "log")
estimand = "RR"
cat(paste0("Mixture log(", estimand, ")", ifelse(x$bootstrap,
" (bootstrap CI)", " (Delta method CI)"), ":\n\n"))
testtype = "Z"
rnm = c(c(paste0("psi", 1:max(1, length(coef(x))))))
}
if (fam == "poisson") {
estimand <- "RR"
cat(paste0("Mixture log(", estimand, ")", ifelse(x$bootstrap,
" (bootstrap CI)", " (Delta method CI)"), ":\n\n"))
testtype = "Z"
rnm = c(c(paste0("psi", 1:max(1, length(coef(x))))))
}
if (fam == "gaussian") {
cat(paste0("Mixture slope parameters", ifelse(x$bootstrap,
" (bootstrap CI)", " (Delta method CI)"), ":\n\n"))
testtype = "t"
x$zstat = x$tstat
rnm = c(c(paste0("psi", 1:max(1, length(coef(x))))))
}
if (fam == "cox") {
stop("not implemented")
}
if (!(fam %in% c("poisson", "binomial", "cox", "gaussian"))) {
warning(paste0("The ", fam, " distribution has not been tested with qgcomp! Please use with extreme caution\n and check results thoroughly with simulated data to ensure it works."))
}
plab = ifelse(testtype == "Z", "Pr(>|z|)", "Pr(>|t|)")
if (is.null(dim(x$ci.coef))) {
pdat <- cbind(Estimate = coef(x), `Std. Error` = sqrt(x$var.coef),
`Lower CI` = x$ci.coef[1], `Upper CI` = x$ci.coef[2],
test = x$zstat, pval = x$pval)
}
else {
pdat <- cbind(Estimate = coef(x), `Std. Error` = sqrt(x$var.coef),
`Lower CI` = x$ci.coef[, 1], `Upper CI` = x$ci.coef[,
2], test = x$zstat, pval = x$pval)
}
colnames(pdat)[which(colnames(pdat) == "test")] = eval(paste(testtype,
"value"))
colnames(pdat)[which(colnames(pdat) == "pval")] = eval(paste(plab))
rownames(pdat) <- rnm
printCoefmat(pdat, has.Pvalue = TRUE, tst.ind = 5L, signif.stars = FALSE,
cs.ind = 1L:2)
invisible(x)
}
# quick simulation examples
# the true intercept is not zero: the methods should give different answers, and suppressing the intercept yields bias
dat <- qgcomp:::.dgm_quantized(N=200, b0=0, coef=c(.1, .1))
dat$y = dat$y + 10
lm(y ~ x1 + x2, data = dat)
lm(y ~ -1 + x1 + x2, data = dat)
(qf <- qgcomp.noboot(y ~ x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4))
#(qfb <- qgcomp.noboot.noint(y ~ x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4)) # this will just report estimates without intercepts
(qfnoint <- qgcomp.noboot.noint(y ~ -1 + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4))
(qfbnoint <- qgcomp.boot.noint(y ~ -1 + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4))
# now the true intercept is zero: the answers will be similar, and suppressing the intercept will improve precision
set.seed(1232)
dat <- qgcomp:::.dgm_quantized(N=500, b0=0, coef=c(.1, .1))
lm(y ~ x1 + x2, data = dat)
lm(y ~ -1 + x1 + x2, data = dat)
(qf <- qgcomp.noboot(y ~ x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4))
(qfnoint <- qgcomp.noboot.noint(y ~ -1 + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4))
(qfbnoint <- qgcomp.boot.noint(y ~ -1 + x1 + x2, expnms = c('x1', 'x2'), data=dat, q=4))
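# a minimal sketch of pulling estimates out of the fitted objects; the field names
# below come from the `res` lists constructed above
qfnoint$psi   # mixture slope (sum of exposure coefficients) from the no-bootstrap fit
qfnoint$ci    # corresponding confidence interval
qfbnoint$psi  # MSM slope estimate(s) from the bootstrap fit
qfbnoint$ci   # normal-approximation bootstrap confidence interval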