jgub/R_Startup.R

## R_Startup.R
###########################################
###First, some code to update all packages at startup:
local({r <- getOption("repos")
      r["CRAN"] <- "https://cran.revolutionanalytics.com/"
      options(repos=r)
      })

library(utils)
options(pkgType="binary")
update.packages()


#############################################
###Now to load the packages I most often use:
library(foreign)
library(haven)
library(readstata13)
library(ggplot2)
library(psych)
library(multilevel)
library(stargazer)
library(car)
library(lmtest)
library(xtable)
library(MASS)
library(sandwich)
library(RItools)
library(multiwayvcov)
library(prediction)
library(margins)


###############################################
###############################################
robustse.f <- function(model, cluster, df_correction) {
  ## Huber-White heteroskedasticity-robust standard error calculation and table generation code for lm and glm models in R.
  ##Written by Joshua Gubler ~  http://joshuagubler.com.  Note that the second half of this function is just a wrapper for the excellent "multiwaycov" package here: https://cran.r-project.org/web/packages/multiwayvcov/multiwayvcov.pdf .  Love the authors of that package...
  ##Last updated: 3 July 2017

  #model = the model you estimated, now calculated with robust standard errors
  #cluster = the name of the variable on which you will cluster. Put a tilda in front of it (e.g. ~ state).  If you don't put in a cluster, you will simply get huber-white robust errors.
  #df_correction: If you do not want the number of levels in your cluster variable to count against your degrees of freedom (like the xt- options in Stata), then type "F".  Otherwise, type "T" and these levels will be counted against your degrees of freedom

  require(sandwich)
  require(lmtest)
  require(multiwayvcov)
  if(missing(cluster)) {
    name <- deparse(substitute(model))
    modelname <- paste(name,"rob",sep=".")
    model$se <- coeftest(model, vcov=vcovHC(model,"HC1"))[,2]
    model$vcovHC <- vcovHC(model,"HC1")
    assign(modelname,model,envir = .GlobalEnv)
    coeftest(model, vcov=vcovHC(model,"HC1"))
  } else {
    name <- deparse(substitute(model))
    modelname <- paste(name,"rob",sep=".")
    vcovCL <- cluster.vcov(model, cluster, df_correction = df_correction)
    model$vcovCL <- vcovCL
    modelname <- paste(name,"clustrob",sep=".")
    model$se <- coeftest(model, vcovCL)[,2]
    assign(modelname,model,envir = .GlobalEnv)
    coeftest(model, vcovCL)
  }
}


###############################################
###############################################
TwowayME.f <- function(M,X,Z,xlab,ylab,level,rob,cluster,df_correction,hist,Low,High)
{
  ## A function to generate 2-way marginal effects plots in R.
  ## Written by Joshua Gubler ~  http://joshuagubler.com
  ## Last modified: 17 August 2017 -- Made it such that the function can now handle a dichotomous Z, merging the old TwowayDME.f function into this function.
  ## Variables must be in the following order: y = x z (control variables here) xz. The model can include as many control variables as you need.
  ## M = an object of type "lm," "glm," or other estimation -- i.e. the object that contains the regression estimation you seek to plot.
  ## X = the variable whose marginal effect on Y you seek to plot
  ## Z = the moderating variable (will be positioned on the X-axis of the plot).  If Z is dichotomous, the function will plot these points with confidence intervals, otherwise Z will be plotted continuously.
  ## xlab = label for x-axis (in quotes)
  ## ylab = label for y-axis (in quotes)
  ## level = to set the confidence level. Two options (don't put these in quotes): 95, 90.  If you do not specify either option, the confidence
  ## intervals will not be correct.
  ## rob: if you desire huber-white robust standard errors
  ## cluster: the name of the variable you desire to cluster on, with a tilda in front (e.g.: ~ state).  Leave "rob" blank when including this.
  ##df_correction: If you do not want the number of levels in your cluster variable to count against your degrees of freedom (like the xt- options in Stata), then type "F".  Otherwise, type "T" and these levels will be counted against your degrees of freedom.  Leave this blank if "cluster" is blank.
  ## hist: This will show a density distribution of the varible on the x-axis.  If you'll leave it blank, you'll get a rug plot for the same. This is only used for non-dichotomous Z variables.
  ##Low and High: these are only for use when you have a dichotomous Z variable: they allow you to input the names (in quotes) that you would like for these two points in Z.  Omit these otherwise.

  #Example code:
  #Without robust standard errors; including histogram: TwowayME.f(test.lm,df$effect,df$moderator,"Levels of Moderator","ME of effect on Y",95,,,,hist)
  #With robust standard errors; with rugplot: TwowayME.f(test.lm,df$effect,df$moderator,"Levels of Moderator","ME of effect on Y",95,rob,,,)
  #With cluster-robust standard errors; no dfc correction; with histogram: TwowayME.f(test.lm,df$effect,df$moderator,"Levels of Moderator","ME of effect on Y",95,,~clustervar,T,hist)

  require(multiwayvcov)

  if(length(na.omit(unique(Z)))>2){
    S <- summary(M)
    N <- c(1:20)

    ## 20 equally-spaced values for the moderating variable
    zmin <- rep(min(Z, na.rm = TRUE), 20)
    zmax <- rep(max(Z, na.rm = TRUE), 20)
    Znew <- (((N - 1) / (20 - 1)) * (zmax - zmin)) + zmin

    ## Grab elements of coefficient and vcov matrix
    H <- head(S$coefficients, 3)
    T <- tail(S$coefficients, 1)
    b <- rbind(H, T)
    if(missing(cluster)){
      if(missing(rob)){Vcov <- vcov(M)}
      else{
        Vcov <- vcovHC(M,"HC1")
      }
    } else{
      Vcov <- cluster.vcov(M, cluster, df_correction = df_correction)
    }
    Vcov <- as.data.frame(Vcov)
    Vcov1 <- Vcov[, c(1:3)]
    Vcov2 <- Vcov[, -c(0:0 - length(Vcov))]
    Vcov <- cbind(Vcov1, Vcov2)
    Vh <- head(Vcov, 3)
    Vt <- tail(Vcov, 1)
    V <- rbind(Vh, Vt)
    b1 <- b[2, 1]
    b3 <- b[4, 1]
    varb1 <- V[2, 2]
    varb3 <- V[4, 4]
    covb1b3 <- V[4, 2]

    ## Calculate ME values
    conb <- b1 + b3 * Znew

    ## Calculate standard errors
    conse <- sqrt(varb1 + varb3 * (Znew^2) + 2 * covb1b3 * Znew)

    ## Upper and lower CIs
    ci <- NA
    ci[level == 95] <- qt(.975,M$df.residual)
    ci[level == 90] <- qt(.95,M$df.residual)
    a = ci * conse
    upper = conb + a
    lower = conb - a

    ##Graph the results
    if(missing(hist)){
      plot(c(Znew,Znew,Znew), c(a,upper,lower), type="n",xlab=xlab,ylab=ylab)+rug(jitter(Z))
      lines(Znew,conb,col="black")
      lines(Znew,upper,col="black",lty=2)
      lines(Znew,lower,col="black",lty=2)
      abline(h=0)
    }else{
      hist(Z, axes=F, xlab="", ylab="",main="", col="light gray")
      par(new=TRUE)
      plot(c(Znew,Znew,Znew), c(a,upper,lower), type="n",xlab=xlab,ylab=ylab)
      lines(Znew,conb,col="black")
      lines(Znew,upper,col="black",lty=2)
      lines(Znew,lower,col="black",lty=2)
      abline(h=0)
    }
  }else{
    S <- summary(M)

    ##To create the high/low dichotomous variable
    Z0 <- quantile(Z,   0, na.rm=TRUE)
    Z1 <- quantile(Z,   1, na.rm=TRUE)

    ## Grab elements of coefficient and vcov matrix
    require(sandwich)
    H <- head(S$coefficients,3)
    T <- tail(S$coefficients,1)
    b <- rbind(H,T)
    if(missing(cluster)){
      if(missing(rob)){Vcov <- vcov(M)}
      else{
        Vcov <- vcovHC(M,"HC1")
      }
    } else{
      Vcov <- cluster.vcov(M, cluster, df_correction = df_correction)
    }
    Vcov <- as.data.frame(Vcov)
    Vcov1 <- Vcov[,c(1:3)]
    Vcov2 <- Vcov[,-c(0:0-length(Vcov))]
    Vcov <- cbind(Vcov1,Vcov2)
    Vh <- head(Vcov,3)
    Vt <- tail(Vcov,1)
    V <- rbind(Vh,Vt)
    b1 <- b[2,1]
    b3 <- b[4,1]
    varb1 <- V[2,2]
    varb3 <- V[4,4]
    covb1b3 <- V[4,2]

    ## Calculate ME values
    conb0 <- b1+b3*Z0
    conb1 <- b1+b3*Z1

    ## Calculate standard errors
    conse0 <- sqrt(varb1 + varb3*(Z0^2) + 2*covb1b3*Z0)
    conse1 <- sqrt(varb1 + varb3*(Z1^2) + 2*covb1b3*Z1)

    ## Upper and lower CIs
    ci <- NA
    ci[level == 95] <- qt(.975,M$df.residual)
    ci[level == 90] <- qt(.95,M$df.residual)
    a0 = ci*conse0
    a1 = ci*conse1
    upper0 = conb0 + a0
    lower0 = conb0 - a0
    upper1 = conb1 + a1
    lower1 = conb1 - a1

    ##Graph the results
    require(ggplot2)
    margplot <- qplot(
      c(Low,High),
      c(conb0,conb1),
      xlab=xlab,
      ylab=ylab
    ) +
      geom_point() +
      theme_bw()

    margplot + geom_linerange(aes(ymin=c(lower0,lower1),ymax=c(upper0,upper1))) + geom_abline(intercept=0,slope=0)
  }
}

###############################################
###############################################
###Three-way Marginal effects function:

ThreewayME.f <- function(M,X,Z,W,xlab,ylab,lloc,Min,Q1,Mean,Q3,Max,level,rob,hist)
{
  ## A function to generate 3-way marginal effects plots in R with stars for set confidence levels.
  ## Written by Joshua Gubler ~  http://joshuagubler.com
  ## Last updated: 26 April 2017
  ## Variables must be in the following order: y = x z w (control variables here) xz xw zw xzw .  Of course, the model can include as many control variables as you need; the following code specifically uses just the variables we want for the interaction.  As long as you get the first three variables in correct order, R will correctly order the interaction terms.
  ## M = an object of type "lm," "glm," or other estimation -- i.e. the object that contains the regression estimation you seek to plot
  ## X = the variable whose effect on Y you seek to plot
  ## Z = the first moderating variable (will be positioned on the X-axis of the plot)
  ## W = the second moderating variable (the lines on the plot)
  ## xlab = Label for x-axis (in quotes)
  ## ylab = label for y-axis (in quotes)
  ## lloc = location of the legend for the plot, use values like "bottomleft"
  ## Min, Q1, Mean, Q3, Max = titles for each of these quartiles to be put in the legend -- i.e. "Min Q88" (titles must be in quotes)
  ## level = to set the confidence level.  Two options (don't put these in quotes): 95, 90.  Stars will show on lines that are significant at the level you set.  If you do not put in either option, stars will show on all lines.
  ## rob = if you desire Huber-White robust standard errors
  ## hist = if hist, then you will get a density histogram of your variable on the X-axis.  If not, then you will get a rug plot of your variable on the X-axis

  ## Example: ThreewayME.f(estimation.lm,ses,edu,pop,"Education levels","Effect of SES on Civil War","bottomleft","Min Pop","1Q Pop","Mean Pop","3Q Pop","Max Pop",90,rob,hist)

  ##Note: this function can be altered easily to change the number of lines in "W", depending on how many one prefers (W could be made dichtomous, etc.)

  S <- summary(M)
  N <- c(1:20)

  # Create 20 equally spaced values in a vector between min and max on the Z variable
  zmin <- rep(min(Z,na.rm=TRUE), 20)
  zmax <- rep(max(Z, na.rm=TRUE), 20)
  Znew <- (((N-1)/(20-1))*(zmax-zmin))+zmin

  # Generate the values of W for which you want to calculate the marginal effect (and standard errors) of X on Y.  Note: the default is the following percentiles, but these can be changed.  If one deletes lines here, the corresponding lines will need to be deleted later in the function.
  W0 <- quantile(W,   0, na.rm=TRUE)
  W1 <- quantile(W, .25, na.rm=TRUE)
  W2 <- quantile(W, .50, na.rm=TRUE)
  W3 <- quantile(W, .75, na.rm=TRUE)
  W4 <- quantile(W,   1, na.rm=TRUE)

  # Grab elements of coefficient and vcov matrix.
  require(sandwich)
  H <- head(S$coefficients,4)
  T <- tail(S$coefficients,4)
  b <- rbind(H,T)
  if(missing(rob)){Vcov <- vcov(M)}
  else{
    Vcov <- vcovHC(M,"HC1")
  }
  Vcov <- as.data.frame(Vcov)
  Vcov1 <- Vcov[,c(1:4)]
  Vcov2 <- Vcov[,-c(3:0-length(Vcov))]
  Vcov <- cbind(Vcov1,Vcov2)
  Vh <- head(Vcov,4)
  Vt <- tail(Vcov,4)
  V <- rbind(Vh,Vt)

  b1 <- b[2,1]
  b4 <- b[5,1]
  b5 <- b[6,1]
  b7 <- b[8,1]

  varb1 <- V[2,2]
  varb4 <- V[5,5]
  varb5 <- V[6,6]
  varb7 <- V[8,8]

  covb1b4 <- V[5,2]
  covb1b5 <- V[6,2]
  covb1b7 <- V[8,2]
  covb4b5 <- V[6,5]
  covb4b7 <- V[8,5]
  covb5b7 <- V[8,6]

  # Calculate ME values for all levels of W
  conb0 <- b1+b4*Znew+b5*W0+b7*(Znew*W0)
  conb1 <- b1+b4*Znew+b5*W1+b7*(Znew*W1)
  conb2 <- b1+b4*Znew+b5*W2+b7*(Znew*W2)
  conb3 <- b1+b4*Znew+b5*W3+b7*(Znew*W3)
  conb4 <- b1+b4*Znew+b5*W4+b7*(Znew*W4)

  # Calculate standard errors for all levels of W
  if(missing(rob)){
    conse0 <- sqrt(varb1
                   + varb4*(Znew^2) + varb5*(W0^2) + varb7*(Znew^2)*(W0^2)
                   + 2*Znew*covb1b4 + 2*W0*covb1b5 + 2*Znew*W0*covb1b7 + 2*Znew*W0*covb4b5
                   + 2*W0*(Znew^2)*covb4b7 + 2*(W0^2)*Znew*covb5b7)

    conse1 <- sqrt(varb1
                   + varb4*(Znew^2) + varb5*(W1^2) + varb7*(Znew^2)*(W1^2)
                   + 2*Znew*covb1b4 + 2*W1*covb1b5 + 2*Znew*W1*covb1b7 + 2*Znew*W1*covb4b5
                   + 2*W1*(Znew^2)*covb4b7 + 2*(W1^2)*Znew*covb5b7)

    conse2 <- sqrt(varb1
                   + varb4*(Znew^2) + varb5*(W2^2) + varb7*(Znew^2)*(W2^2)
                   + 2*Znew*covb1b4 + 2*W2*covb1b5 + 2*Znew*W2*covb1b7 + 2*Znew*W2*covb4b5
                   + 2*W2*(Znew^2)*covb4b7 + 2*(W2^2)*Znew*covb5b7)

    conse3 <- sqrt(varb1
                   + varb4*(Znew^2) + varb5*(W3^2) + varb7*(Znew^2)*(W3^2)
                   + 2*Znew*covb1b4 + 2*W3*covb1b5 + 2*Znew*W3*covb1b7 + 2*Znew*W3*covb4b5
                   + 2*W3*(Znew^2)*covb4b7 + 2*(W3^2)*Znew*covb5b7)

    conse4 <- sqrt(varb1
                   + varb4*(Znew^2) + varb5*(W4^2) + varb7*(Znew^2)*(W4^2)
                   + 2*Znew*covb1b4 + 2*W4*covb1b5 + 2*Znew*W4*covb1b7 + 2*Znew*W4*covb4b5
                   + 2*W4*(Znew^2)*covb4b7 + 2*(W4^2)*Znew*covb5b7)
  }else{
    conse0 <- sqrt(varb1
                   + varb4*(Znew^2) + varb5*(W0^2) + varb7*(Znew^2)*(W0^2)
                   + 2*Znew*covb1b4 + 2*W0*covb1b5 + 2*Znew*W0*covb1b7 + 2*Znew*W0*covb4b5
                   + 2*W0*(Znew^2)*covb4b7 + 2*(W0^2)*Znew*covb5b7)

    conse1 <- sqrt(varb1
                   + varb4*(Znew^2) + varb5*(W1^2) + varb7*(Znew^2)*(W1^2)
                   + 2*Znew*covb1b4 + 2*W1*covb1b5 + 2*Znew*W1*covb1b7 + 2*Znew*W1*covb4b5
                   + 2*W1*(Znew^2)*covb4b7 + 2*(W1^2)*Znew*covb5b7)

    conse2 <- sqrt(varb1
                   + varb4*(Znew^2) + varb5*(W2^2) + varb7*(Znew^2)*(W2^2)
                   + 2*Znew*covb1b4 + 2*W2*covb1b5 + 2*Znew*W2*covb1b7 + 2*Znew*W2*covb4b5
                   + 2*W2*(Znew^2)*covb4b7 + 2*(W2^2)*Znew*covb5b7)

    conse3 <- sqrt(varb1
                   + varb4*(Znew^2) + varb5*(W3^2) + varb7*(Znew^2)*(W3^2)
                   + 2*Znew*covb1b4 + 2*W3*covb1b5 + 2*Znew*W3*covb1b7 + 2*Znew*W3*covb4b5
                   + 2*W3*(Znew^2)*covb4b7 + 2*(W3^2)*Znew*covb5b7)

    conse4 <- sqrt(varb1
                   + varb4*(Znew^2) + varb5*(W4^2) + varb7*(Znew^2)*(W4^2)
                   + 2*Znew*covb1b4 + 2*W4*covb1b5 + 2*Znew*W4*covb1b7 + 2*Znew*W4*covb4b5
                   + 2*W4*(Znew^2)*covb4b7 + 2*(W4^2)*Znew*covb5b7)
  }

  # Create t statistics
  t0 <- conb0/conse0
  t1 <- conb1/conse1
  t2 <- conb2/conse2
  t3 <- conb3/conse3
  t4 <- conb4/conse4

  # Make a `shadow' variable that is missing if the t score is not larger than the critical level of significance you want.
  ci <- NA
  ci[level == 95] <- qt(.975,M$df.residual)
  ci[level == 90] <- qt(.95,M$df.residual)

  stars.df <- data.frame(consb0=conb0,consb1=conb1,consb2=conb2,consb3=conb3,consb4=conb4,t0=t0,t1=t1,t2=t2,t3=t3,t4=t4)

  stars.df$consb0[abs(stars.df$t0)<ci] <- NA
  stars.df$consb1[abs(stars.df$t1)<ci] <- NA
  stars.df$consb2[abs(stars.df$t2)<ci] <- NA
  stars.df$consb3[abs(stars.df$t3)<ci] <- NA
  stars.df$consb4[abs(stars.df$t4)<ci] <- NA

  # Generate a string variable called txt that is designated with a star.
  txt <- c("*")

  # Graph the marginal effect of X on Y across the desired range of the modifying variable Z. Do this for when W=0, when W=1, when W=2, when W=3, and when W=4.

  if(missing(hist)){
    plot(c(Znew,Znew,Znew,Znew,Znew,Znew,Znew,Znew,Znew,Znew), c(conb0,stars.df$consb0,conb1,stars.df$consb1,conb2,stars.df$consb2,conb3,stars.df$consb3,conb4,stars.df$consb4), type="n",xlab=xlab,ylab=ylab)+rug(jitter(Z))

    lines(Znew,conb0,col="blue")
    text(x=Znew,y=stars.df$consb0,labels=txt)
    lines(Znew,conb1,col="red")
    text(x=Znew,y=stars.df$consb1,labels=txt)
    lines(Znew,conb2,col="forest green")
    text(x=Znew,y=stars.df$consb2,labels=txt)
    lines(Znew,conb3,col="yellow")
    text(x=Znew,y=stars.df$consb3,labels=txt)
    lines(Znew,conb4,col="brown")
    text(x=Znew,y=stars.df$consb4,labels=txt)
    legend(lloc,legend=c(Min,Q1,Mean,Q3,Max),col=c("blue","red","forest green","yellow","brown"),lty = c("solid"))
    abline(h=0)
  }else{
    hist(Z, axes=F, xlab="", ylab="",main="", col="light gray")
    par(new=TRUE)
    plot(c(Znew,Znew,Znew,Znew,Znew,Znew,Znew,Znew,Znew,Znew), c(conb0,stars.df$consb0,conb1,stars.df$consb1,conb2,stars.df$consb2,conb3,stars.df$consb3,conb4,stars.df$consb4), type="n",xlab=xlab,ylab=ylab)

    lines(Znew,conb0,col="blue")
    text(x=Znew,y=stars.df$consb0,labels=txt)
    lines(Znew,conb1,col="red")
    text(x=Znew,y=stars.df$consb1,labels=txt)
    lines(Znew,conb2,col="forest green")
    text(x=Znew,y=stars.df$consb2,labels=txt)
    lines(Znew,conb3,col="yellow")
    text(x=Znew,y=stars.df$consb3,labels=txt)
    lines(Znew,conb4,col="brown")
    text(x=Znew,y=stars.df$consb4,labels=txt)
    legend(lloc,legend=c(Min,Q1,Mean,Q3,Max),col=c("blue","red","forest green","yellow","brown"),lty = c("solid"), cex = 0.5)
    abline(h=0)
  }
}


###############################################
###############################################
predict.plot.f <- function(predicted.df, X, Z, Zname, xlab, ylab)
{
  ## Written by Joshua Gubler ~  http://joshuagubler.com
  ## Last updated: 1 Feb. 2018 (stopped using qplot and added additional labeling features and notes)
  ## predicted.df = the dataframe created by Gubler's predict functions -- always ending in .pred.df
  ## X = the variable you want to the predicted effect for (all others are most often held at their means).  Note: you need to enter it with the full path (e.g. .pred.df$X)
  ## Z = if you want predictions based on the particular levels of Z you specified in your pred.df dataframe, type your Z variable here (entering the full path as with X).  If not, leave blank.
  ## Zname = the name to put in the legend for Z (if using Z).  If not using Z, leave blank.
  ## xlab = the label for the X axis; put in quotation marks
  ## ylab = the label for the Y axis; put in quotation marks
  ##This function simply puts a wrapper around some basic ggplot code.  As such, one can add extra layers to the plot simply by adding them to the end of the function call.  For example, to add a line between points, simply add: geom_line(aes(group=1)); to add lines for multiple groups: geom_line(aes(group=as.factor(variablename))).  To change/customize the labels on the x-axis: scale_x_discrete(labels = c("label 1","label 2")), etc.

  require(ggplot2)
  if(missing(Z)){
    predplot <- ggplot(data=predicted.df, aes(as.factor(X),predicted.df$predicted.value)) + geom_point(stat="identity") + theme_bw()
  } else{
    predplot <- ggplot(data=predicted.df, aes(as.factor(X),predicted.df$predicted.value, shape = as.factor(Z))) + geom_point(stat="identity") + theme_bw() + guides(shape=guide_legend(title=Zname))
  }
  predplot + geom_linerange(aes(ymin=predicted.df$ci.lower95,ymax=predicted.df$ci.upper95)) + labs(x=xlab,y=ylab)

  ##Example code: (using Z):
  ##First, estimate a model and run one of my predict functions.  Here's an example using predict.probit.f for some mortgage data found here: https://www.dropbox.com/s/2cdinsxfd59yh83/hmda_sw.dta?dl=0.  "deny" is coded 0/1 (1 if the application is denied); pi_ratio is a measure of an applicant's "payment to income ratio", and "black" is coded 0/1:
  #mort.probit <- glm(deny ~ pi_rat + black + pi_rat*black)
  #pred.df <- data.frame("pi_rat" = rep(seq(.2,.5,.01),2), "black" = c(rep(0,62),rep(1,62)))
  #predict.probit.f(mort.probit,pred.df)
  #predict.plot.f(mort.probit.pred.df,mort.probit.pred.df$pi_rat, mort.probit.pred.df$black, "Race","PI_ratio","Prob of Deny")

  ##If you don't have a moderating variable (Z), then simply leave the spots for Z and Zname in the function blank -- e.g.: predict.plot.f(mort.probit.pred.df,mort.probit.pred.df$pi_rat,,,"PI_ratio","Prob of Deny")
}


###############################################
###############################################
predict.f <- function(mod,predict.df,rob,cluster,df_correction){

  ## Written by Joshua Gubler ~  http://joshuagubler.com
  ## Last updated on 17 August 2017 -- now the function will detect model type: will take lm and glm (probit, logit, negative binomial) models.  Note that for the glm models, the function produces predicted probabilities, transforming the coefficents
  ## This organizes the data in a format so it can be automatically graphed by my predict.plot.f and/or marginal effects functions (similar to the "margins" command in Stata)
  ## Note, that for this function to work well, you must input factor variables with more than two levels individually (as individual dummies).
  ## ALSO NOTE THAT THE ROBUST AND CLUSTER-ROBUST STANDARD ERRORS ARE NOT OPTIONS FOR GLM MODELS.

  ## mod = the object for the model you estimated
  ## predict.df = the dataframe that indicates the values you want to use to predict change in Y.  Same as in  the default R "predict" function.
  ## rob = include this if you want robust standard errors; exclude if you do not.
  ## cluster = the name of the variable you desire to cluster on, with a tilda in front (e.g.: ~ state). Leave"rob" blank when including this.
  ## df_correction: If you do not want the number of levels in your cluster variable to count against your degrees of freedom (like the xt- options in Stata), then type "F".  Otherwise, type "T" and these levels will be counted against your degrees of freedom

  #Example code:
  #Huber-White Robust standard errors: predict.lm.f(test.lm,pred.df,rob)
  #For clustered errors with no dfc correction: predict.lm.f(test.lm,pred.df,df$clustervar)
  #For clustered errors with dfc correction: predict.lm.f(test.lm,pred.df,df$clustervar,T)

  require(sandwich)
  require(multiwayvcov)

  if(class(mod)[1]=="lm"){
    tt <- terms(mod)
    Terms <- delete.response(tt)
    m.mat <- model.matrix(Terms,data=(predict.df))
    fit <- as.vector(m.mat %*% mod$coef)

    if(missing(cluster)){
      if(missing(rob)){
        varcov <- vcov(mod)
        se.fit <- sqrt(diag(m.mat%*%varcov%*%t(m.mat)))
        ci.lower95 <- fit - qt(.975,mod$df.residual)*se.fit
        ci.upper95 <- fit + qt(.975,mod$df.residual)*se.fit
      }
      else{
        ##To generate the robust covariance matrix
        varcov <- vcovHC(mod,"HC1")
        se.fit <- sqrt(diag(m.mat%*%varcov%*%t(m.mat)))
        ci.lower95 <- fit - qt(.975,mod$df.residual)*se.fit
        ci.upper95 <- fit + qt(.975,mod$df.residual)*se.fit
      }
    }
    else {
      varcov <- cluster.vcov(mod, cluster, df_correction = df_correction)
      se.fit <- sqrt(diag(m.mat%*%varcov%*%t(m.mat)))
      ci.lower95 <- fit - qt(.975,mod$df.residual)*se.fit
      ci.upper95 <- fit + qt(.975,mod$df.residual)*se.fit
    }

    pred.df <- data.frame(m.mat,predicted.value=fit,se=se.fit,ci.lower95=ci.lower95,ci.upper95=ci.upper95)[,-1]
    nm <-deparse(substitute(mod))
    mdlname <- paste(nm,"pred.df",sep=".")
    assign(mdlname,pred.df,envir = .GlobalEnv)
  }
  else if(class(mod)[1]=="glm" & mod$family[2]=="logit"){
    tt <- terms(mod)
    Terms <- delete.response(tt)
    m.mat <- model.matrix(Terms,data=(predict.df))
    fit1 <- as.vector(m.mat %*% mod$coef)
    fit <- 1/(1+exp(-(fit1)))

    require(sandwich)
    se.fit <- predict(mod,predict.df,se.fit=TRUE,type="response")$se.fit
    ci.lower95 <- fit - 1.96*se.fit
    ci.upper95 <- fit + 1.96*se.fit

    pred.df <- data.frame(m.mat,predicted.value=fit,se=se.fit,ci.lower95=ci.lower95,ci.upper95=ci.upper95)[,-1]
    nm <-deparse(substitute(mod))
    mdlname <- paste(nm,"pred.df",sep=".")
    assign(mdlname,pred.df,envir = .GlobalEnv)
  }
  else if(class(mod)[1]=="glm" & mod$family[2]=="probit"){
    tt <- terms(mod)
    Terms <- delete.response(tt)
    m.mat <- model.matrix(Terms,data=(predict.df))
    fit1 <- as.vector(m.mat %*% mod$coef)
    fit <- pnorm(fit1)

    se.fit <- predict(mod,predict.df,se.fit=TRUE,type="response")$se.fit
    ci.lower95 <- fit - 1.96*se.fit
    ci.upper95 <- fit + 1.96*se.fit

    pred.df <- data.frame(m.mat,predicted.value=fit,se=se.fit,ci.lower95=ci.lower95,ci.upper95=ci.upper95)[,-1]
    nm <-deparse(substitute(mod))
    mdlname <- paste(nm,"pred.df",sep=".")
    assign(mdlname,pred.df,envir = .GlobalEnv)
  }
  else if(class(mod)[1]=="negbin"){
    tt <- terms(mod)
    Terms <- delete.response(tt)
    m.mat <- model.matrix(Terms,data=(predict.df))
    fit1 <- as.vector(m.mat %*% mod$coef)
    fit <- exp(fit1)

    se.fit <- predict(mod,predict.df,se.fit=TRUE,type="response")$se.fit
    ci.lower95 <- fit - 1.96*se.fit
    ci.upper95 <- fit + 1.96*se.fit

    pred.df <- data.frame(m.mat,predicted.value=fit,se=se.fit,ci.lower95=ci.lower95,ci.upper95=ci.upper95)[,-1]
    nm <-deparse(substitute(mod))
    mdlname <- paste(nm,"pred.df",sep=".")
    assign(mdlname,pred.df,envir = .GlobalEnv)
  }
}


###############################################
###############################################
## A function to standardize variables on a 0 to 1 scale
standardize <- function(x, na.rm = T)
{
  (x - min(x, na.rm = T)) / (max(x, na.rm = T) - min(x,na.rm = T))
}


###############################################
###############################################
rescale.to.1 <- function(var, num) (1/(num-1)*(var-num)+1) #var=name of variable, num = max number on current scale (so, for a 7-point scale, you would put 7 -- starting number must be 1)


###############################################
###############################################
t2p <- function(t,df){
  2*pt(abs(t),df,lower.tail=FALSE)
}


###############################################
###############################################
z2p <- function(z){
  2*pnorm(-abs(z))
}


###############################################
###############################################
##A nice function to create lag or lead variables, from: https://ctszkin.com/2012/03/11/generating-a-laglead-variables/

shift <- function(x,shift_by){
  stopifnot(is.numeric(shift_by))
  stopifnot(is.numeric(x))

  if (length(shift_by)>1)
    return(sapply(shift_by,shift, x=x))

  out<-NULL
  abs_shift_by=abs(shift_by)
  if (shift_by > 0 )
    out<-c(tail(x,-abs_shift_by),rep(NA,abs_shift_by))
  else if (shift_by < 0 )
    out<-c(rep(NA,abs_shift_by), head(x,-abs_shift_by))
  else
    out<-x
  out
}

#####################################
#####################################
##Multiplot function for ggplot that works quite well:

###Taken from: http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)

  numPlots = length(plots)

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }

  if (numPlots==1) {
    print(plots[[1]])

  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}


print(lsf.str())