chrishanretty/vpt.R

## vpt.R
library(MASS)
library(rio)
library(tidyverse)

multhet <- function(y, ## = formula(1~1),
	v, ##  = formula(~1),
	data) {

	## check length of inputs
	if (class(y) != "formula") {
		stop("y not a formula!")
	}
	if (class(v) != "formula") {
		stop("v not a formula!")
	}
	## check lengths
	if (length(y) != 3) {
		stop("Y formula not fully specified")
	}
	if (length(v) > 2) {
		## they've included an outcome variable
		v <- v[-1]
	}

	## set up model matrices
	mX <- model.matrix(as.formula(y[-2]), data = data)
	mZ <- model.matrix(as.formula(v), data = data)

	Y <- model.matrix(as.formula(paste0("~",y[2])), data = data)[,-1]

	## Check lengths
	if (nrow(mX) != nrow(mZ)) {
		stop("Model matrices of diffferent sizes: check missingness!")
	}
	## check missingness
	good1 <- all(complete.cases(mX) == complete.cases(mZ))
	good2 <- all(complete.cases(mZ) == complete.cases(Y))
	good3 <- all(complete.cases(mX) == complete.cases(Y))

	if (!good1 | !good2 | !good3) {
		stop("Different patterns of missingness present!")
	}

	## Set up the model ll function
	## taken from a very helpful stackexchange post
	## http://stats.stackexchange.com/questions/97437/feasible-generalized-least-square-in-r
	fnLogLik <- function(vParam, vY, mX, mZ) {
		vBeta = vParam[1:ncol(mX)]
		vAlpha = vParam[(ncol(mX)+1):(ncol(mX)+ncol(mZ))]
		nObs <- nrow(mX)
		negLogLik <- -1 * sum(0.5*log(2*pi) - mZ %*% vAlpha - (vY - mX %*% vBeta) ^ 2/ exp(mZ %*% vAlpha))
		return(negLogLik)
	}
	fnLogLik <- function(vParam, vY, mX, mZ) {
		vBeta = vParam[1:ncol(mX)]
		vAlpha = vParam[(ncol(mX)+1):(ncol(mX)+ncol(mZ))]
		nObs <- nrow(mX)
		LogLik <- (nObs / 2) * log(2*pi) - 0.5 * sum(mZ %*% vAlpha)  - 0.5 * sum(exp(-1 * mZ %*% vAlpha) * (vY - mX %*% vBeta) ^ 2)
		return(LogLik * -1)
	}

	## Estimate the OLS coefficients
	olsmod <- lm(y, data = data)

	## Now model the residuals
	resids <- resid(olsmod)
	data$resid.l <- log(resids^2)
	residmod <- lm(update(v, resid.l ~ .), data = data)

	## MLE
	optimHet <- optim(c(coef(olsmod),coef(residmod)),
		fnLogLik,
		vY = Y,
		mX = mX,
		mZ = mZ,
		method = "BFGS",
		hessian = TRUE,
		control = list(maxit = 1500))

	## Gather things up
	est <- optimHet$par
	vc <- solve(optimHet$hessian)
	se <- sqrt(diag(vc))

	beta.outcome <- est[1:length(coef(olsmod))]
	se.outcome <- se[1:length(coef(olsmod))]
	vc.outcome <-vc[1: length(coef(olsmod)),1:length(coef(olsmod))]

	beta.variance <- est[(length(coef(olsmod))+1):length(se)]
	se.variance <- se[(length(coef(olsmod))+1):length(se)]
	vc.variance <- vc[(length(coef(olsmod))+1):length(se),(length(coef(olsmod))+1):length(se)]

	return(list(outcome = list(coef = 	beta.outcome, se = se.outcome, model = olsmod, vc = vc.outcome),
		variance = list(coef = beta.variance, se = se.variance, model = residmod, vc = vc.variance),
		ll = optimHet$value,
		df = length(optimHet$par)))
}


### Get data from https://homepage.univie.ac.at/daniel.strobl/replication-data.html
dat <- import("VPT_reformdata_SFB_v1.dta")

mod1.form <- formula(reformMeasures ~
	recession + jobCrisis + mwc + yearsToElection +
	electionYear + fractionDays +
	countryDummies1 + countryDummies2 +
	countryDummies3 + countryDummies4 +
	countryDummies5 + countryDummies6 +
	countryDummies7 + countryDummies8 +
	countryDummies9 + countryDummies10 +
	countryDummies11 + countryDummies12)
mod1.altform <- update(mod1.form, log1p(reformMeasures) ~ .)

summary(mod1 <- glm.nb(mod1.form,
	data = subset(dat, !is.na(ideologicalRange))))

summary(mod1.alt <- lm(mod1.altform,
	data = subset(dat, !is.na(ideologicalRange))))

mod3.form <- formula(reformMeasures ~
	ideologicalRange + mwc + yearsToElection +
	electionYear + fractionDays +
	countryDummies1 + countryDummies2 +
	countryDummies3 + countryDummies4 +
	countryDummies5 + countryDummies6 +
	countryDummies7 + countryDummies8 +
	countryDummies9 + countryDummies10 +
	countryDummies11 + countryDummies12)
mod3.altform <- update(mod3.form, log1p(reformMeasures) ~ .)

summary(mod3 <- glm.nb(mod3.form,
	data = subset(dat, !is.na(ideologicalRange))))

summary(mod3.alt <- lm(mod3.altform,
	data = subset(dat, !is.na(ideologicalRange))))

mod3.het <- multhet(y = mod3.form,
	v =~ ideologicalRange + mwc,
	data = subset(dat, !is.na(ideologicalRange)))

mod7.form <- formula(reformMeasures ~
	ideologicalRange * mwc + yearsToElection +
	electionYear + fractionDays +
	countryDummies1 + countryDummies2 +
	countryDummies3 + countryDummies4 +
	countryDummies5 + countryDummies6 +
	countryDummies7 + countryDummies8 +
	countryDummies9 + countryDummies10 +
	countryDummies11 + countryDummies12)

mod7.altform <- update(mod7.form, log1p(reformMeasures) ~ .)

summary(mod7 <- glm.nb(mod7.form,
	data = subset(dat, !is.na(ideologicalRange))))

summary(mod7.alt <- lm(mod7.altform,
	data = subset(dat, !is.na(ideologicalRange))))

mod7.het <- multhet(y = mod7.form,
	v =~ ideologicalRange * mwc,
	data = subset(dat, !is.na(ideologicalRange)))

summary(mod7.het$outcome$model)
summary(mod7.het$variance$model)

## > summary(mod3.het$variance$model)

## Call:
## lm(formula = update(v, resid.l ~ .), data = data)

## Residuals:
##     Min      1Q  Median      3Q     Max
## -7.9092 -1.1911  0.4206  1.4957  3.7744

## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)
## (Intercept)        2.3353     0.2253  10.364   <2e-16 ***
## ideologicalRange   0.2227     0.1087   2.049   0.0414 *
## mwc                0.4089     0.2487   1.644   0.1012
## ---
## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

## Residual standard error: 2.105 on 288 degrees of freedom
## Multiple R-squared:  0.02126,	Adjusted R-squared:  0.01446
## F-statistic: 3.128 on 2 and 288 DF,  p-value: 0.04531

## > summary(mod3.het$variance$model)

## Call:
## lm(formula = update(v, resid.l ~ .), data = data)

## Residuals:
##     Min      1Q  Median      3Q     Max
## -7.9092 -1.1911  0.4206  1.4957  3.7744

## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)
## (Intercept)        2.3353     0.2253  10.364   <2e-16 ***
## ideologicalRange   0.2227     0.1087   2.049   0.0414 *
## mwc                0.4089     0.2487   1.644   0.1012
## ---
## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

## Residual standard error: 2.105 on 288 degrees of freedom
## Multiple R-squared:  0.02126,	Adjusted R-squared:  0.01446
## F-statistic: 3.128 on 2 and 288 DF,  p-value: 0.04531

## > summary(mod7.het$variance$model)

## Call:
## lm(formula = update(v, resid.l ~ .), data = data)

## Residuals:
##      Min       1Q   Median       3Q      Max
## -10.3420  -0.9681   0.6045   1.7432   3.9390

## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)
## (Intercept)          2.445093   0.304737   8.024 2.64e-14 ***
## ideologicalRange     0.009374   0.174787   0.054    0.957
## mwc                  0.198977   0.411586   0.483    0.629
## ideologicalRange:mwc 0.119561   0.258891   0.462    0.645
## ---
## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

## Residual standard error: 2.496 on 287 degrees of freedom
## Multiple R-squared:  0.005613,	Adjusted R-squared:  -0.004781
## F-statistic:  0.54 on 3 and 287 DF,  p-value: 0.6553

## > summary(mod7.het$outcome$model)

## Call:
## lm(formula = y, data = data)

## Residuals:
##      Min       1Q   Median       3Q      Max
## -18.0531  -5.2076  -0.1084   4.6668  30.7078

## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)
## (Intercept)           -6.7612     5.1577  -1.311 0.191002
## ideologicalRange       1.0667     0.7184   1.485 0.138752
## mwc                    4.2140     1.8540   2.273 0.023816 *
## yearsToElection        1.1941     0.4324   2.762 0.006140 **
## electionYear          -3.1848     1.4839  -2.146 0.032742 *
## fractionDays          15.7457     4.0079   3.929 0.000108 ***
## countryDummies1        8.5644     2.8944   2.959 0.003359 **
## countryDummies2        4.1655     2.6984   1.544 0.123835
## countryDummies3        4.0221     2.9889   1.346 0.179526
## countryDummies4        4.7098     3.2554   1.447 0.149114
## countryDummies5       11.5087     2.7559   4.176    4e-05 ***
## countryDummies6        5.0201     2.3790   2.110 0.035756 *
## countryDummies7        4.4295     2.6127   1.695 0.091153 .
## countryDummies8        7.7470     2.8996   2.672 0.008002 **
## countryDummies9        4.8009     2.5747   1.865 0.063305 .
## countryDummies10       6.7077     2.5751   2.605 0.009698 **
## countryDummies11       2.7811     2.6681   1.042 0.298173
## countryDummies12       8.2831     3.0001   2.761 0.006156 **
## ideologicalRange:mwc  -2.4272     0.9657  -2.513 0.012536 *
## ---
## Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

## Residual standard error: 8.132 on 272 degrees of freedom
## Multiple R-squared:  0.2618,	Adjusted R-squared:  0.213
## F-statistic:  5.36 on 18 and 272 DF,  p-value: 1.162e-10
	library(MASS)
	library(rio)
	library(tidyverse)

	multhet <- function(y, ## = formula(1~1),
	v, ## = formula(~1),
	data) {

	## check length of inputs
	if (class(y) != "formula") {
	stop("y not a formula!")
	}
	if (class(v) != "formula") {
	stop("v not a formula!")
	}
	## check lengths
	if (length(y) != 3) {
	stop("Y formula not fully specified")
	}
	if (length(v) > 2) {
	## they've included an outcome variable
	v <- v[-1]
	}

	## set up model matrices
	mX <- model.matrix(as.formula(y[-2]), data = data)
	mZ <- model.matrix(as.formula(v), data = data)

	Y <- model.matrix(as.formula(paste0("~",y[2])), data = data)[,-1]

	## Check lengths
	if (nrow(mX) != nrow(mZ)) {
	stop("Model matrices of diffferent sizes: check missingness!")
	}
	## check missingness
	good1 <- all(complete.cases(mX) == complete.cases(mZ))
	good2 <- all(complete.cases(mZ) == complete.cases(Y))
	good3 <- all(complete.cases(mX) == complete.cases(Y))

	if (!good1 \| !good2 \| !good3) {
	stop("Different patterns of missingness present!")
	}

	## Set up the model ll function
	## taken from a very helpful stackexchange post
	## http://stats.stackexchange.com/questions/97437/feasible-generalized-least-square-in-r
	fnLogLik <- function(vParam, vY, mX, mZ) {
	vBeta = vParam[1:ncol(mX)]
	vAlpha = vParam[(ncol(mX)+1):(ncol(mX)+ncol(mZ))]
	nObs <- nrow(mX)
	negLogLik <- -1 * sum(0.5log(2pi) - mZ %% vAlpha - (vY - mX %% vBeta) ^ 2/ exp(mZ %*% vAlpha))
	return(negLogLik)
	}
	fnLogLik <- function(vParam, vY, mX, mZ) {
	vBeta = vParam[1:ncol(mX)]
	vAlpha = vParam[(ncol(mX)+1):(ncol(mX)+ncol(mZ))]
	nObs <- nrow(mX)
	LogLik <- (nObs / 2) * log(2pi) - 0.5 sum(mZ %% vAlpha) - 0.5 sum(exp(-1 * mZ %% vAlpha) (vY - mX %*% vBeta) ^ 2)
	return(LogLik * -1)
	}

	## Estimate the OLS coefficients
	olsmod <- lm(y, data = data)

	## Now model the residuals
	resids <- resid(olsmod)
	data$resid.l <- log(resids^2)
	residmod <- lm(update(v, resid.l ~ .), data = data)

	## MLE
	optimHet <- optim(c(coef(olsmod),coef(residmod)),
	fnLogLik,
	vY = Y,
	mX = mX,
	mZ = mZ,
	method = "BFGS",
	hessian = TRUE,
	control = list(maxit = 1500))

	## Gather things up
	est <- optimHet$par
	vc <- solve(optimHet$hessian)
	se <- sqrt(diag(vc))

	beta.outcome <- est[1:length(coef(olsmod))]
	se.outcome <- se[1:length(coef(olsmod))]
	vc.outcome <-vc[1: length(coef(olsmod)),1:length(coef(olsmod))]

	beta.variance <- est[(length(coef(olsmod))+1):length(se)]
	se.variance <- se[(length(coef(olsmod))+1):length(se)]
	vc.variance <- vc[(length(coef(olsmod))+1):length(se),(length(coef(olsmod))+1):length(se)]

	return(list(outcome = list(coef = beta.outcome, se = se.outcome, model = olsmod, vc = vc.outcome),
	variance = list(coef = beta.variance, se = se.variance, model = residmod, vc = vc.variance),
	ll = optimHet$value,
	df = length(optimHet$par)))
	}


	### Get data from https://homepage.univie.ac.at/daniel.strobl/replication-data.html
	dat <- import("VPT_reformdata_SFB_v1.dta")

	mod1.form <- formula(reformMeasures ~
	recession + jobCrisis + mwc + yearsToElection +
	electionYear + fractionDays +
	countryDummies1 + countryDummies2 +
	countryDummies3 + countryDummies4 +
	countryDummies5 + countryDummies6 +
	countryDummies7 + countryDummies8 +
	countryDummies9 + countryDummies10 +
	countryDummies11 + countryDummies12)
	mod1.altform <- update(mod1.form, log1p(reformMeasures) ~ .)

	summary(mod1 <- glm.nb(mod1.form,
	data = subset(dat, !is.na(ideologicalRange))))

	summary(mod1.alt <- lm(mod1.altform,
	data = subset(dat, !is.na(ideologicalRange))))

	mod3.form <- formula(reformMeasures ~
	ideologicalRange + mwc + yearsToElection +
	electionYear + fractionDays +
	countryDummies1 + countryDummies2 +
	countryDummies3 + countryDummies4 +
	countryDummies5 + countryDummies6 +
	countryDummies7 + countryDummies8 +
	countryDummies9 + countryDummies10 +
	countryDummies11 + countryDummies12)
	mod3.altform <- update(mod3.form, log1p(reformMeasures) ~ .)

	summary(mod3 <- glm.nb(mod3.form,
	data = subset(dat, !is.na(ideologicalRange))))

	summary(mod3.alt <- lm(mod3.altform,
	data = subset(dat, !is.na(ideologicalRange))))

	mod3.het <- multhet(y = mod3.form,
	v =~ ideologicalRange + mwc,
	data = subset(dat, !is.na(ideologicalRange)))

	mod7.form <- formula(reformMeasures ~
	ideologicalRange * mwc + yearsToElection +
	electionYear + fractionDays +
	countryDummies1 + countryDummies2 +
	countryDummies3 + countryDummies4 +
	countryDummies5 + countryDummies6 +
	countryDummies7 + countryDummies8 +
	countryDummies9 + countryDummies10 +
	countryDummies11 + countryDummies12)

	mod7.altform <- update(mod7.form, log1p(reformMeasures) ~ .)

	summary(mod7 <- glm.nb(mod7.form,
	data = subset(dat, !is.na(ideologicalRange))))

	summary(mod7.alt <- lm(mod7.altform,
	data = subset(dat, !is.na(ideologicalRange))))

	mod7.het <- multhet(y = mod7.form,
	v =~ ideologicalRange * mwc,
	data = subset(dat, !is.na(ideologicalRange)))

	summary(mod7.het$outcome$model)
	summary(mod7.het$variance$model)

	## > summary(mod3.het$variance$model)

	## Call:
	## lm(formula = update(v, resid.l ~ .), data = data)

	## Residuals:
	## Min 1Q Median 3Q Max
	## -7.9092 -1.1911 0.4206 1.4957 3.7744

	## Coefficients:
	## Estimate Std. Error t value Pr(>\|t\|)
	## (Intercept) 2.3353 0.2253 10.364 <2e-16 ***
	## ideologicalRange 0.2227 0.1087 2.049 0.0414 *
	## mwc 0.4089 0.2487 1.644 0.1012
	## ---
	## Signif. codes: 0 ‘*’ 0.001 ‘’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

	## Residual standard error: 2.105 on 288 degrees of freedom
	## Multiple R-squared: 0.02126, Adjusted R-squared: 0.01446
	## F-statistic: 3.128 on 2 and 288 DF, p-value: 0.04531

	## > summary(mod3.het$variance$model)

	## Call:
	## lm(formula = update(v, resid.l ~ .), data = data)

	## Residuals:
	## Min 1Q Median 3Q Max
	## -7.9092 -1.1911 0.4206 1.4957 3.7744

	## Coefficients:
	## Estimate Std. Error t value Pr(>\|t\|)
	## (Intercept) 2.3353 0.2253 10.364 <2e-16 ***
	## ideologicalRange 0.2227 0.1087 2.049 0.0414 *
	## mwc 0.4089 0.2487 1.644 0.1012
	## ---
	## Signif. codes: 0 ‘*’ 0.001 ‘’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

	## Residual standard error: 2.105 on 288 degrees of freedom
	## Multiple R-squared: 0.02126, Adjusted R-squared: 0.01446
	## F-statistic: 3.128 on 2 and 288 DF, p-value: 0.04531

	## > summary(mod7.het$variance$model)

	## Call:
	## lm(formula = update(v, resid.l ~ .), data = data)

	## Residuals:
	## Min 1Q Median 3Q Max
	## -10.3420 -0.9681 0.6045 1.7432 3.9390

	## Coefficients:
	## Estimate Std. Error t value Pr(>\|t\|)
	## (Intercept) 2.445093 0.304737 8.024 2.64e-14 ***
	## ideologicalRange 0.009374 0.174787 0.054 0.957
	## mwc 0.198977 0.411586 0.483 0.629
	## ideologicalRange:mwc 0.119561 0.258891 0.462 0.645
	## ---
	## Signif. codes: 0 ‘*’ 0.001 ‘’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

	## Residual standard error: 2.496 on 287 degrees of freedom
	## Multiple R-squared: 0.005613, Adjusted R-squared: -0.004781
	## F-statistic: 0.54 on 3 and 287 DF, p-value: 0.6553

	## > summary(mod7.het$outcome$model)

	## Call:
	## lm(formula = y, data = data)

	## Residuals:
	## Min 1Q Median 3Q Max
	## -18.0531 -5.2076 -0.1084 4.6668 30.7078

	## Coefficients:
	## Estimate Std. Error t value Pr(>\|t\|)
	## (Intercept) -6.7612 5.1577 -1.311 0.191002
	## ideologicalRange 1.0667 0.7184 1.485 0.138752
	## mwc 4.2140 1.8540 2.273 0.023816 *
	## yearsToElection 1.1941 0.4324 2.762 0.006140 **
	## electionYear -3.1848 1.4839 -2.146 0.032742 *
	## fractionDays 15.7457 4.0079 3.929 0.000108 ***
	## countryDummies1 8.5644 2.8944 2.959 0.003359 **
	## countryDummies2 4.1655 2.6984 1.544 0.123835
	## countryDummies3 4.0221 2.9889 1.346 0.179526
	## countryDummies4 4.7098 3.2554 1.447 0.149114
	## countryDummies5 11.5087 2.7559 4.176 4e-05 ***
	## countryDummies6 5.0201 2.3790 2.110 0.035756 *
	## countryDummies7 4.4295 2.6127 1.695 0.091153 .
	## countryDummies8 7.7470 2.8996 2.672 0.008002 **
	## countryDummies9 4.8009 2.5747 1.865 0.063305 .
	## countryDummies10 6.7077 2.5751 2.605 0.009698 **
	## countryDummies11 2.7811 2.6681 1.042 0.298173
	## countryDummies12 8.2831 3.0001 2.761 0.006156 **
	## ideologicalRange:mwc -2.4272 0.9657 -2.513 0.012536 *
	## ---
	## Signif. codes: 0 ‘*’ 0.001 ‘’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

	## Residual standard error: 8.132 on 272 degrees of freedom
	## Multiple R-squared: 0.2618, Adjusted R-squared: 0.213
	## F-statistic: 5.36 on 18 and 272 DF, p-value: 1.162e-10