chrishanretty/final_polls_to_election_day.R

## final_polls_to_election_day.R
### Forecast election day results on the basis of final polls
### Some of this code is clunky (particularly calibration)
### but it is a useful test bed if you want to move from
### a multivariate normal to another distribution (say, a
### multivariate t distribution) -- just change the code in
### check_calibration. Do, however, read
### https://journal.r-project.org/archive/2013-2/hofert.pdf
### first!


## ----loadlibs------------------------------------------------------------
library(knitr)
library(pander)
library(Matrix)
library(mvtnorm)

## ----datin---------------------------------------------------------------
polls <- structure(list(Conservatives = c(0.463137689855468, 0.456132838783667,
0.42634772561737, 0.375342469945955, 0.308737479390641, 0.322761590590601,
0.32407274487598, 0.352164318385373, 0.336886748984473), Labour = c(0.393273527306154,
0.266928720258727, 0.335530448967622, 0.391501599270666, 0.478126236740175,
0.458478005198966, 0.379647880476463, 0.274403670248034, 0.336584668910591
), Liberal.Democrats = c(0.118964928480032, 0.254004731185892,
0.210710045749512, 0.190497666825356, 0.14310292507149, 0.156146559027138,
0.223943628451435, 0.273044090920638, 0.0849106722877976), Other = c(0.0246238543583452,
0.0229337097717135, 0.0274117796654951, 0.0426582639580229, 0.0700333587976943,
0.0626138451832949, 0.0723357461961213, 0.100387920445954, 0.241617909817138
)), .Names = c("Conservatives", "Labour", "Liberal.Democrats",
"Other"), class = "data.frame", row.names = c("1979", "1983",
"1987", "1992", "1997", "2001", "2005", "2010", "2015"))
shares <- structure(list(Conservatives = c(0.447774470245795, 0.43485092464566,
0.432466614101659, 0.428498495558431, 0.314618578007995, 0.324463265750917,
0.332359474708807, 0.368804460197394, 0.376137629058115), Labour = c(0.376979254862873,
0.282617296534093, 0.31519111447088, 0.351497022738108, 0.442984244236696,
0.416369176542003, 0.361397077962564, 0.296545633515872, 0.311041250597578
), Liberal.Democrats = c(0.141014904292, 0.260029558584563, 0.230726978209406,
0.182418283404197, 0.171809011855532, 0.186903837282912, 0.226447535045442,
0.235549351036152, 0.0804167859715201), Other = c(0.0342313705993318,
0.0225022202356833, 0.0216152932180549, 0.0375861982992636, 0.0705881658997772,
0.0722637204241681, 0.0797959122831868, 0.0991005552505823, 0.232404334372786
)), .Names = c("Conservatives", "Labour", "Liberal.Democrats",
"Other"), class = "data.frame", row.names = c("1979", "1983",
                                              "1987", "1992", "1997", "2001", "2005", "2010", "2015"))


## ----diffsout, results = "asis"------------------------------------------
diffs <- shares - polls
diffs <- rbind(diffs, colMeans(diffs))
rownames(diffs)[nrow(diffs)] <- "Average"
pander(diffs)

## ----ecm-----------------------------------------------------------------
example.shares <- c(0.44, 0.29, 0.11, 0.16)

ecm <- function(x, y) { ## expect correlation matrix
    -1 * sqrt((x*y)/((1-x)*(1-y)))
}
mat <- outer(example.shares, example.shares, ecm)
diag(mat) <- 1
pander(mat)

## ----cor2cov-------------------------------------------------------------
cor2cov = function(corMat, varVec) {
  # test the input
  if (!is.matrix(corMat)) stop("'corMat must be a matrix")
  n = nrow(corMat)
  if (ncol(corMat) != n) stop("'corMat' must be square")
  if (mode(corMat) != "numeric") stop("'corMat must be numeric")
  if (mode(varVec) != "numeric") stop("'varVec must be numeric")
  if (!is.null(dim(varVec))) {
    if (length(dim(varVec)) != 2) stop("'varVec' should be a vector")
    if (any(dim(varVec)==1)) stop("'varVec' cannot be a matrix")
    varVec = as.numeric(varVec) # convert row or col matrix to a vector
  }
  if (!all(diag(corMat) == 1)) stop("correlation matrices have 1 on the diagonal")
  if (any(corMat < -1 | corMat > +1))
    stop("correlations must be between -1 and 1")
  if (any(varVec <= 0)) stop("variances must be non-negative")
   if (length(varVec) != n) stop("length of 'varMat' does not match 'corMat' size")

  # Compute the covariance
  sdMat = diag(sqrt(varVec))
  rtn = sdMat %*% corMat %*% t(sdMat)
  if (det(rtn)<=0) warning("covariance matrix is not positive definite")
  return(rtn)
}

## ----cv------------------------------------------------------------------

check_calibration <- function(sigma = 0.01, alpha = 0.05,
                              p = polls, s = shares, nSims = 1000) {
    calibs <- numeric(0)
    for (i in 1:nrow(s)) {
        errs <- s[-i,] - p[-i,]
        delta <- colMeans(errs)

        cormat <- outer(as.vector(unlist(p[i,])),
                        as.vector(unlist(p[i,])),
                        ecm)
        diag(cormat) <- 1
        cormat <- as.matrix(nearPD(cormat)$mat)
        diag(cormat) <- 1
        Sigma <- cor2cov(corMat = cormat, varVec = rep(sigma, ncol(cormat)))
        deviates <- rmvnorm(n = nSims,
                            mean = as.vector(unlist(p[i,] + delta)),
                            sigma = Sigma)
        ci <- apply(deviates, 2, quantile, c(alpha/2, 1 - (alpha/2)))
        calib <- apply(s[i,] > ci[1,] & s[i,] < ci[2,], 2, min)

        calibs <- c(calibs, sum(calib))
    }
    return(sum(calibs) / length(unlist(shares)))
}

## ----plotfigs, fig = TRUE, fig.cap = "Calibration"-----------------------
### Let's evaluate this function for a range of values
test.df <- expand.grid(alpha = c(0.05, 0.1, 0.5),
                       sigma = seq(0.0001, 0.005, length.out = 100))
test.df$calib <- apply(test.df, 1, function(x)
    check_calibration(sigma = x["sigma"], alpha = x["alpha"]))

plot(test.df$sigma, test.df$calib,
     type = "n")
with(subset(test.df, alpha == 0.05),
     lines(sigma, calib, lty = 1, col = 'red'))
with(subset(test.df, alpha == 0.1),
     lines(sigma, calib, lty = 2, col = 'darkblue'))
with(subset(test.df, alpha == 0.5),
     lines(sigma, calib, lty = 3, col = 'forestgreen'))

for (a in c(0.5, 0.9, 0.95)) {
    abline(h = a, lty = 1, col = 'grey')
}
legend("bottomright",
       lty = 1:3,
       col = c("red", "darkblue", "forestgreen"),
       legend = c("95%", "90%", "50%"))

### Find out the actual values
test.df <- test.df[order(test.df$sigma),]
sigma1 <- test.df$sigma[which(test.df$calib > 0.9 & test.df$alpha == 0.1)[1]]
sigma2 <- test.df$sigma[which(test.df$calib > 0.95 & test.df$alpha == 0.05)[1]]
sigma3 <- test.df$sigma[which(test.df$calib > 0.95 & test.df$alpha == 0.05)[1]]
chosen.sigma <- min(c(sigma1, sigma2, sigma3))

## ----newsims-------------------------------------------------------------
### Now simulate
newpolls <- c(0.43, 0.28, 0.11)
newpolls <- c(newpolls, 1 - sum(newpolls))
cormat <- outer(newpolls,
                newpolls,
                ecm)

diag(cormat) <- 1
cormat <- as.matrix(nearPD(cormat)$mat)
diag(cormat) <- 1
Sigma <- cor2cov(corMat = cormat, varVec = rep(chosen.sigma, ncol(cormat)))


meanmat <- matrix(newpolls,
                  nrow = 1000,
                  ncol = length(newpolls),
                  byrow = TRUE)
deviates <- rmvnorm(n = 1000,
                    mean = newpolls,
                    sigma = Sigma)
	### Forecast election day results on the basis of final polls
	### Some of this code is clunky (particularly calibration)
	### but it is a useful test bed if you want to move from
	### a multivariate normal to another distribution (say, a
	### multivariate t distribution) -- just change the code in
	### check_calibration. Do, however, read
	### https://journal.r-project.org/archive/2013-2/hofert.pdf
	### first!



	## ----loadlibs------------------------------------------------------------
	library(knitr)
	library(pander)
	library(Matrix)
	library(mvtnorm)

	## ----datin---------------------------------------------------------------
	polls <- structure(list(Conservatives = c(0.463137689855468, 0.456132838783667,
	0.42634772561737, 0.375342469945955, 0.308737479390641, 0.322761590590601,
	0.32407274487598, 0.352164318385373, 0.336886748984473), Labour = c(0.393273527306154,
	0.266928720258727, 0.335530448967622, 0.391501599270666, 0.478126236740175,
	0.458478005198966, 0.379647880476463, 0.274403670248034, 0.336584668910591
	), Liberal.Democrats = c(0.118964928480032, 0.254004731185892,
	0.210710045749512, 0.190497666825356, 0.14310292507149, 0.156146559027138,
	0.223943628451435, 0.273044090920638, 0.0849106722877976), Other = c(0.0246238543583452,
	0.0229337097717135, 0.0274117796654951, 0.0426582639580229, 0.0700333587976943,
	0.0626138451832949, 0.0723357461961213, 0.100387920445954, 0.241617909817138
	)), .Names = c("Conservatives", "Labour", "Liberal.Democrats",
	"Other"), class = "data.frame", row.names = c("1979", "1983",
	"1987", "1992", "1997", "2001", "2005", "2010", "2015"))
	shares <- structure(list(Conservatives = c(0.447774470245795, 0.43485092464566,
	0.432466614101659, 0.428498495558431, 0.314618578007995, 0.324463265750917,
	0.332359474708807, 0.368804460197394, 0.376137629058115), Labour = c(0.376979254862873,
	0.282617296534093, 0.31519111447088, 0.351497022738108, 0.442984244236696,
	0.416369176542003, 0.361397077962564, 0.296545633515872, 0.311041250597578
	), Liberal.Democrats = c(0.141014904292, 0.260029558584563, 0.230726978209406,
	0.182418283404197, 0.171809011855532, 0.186903837282912, 0.226447535045442,
	0.235549351036152, 0.0804167859715201), Other = c(0.0342313705993318,
	0.0225022202356833, 0.0216152932180549, 0.0375861982992636, 0.0705881658997772,
	0.0722637204241681, 0.0797959122831868, 0.0991005552505823, 0.232404334372786
	)), .Names = c("Conservatives", "Labour", "Liberal.Democrats",
	"Other"), class = "data.frame", row.names = c("1979", "1983",
	"1987", "1992", "1997", "2001", "2005", "2010", "2015"))


	## ----diffsout, results = "asis"------------------------------------------
	diffs <- shares - polls
	diffs <- rbind(diffs, colMeans(diffs))
	rownames(diffs)[nrow(diffs)] <- "Average"
	pander(diffs)

	## ----ecm-----------------------------------------------------------------
	example.shares <- c(0.44, 0.29, 0.11, 0.16)

	ecm <- function(x, y) { ## expect correlation matrix
	-1 * sqrt((xy)/((1-x)(1-y)))
	}
	mat <- outer(example.shares, example.shares, ecm)
	diag(mat) <- 1
	pander(mat)

	## ----cor2cov-------------------------------------------------------------
	cor2cov = function(corMat, varVec) {
	# test the input
	if (!is.matrix(corMat)) stop("'corMat must be a matrix")
	n = nrow(corMat)
	if (ncol(corMat) != n) stop("'corMat' must be square")
	if (mode(corMat) != "numeric") stop("'corMat must be numeric")
	if (mode(varVec) != "numeric") stop("'varVec must be numeric")
	if (!is.null(dim(varVec))) {
	if (length(dim(varVec)) != 2) stop("'varVec' should be a vector")
	if (any(dim(varVec)==1)) stop("'varVec' cannot be a matrix")
	varVec = as.numeric(varVec) # convert row or col matrix to a vector
	}
	if (!all(diag(corMat) == 1)) stop("correlation matrices have 1 on the diagonal")
	if (any(corMat < -1 \| corMat > +1))
	stop("correlations must be between -1 and 1")
	if (any(varVec <= 0)) stop("variances must be non-negative")
	if (length(varVec) != n) stop("length of 'varMat' does not match 'corMat' size")

	# Compute the covariance
	sdMat = diag(sqrt(varVec))
	rtn = sdMat %% corMat %% t(sdMat)
	if (det(rtn)<=0) warning("covariance matrix is not positive definite")
	return(rtn)
	}

	## ----cv------------------------------------------------------------------

	check_calibration <- function(sigma = 0.01, alpha = 0.05,
	p = polls, s = shares, nSims = 1000) {
	calibs <- numeric(0)
	for (i in 1:nrow(s)) {
	errs <- s[-i,] - p[-i,]
	delta <- colMeans(errs)

	cormat <- outer(as.vector(unlist(p[i,])),
	as.vector(unlist(p[i,])),
	ecm)
	diag(cormat) <- 1
	cormat <- as.matrix(nearPD(cormat)$mat)
	diag(cormat) <- 1
	Sigma <- cor2cov(corMat = cormat, varVec = rep(sigma, ncol(cormat)))
	deviates <- rmvnorm(n = nSims,
	mean = as.vector(unlist(p[i,] + delta)),
	sigma = Sigma)
	ci <- apply(deviates, 2, quantile, c(alpha/2, 1 - (alpha/2)))
	calib <- apply(s[i,] > ci[1,] & s[i,] < ci[2,], 2, min)

	calibs <- c(calibs, sum(calib))
	}
	return(sum(calibs) / length(unlist(shares)))
	}

	## ----plotfigs, fig = TRUE, fig.cap = "Calibration"-----------------------
	### Let's evaluate this function for a range of values
	test.df <- expand.grid(alpha = c(0.05, 0.1, 0.5),
	sigma = seq(0.0001, 0.005, length.out = 100))
	test.df$calib <- apply(test.df, 1, function(x)
	check_calibration(sigma = x["sigma"], alpha = x["alpha"]))

	plot(test.df$sigma, test.df$calib,
	type = "n")
	with(subset(test.df, alpha == 0.05),
	lines(sigma, calib, lty = 1, col = 'red'))
	with(subset(test.df, alpha == 0.1),
	lines(sigma, calib, lty = 2, col = 'darkblue'))
	with(subset(test.df, alpha == 0.5),
	lines(sigma, calib, lty = 3, col = 'forestgreen'))

	for (a in c(0.5, 0.9, 0.95)) {
	abline(h = a, lty = 1, col = 'grey')
	}
	legend("bottomright",
	lty = 1:3,
	col = c("red", "darkblue", "forestgreen"),
	legend = c("95%", "90%", "50%"))

	### Find out the actual values
	test.df <- test.df[order(test.df$sigma),]
	sigma1 <- test.df$sigma[which(test.df$calib > 0.9 & test.df$alpha == 0.1)[1]]
	sigma2 <- test.df$sigma[which(test.df$calib > 0.95 & test.df$alpha == 0.05)[1]]
	sigma3 <- test.df$sigma[which(test.df$calib > 0.95 & test.df$alpha == 0.05)[1]]
	chosen.sigma <- min(c(sigma1, sigma2, sigma3))

	## ----newsims-------------------------------------------------------------
	### Now simulate
	newpolls <- c(0.43, 0.28, 0.11)
	newpolls <- c(newpolls, 1 - sum(newpolls))
	cormat <- outer(newpolls,
	newpolls,
	ecm)

	diag(cormat) <- 1
	cormat <- as.matrix(nearPD(cormat)$mat)
	diag(cormat) <- 1
	Sigma <- cor2cov(corMat = cormat, varVec = rep(chosen.sigma, ncol(cormat)))


	meanmat <- matrix(newpolls,
	nrow = 1000,
	ncol = length(newpolls),
	byrow = TRUE)
	deviates <- rmvnorm(n = 1000,
	mean = newpolls,
	sigma = Sigma)