Created
May 10, 2017 08:49
-
-
Save chrishanretty/4827cec6a2636481060acc4d3d439e1b to your computer and use it in GitHub Desktop.
Forecast election day results on the basis of final polls
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Forecast election day results on the basis of final polls | |
### Some of this code is clunky (particularly calibration) | |
### but it is a useful test bed if you want to move from | |
### a multivariate normal to another distribution (say, a | |
### multivariate t distribution) -- just change the code in | |
### check_calibration. Do, however, read | |
### https://journal.r-project.org/archive/2013-2/hofert.pdf | |
### first! | |
## ----loadlibs------------------------------------------------------------ | |
library(knitr) | |
library(pander) | |
library(Matrix) | |
library(mvtnorm) | |
## ----datin--------------------------------------------------------------- | |
polls <- structure(list(Conservatives = c(0.463137689855468, 0.456132838783667, | |
0.42634772561737, 0.375342469945955, 0.308737479390641, 0.322761590590601, | |
0.32407274487598, 0.352164318385373, 0.336886748984473), Labour = c(0.393273527306154, | |
0.266928720258727, 0.335530448967622, 0.391501599270666, 0.478126236740175, | |
0.458478005198966, 0.379647880476463, 0.274403670248034, 0.336584668910591 | |
), Liberal.Democrats = c(0.118964928480032, 0.254004731185892, | |
0.210710045749512, 0.190497666825356, 0.14310292507149, 0.156146559027138, | |
0.223943628451435, 0.273044090920638, 0.0849106722877976), Other = c(0.0246238543583452, | |
0.0229337097717135, 0.0274117796654951, 0.0426582639580229, 0.0700333587976943, | |
0.0626138451832949, 0.0723357461961213, 0.100387920445954, 0.241617909817138 | |
)), .Names = c("Conservatives", "Labour", "Liberal.Democrats", | |
"Other"), class = "data.frame", row.names = c("1979", "1983", | |
"1987", "1992", "1997", "2001", "2005", "2010", "2015")) | |
shares <- structure(list(Conservatives = c(0.447774470245795, 0.43485092464566, | |
0.432466614101659, 0.428498495558431, 0.314618578007995, 0.324463265750917, | |
0.332359474708807, 0.368804460197394, 0.376137629058115), Labour = c(0.376979254862873, | |
0.282617296534093, 0.31519111447088, 0.351497022738108, 0.442984244236696, | |
0.416369176542003, 0.361397077962564, 0.296545633515872, 0.311041250597578 | |
), Liberal.Democrats = c(0.141014904292, 0.260029558584563, 0.230726978209406, | |
0.182418283404197, 0.171809011855532, 0.186903837282912, 0.226447535045442, | |
0.235549351036152, 0.0804167859715201), Other = c(0.0342313705993318, | |
0.0225022202356833, 0.0216152932180549, 0.0375861982992636, 0.0705881658997772, | |
0.0722637204241681, 0.0797959122831868, 0.0991005552505823, 0.232404334372786 | |
)), .Names = c("Conservatives", "Labour", "Liberal.Democrats", | |
"Other"), class = "data.frame", row.names = c("1979", "1983", | |
"1987", "1992", "1997", "2001", "2005", "2010", "2015")) | |
## ----diffsout, results = "asis"------------------------------------------ | |
diffs <- shares - polls | |
diffs <- rbind(diffs, colMeans(diffs)) | |
rownames(diffs)[nrow(diffs)] <- "Average" | |
pander(diffs) | |
## ----ecm----------------------------------------------------------------- | |
example.shares <- c(0.44, 0.29, 0.11, 0.16) | |
ecm <- function(x, y) { ## expect correlation matrix | |
-1 * sqrt((x*y)/((1-x)*(1-y))) | |
} | |
mat <- outer(example.shares, example.shares, ecm) | |
diag(mat) <- 1 | |
pander(mat) | |
## ----cor2cov------------------------------------------------------------- | |
cor2cov = function(corMat, varVec) { | |
# test the input | |
if (!is.matrix(corMat)) stop("'corMat must be a matrix") | |
n = nrow(corMat) | |
if (ncol(corMat) != n) stop("'corMat' must be square") | |
if (mode(corMat) != "numeric") stop("'corMat must be numeric") | |
if (mode(varVec) != "numeric") stop("'varVec must be numeric") | |
if (!is.null(dim(varVec))) { | |
if (length(dim(varVec)) != 2) stop("'varVec' should be a vector") | |
if (any(dim(varVec)==1)) stop("'varVec' cannot be a matrix") | |
varVec = as.numeric(varVec) # convert row or col matrix to a vector | |
} | |
if (!all(diag(corMat) == 1)) stop("correlation matrices have 1 on the diagonal") | |
if (any(corMat < -1 | corMat > +1)) | |
stop("correlations must be between -1 and 1") | |
if (any(varVec <= 0)) stop("variances must be non-negative") | |
if (length(varVec) != n) stop("length of 'varMat' does not match 'corMat' size") | |
# Compute the covariance | |
sdMat = diag(sqrt(varVec)) | |
rtn = sdMat %*% corMat %*% t(sdMat) | |
if (det(rtn)<=0) warning("covariance matrix is not positive definite") | |
return(rtn) | |
} | |
## ----cv------------------------------------------------------------------ | |
check_calibration <- function(sigma = 0.01, alpha = 0.05, | |
p = polls, s = shares, nSims = 1000) { | |
calibs <- numeric(0) | |
for (i in 1:nrow(s)) { | |
errs <- s[-i,] - p[-i,] | |
delta <- colMeans(errs) | |
cormat <- outer(as.vector(unlist(p[i,])), | |
as.vector(unlist(p[i,])), | |
ecm) | |
diag(cormat) <- 1 | |
cormat <- as.matrix(nearPD(cormat)$mat) | |
diag(cormat) <- 1 | |
Sigma <- cor2cov(corMat = cormat, varVec = rep(sigma, ncol(cormat))) | |
deviates <- rmvnorm(n = nSims, | |
mean = as.vector(unlist(p[i,] + delta)), | |
sigma = Sigma) | |
ci <- apply(deviates, 2, quantile, c(alpha/2, 1 - (alpha/2))) | |
calib <- apply(s[i,] > ci[1,] & s[i,] < ci[2,], 2, min) | |
calibs <- c(calibs, sum(calib)) | |
} | |
return(sum(calibs) / length(unlist(shares))) | |
} | |
## ----plotfigs, fig = TRUE, fig.cap = "Calibration"----------------------- | |
### Let's evaluate this function for a range of values | |
test.df <- expand.grid(alpha = c(0.05, 0.1, 0.5), | |
sigma = seq(0.0001, 0.005, length.out = 100)) | |
test.df$calib <- apply(test.df, 1, function(x) | |
check_calibration(sigma = x["sigma"], alpha = x["alpha"])) | |
plot(test.df$sigma, test.df$calib, | |
type = "n") | |
with(subset(test.df, alpha == 0.05), | |
lines(sigma, calib, lty = 1, col = 'red')) | |
with(subset(test.df, alpha == 0.1), | |
lines(sigma, calib, lty = 2, col = 'darkblue')) | |
with(subset(test.df, alpha == 0.5), | |
lines(sigma, calib, lty = 3, col = 'forestgreen')) | |
for (a in c(0.5, 0.9, 0.95)) { | |
abline(h = a, lty = 1, col = 'grey') | |
} | |
legend("bottomright", | |
lty = 1:3, | |
col = c("red", "darkblue", "forestgreen"), | |
legend = c("95%", "90%", "50%")) | |
### Find out the actual values | |
test.df <- test.df[order(test.df$sigma),] | |
sigma1 <- test.df$sigma[which(test.df$calib > 0.9 & test.df$alpha == 0.1)[1]] | |
sigma2 <- test.df$sigma[which(test.df$calib > 0.95 & test.df$alpha == 0.05)[1]] | |
sigma3 <- test.df$sigma[which(test.df$calib > 0.95 & test.df$alpha == 0.05)[1]] | |
chosen.sigma <- min(c(sigma1, sigma2, sigma3)) | |
## ----newsims------------------------------------------------------------- | |
### Now simulate | |
newpolls <- c(0.43, 0.28, 0.11) | |
newpolls <- c(newpolls, 1 - sum(newpolls)) | |
cormat <- outer(newpolls, | |
newpolls, | |
ecm) | |
diag(cormat) <- 1 | |
cormat <- as.matrix(nearPD(cormat)$mat) | |
diag(cormat) <- 1 | |
Sigma <- cor2cov(corMat = cormat, varVec = rep(chosen.sigma, ncol(cormat))) | |
meanmat <- matrix(newpolls, | |
nrow = 1000, | |
ncol = length(newpolls), | |
byrow = TRUE) | |
deviates <- rmvnorm(n = 1000, | |
mean = newpolls, | |
sigma = Sigma) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment