Created
September 4, 2018 19:44
-
-
Save chrishanretty/c19c5feb06eabffcee2c80399946a492 to your computer and use it in GitHub Desktop.
Generate simulated Swedish election based on historical error patterns
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rvest) | |
library(tidyverse) | |
library(mvtnorm) | |
library(Matrix) | |
library(foreign) | |
library(reshape2) | |
library(pander) | |
## Now we can try and calibrate results according to past results | |
## To do that, we'll load information on past results | |
## You'll need to download the data from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/8421DX | |
load("~/Dropbox/forecasting/common/data/polling/LONG_MI_NATURE_20180111.RData") | |
hist <- x %>% | |
filter(countryid == 36) %>% ## Sweden only | |
filter(daysbeforeED == 7) ## One week before | |
## We'll convert this information into two matrices | |
## one matrix holding actual election shares | |
## one holding polling averages | |
polls <- acast(hist, elecdate ~ partyid, value.var = "poll_") | |
shares <- acast(hist, elecdate ~ partyid, value.var = "vote_") | |
polls <- polls / 100 | |
shares <- shares / 100 | |
## There are some missing values we'll fill in | |
## For missing polling information, we'll divide the left-over percentage equally amongst the parties | |
for (i in 1:nrow(polls)) { | |
left_over <- 1 - sum(polls[i, ], na.rm = TRUE) | |
missing_spots <- sum(is.na(polls[i,])) | |
polls[i, is.na(polls[i, ])] <- left_over / missing_spots | |
} | |
## For missing election outcomes I fill in the results | |
## For this one would need the codebook | |
shares["2014-09-14", "4"] <- 0.0457 # kristdemokraterna | |
shares["2006-09-17", "8"] <- 0.0293 # sd | |
shares["2002-09-15", "8"] <- 0.0144 # sd | |
shares["2002-09-15", "7"] <- 0.0465 # miljöp. | |
## I'll now need some helper functions | |
## | |
ecm <- function(x, y) { ## Generate the expected correlation matrix | |
## between two compositional vectors | |
-1 * sqrt((x * y) / ((1 - x) * (1 - y))) | |
} | |
cor2cov <- function(corMat, varVec) { | |
## Turn a correlation matrix into a scaled covariance matrix | |
# test the input | |
if (!is.matrix(corMat)) stop("'corMat must be a matrix") | |
n = nrow(corMat) | |
if (ncol(corMat) != n) stop("'corMat' must be square") | |
if (mode(corMat) != "numeric") stop("'corMat must be numeric") | |
if (mode(varVec) != "numeric") stop("'varVec must be numeric") | |
if (!is.null(dim(varVec))) { | |
if (length(dim(varVec)) != 2) stop("'varVec' should be a vector") | |
if (any(dim(varVec)==1)) stop("'varVec' cannot be a matrix") | |
varVec <- as.numeric(varVec) # convert row or col matrix to a vector | |
} | |
if (!all(diag(corMat) == 1)) stop("correlation matrices have 1 on the diagonal") | |
if (any(corMat < -1 | corMat > +1)) | |
stop("correlations must be between -1 and 1") | |
if (any(varVec <= 0)) stop("variances must be non-negative") | |
if (length(varVec) != n) stop("length of 'varMat' does not match 'corMat' size") | |
# Compute the covariance | |
sdMat = diag(sqrt(varVec)) | |
rtn = sdMat %*% corMat %*% t(sdMat) | |
if (det(rtn)<=0) warning("covariance matrix is not positive definite") | |
return(rtn) | |
} | |
## I now set up a function to check calibration on other results | |
## given sigma and some level of coverage (1-alpha) | |
check_calibration <- function(sigma = 0.01, alpha = 0.05, | |
p = polls, s = shares, nSims = 1000) { | |
## Set up a holder for each election containing calibration outcome | |
calibs <- numeric(0) | |
for (i in 1:nrow(s)) { | |
my_polls <- as.vector(unlist(p[i, ])) | |
my_shares <- as.vector(unlist(s[i, ])) | |
## Create the expected correlation matrix | |
cormat <- outer(my_shares, | |
my_shares, | |
ecm) | |
diag(cormat) <- 1 | |
## Fix non-singularities | |
cormat <- as.matrix(Matrix::nearPD(cormat)$mat) | |
diag(cormat) <- 1 | |
## Scale it according to sigma | |
Sigma <- cor2cov(corMat = cormat, | |
varVec = rep(sigma, ncol(cormat))) | |
## Generate some election "forecasts" on this basis | |
deviates <- rmvnorm(n = nSims, | |
mean = my_polls, | |
sigma = Sigma) | |
## Get the confidence interval | |
ci <- apply(deviates, 2, quantile, c(alpha/2, 1 - (alpha/2))) | |
## For each party, is the result contained within this interval? | |
calib <- (my_shares > ci[1,] & my_shares < ci[2,]) | |
## Add on to the holder the count of parties within the interval | |
calibs <- c(calibs, sum(calib)) | |
} | |
## Now, each entry in calib will be a count of "covered" figures | |
## We want the proportion over all party-year combos | |
return(sum(calibs) / (nrow(s) * ncol(s))) | |
} | |
### Let's evaluate this function for a range of values | |
test.df <- expand.grid(alpha = c(0.05, 0.1, 0.5), | |
sigma = seq(0.0001, 0.005, | |
length.out = 100)) | |
test.df$calib <- apply(test.df, 1, function(x) { | |
check_calibration(sigma = x["sigma"], | |
alpha = x["alpha"]) | |
}) | |
plot(test.df$sigma, test.df$calib, | |
type = "n") | |
with(subset(test.df, alpha == 0.05), | |
lines(sigma, calib, lty = 1, col = 'red')) | |
with(subset(test.df, alpha == 0.1), | |
lines(sigma, calib, lty = 2, col = 'darkblue')) | |
with(subset(test.df, alpha == 0.5), | |
lines(sigma, calib, lty = 3, col = 'forestgreen')) | |
for (a in c(0.5, 0.9, 0.95)) { | |
abline(h = a, lty = 1, col = 'grey') | |
} | |
legend("bottomright", | |
lty = 1:3, | |
col = c("red", "darkblue", "forestgreen"), | |
legend = c("95%", "90%", "50%")) | |
### Find out the actual values which ensure coverage | |
test.df <- test.df[order(test.df$sigma), ] | |
## Value for alpha = 0.1 | |
sigma1 <- test.df$sigma[which(test.df$calib > 0.9 & test.df$alpha == 0.1)[1]] | |
## Value for alpha = 0.05 | |
sigma2 <- test.df$sigma[which(test.df$calib > 0.95 & test.df$alpha == 0.05)[1]] | |
## Take the larger of the two | |
chosen.sigma <- max(c(sigma1, sigma2)) | |
## Now let's use this value to generate some simulations given current | |
## polling data | |
wiki_url <- "https://en.m.wikipedia.org/wiki/Opinion_polling_for_the_Swedish_general_election,_2018" | |
## Read in the polling table | |
polls <- wiki_url %>% | |
read_html() %>% | |
html_nodes(xpath="//table[1]") %>% | |
html_table(fill = TRUE) | |
## Get the most recent six polls, excluding some bumf at the top | |
polls <- polls[[1]] %>% | |
filter(row_number() > 2) %>% | |
head(6) | |
## Convert to numeric | |
for (i in 3:ncol(polls)) { | |
polls[,i] <- as.numeric(as.character(polls[,i])) | |
} | |
## Just get the shares for named parties | |
the.shares <- colMeans(polls[,c("S", "M", "SD", "MP", "C", "V", | |
"L", "KD", "Fi", "Others")]) | |
the.shares <- the.shares / sum(the.shares) | |
## Create the correlation matrix | |
mat <- outer(the.shares, the.shares, ecm) | |
diag(mat) <- 1 | |
## Fix the matrix | |
cormat <- as.matrix(nearPD(mat)$mat) | |
diag(cormat) <- 1 | |
## Get the covariance matrix | |
## Using the previously chosen value of sigma | |
covmat <- cor2cov(corMat = cormat, | |
varVec = rep(chosen.sigma, ncol(cormat))) | |
## Simulate some election outcomes | |
sims <- rmvnorm(n = 1000, | |
mean = the.shares, | |
sigma = covmat) | |
## What is the probability of SD > S? | |
mean(sims[,"SD"] > sims[,"S"]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment