@chrishanretty
Created September 4, 2018
Generate simulated Swedish election based on historical error patterns
library(rvest)
library(tidyverse)
library(mvtnorm)
library(Matrix)
library(foreign)
library(reshape2)
library(pander)
## Now we can try to calibrate forecasts against past results
## To do that, we'll load information on past results
## You'll need to download the data from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/8421DX
load("~/Dropbox/forecasting/common/data/polling/LONG_MI_NATURE_20180111.RData")
hist <- x %>%
    filter(countryid == 36) %>%   ## Sweden only
    filter(daysbeforeED == 7)     ## One week before
## We'll convert this information into two matrices
## one matrix holding actual election shares
## one holding polling averages
polls <- acast(hist, elecdate ~ partyid, value.var = "poll_")
shares <- acast(hist, elecdate ~ partyid, value.var = "vote_")
polls <- polls / 100
shares <- shares / 100
## There are some missing values we'll need to fill in
## For missing polling figures, divide the left-over share equally among the parties without a poll figure
for (i in 1:nrow(polls)) {
    left_over <- 1 - sum(polls[i, ], na.rm = TRUE)
    missing_spots <- sum(is.na(polls[i, ]))
    polls[i, is.na(polls[i, ])] <- left_over / missing_spots
}
## For missing election outcomes, I fill in the actual results by hand
## (matching party IDs to parties requires the dataset's codebook)
shares["2014-09-14", "4"] <- 0.0457 # kristdemokraterna
shares["2006-09-17", "8"] <- 0.0293 # sd
shares["2002-09-15", "8"] <- 0.0144 # sd
shares["2002-09-15", "7"] <- 0.0465 # miljöp.
## I'll now need some helper functions
##
ecm <- function(x, y) {
    ## Generate the expected correlation between two components
    ## of a compositional vector
    -1 * sqrt((x * y) / ((1 - x) * (1 - y)))
}
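## The formula above appears to be the correlation between the components of
## a multinomial distribution. A quick illustration (mine, not part of the
## original analysis): with only two parties the shares are perfectly
## negatively correlated, so ecm() should return -1; outer() builds the full
## matrix for more parties.
ecm(0.6, 0.4)                                    ## two-party case: exactly -1
outer(c(0.5, 0.3, 0.2), c(0.5, 0.3, 0.2), ecm)   ## three-party toy example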
cor2cov <- function(corMat, varVec) {
    ## Turn a correlation matrix into a scaled covariance matrix
    ## Test the input
    if (!is.matrix(corMat)) stop("'corMat' must be a matrix")
    n <- nrow(corMat)
    if (ncol(corMat) != n) stop("'corMat' must be square")
    if (mode(corMat) != "numeric") stop("'corMat' must be numeric")
    if (mode(varVec) != "numeric") stop("'varVec' must be numeric")
    if (!is.null(dim(varVec))) {
        if (length(dim(varVec)) != 2) stop("'varVec' should be a vector")
        if (!any(dim(varVec) == 1)) stop("'varVec' cannot be a matrix")
        varVec <- as.numeric(varVec) # convert a row or column matrix to a vector
    }
    if (!all(diag(corMat) == 1)) stop("a correlation matrix must have 1s on the diagonal")
    if (any(corMat < -1 | corMat > +1)) stop("correlations must be between -1 and 1")
    if (any(varVec <= 0)) stop("variances must be positive")
    if (length(varVec) != n) stop("length of 'varVec' does not match the size of 'corMat'")
    ## Compute the covariance matrix: Sigma = D %*% R %*% D, where D = diag(sd)
    sdMat <- diag(sqrt(varVec))
    rtn <- sdMat %*% corMat %*% t(sdMat)
    if (det(rtn) <= 0) warning("covariance matrix is not positive definite")
    return(rtn)
}
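## A small illustrative call to cor2cov() (not in the original script): a
## 2 x 2 correlation matrix with correlation -0.5 and variances of 0.01.
## The diagonal of the result should equal the supplied variances.
toy_cor <- matrix(c(1, -0.5, -0.5, 1), nrow = 2)
cor2cov(corMat = toy_cor, varVec = c(0.01, 0.01))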
## I now set up a function to check calibration against the historical results
## given sigma and some level of coverage (1-alpha)
check_calibration <- function(sigma = 0.01, alpha = 0.05,
                              p = polls, s = shares, nSims = 1000) {
    ## Set up a holder for each election's calibration outcome
    calibs <- numeric(0)
    for (i in 1:nrow(s)) {
        my_polls <- as.vector(unlist(p[i, ]))
        my_shares <- as.vector(unlist(s[i, ]))
        ## Create the expected correlation matrix
        cormat <- outer(my_shares, my_shares, ecm)
        diag(cormat) <- 1
        ## Coerce it to the nearest positive definite matrix
        cormat <- as.matrix(Matrix::nearPD(cormat)$mat)
        diag(cormat) <- 1
        ## Scale it according to sigma
        Sigma <- cor2cov(corMat = cormat,
                         varVec = rep(sigma, ncol(cormat)))
        ## Generate some election "forecasts" on this basis
        deviates <- rmvnorm(n = nSims,
                            mean = my_polls,
                            sigma = Sigma)
        ## Get the (1 - alpha) interval from the simulations
        ci <- apply(deviates, 2, quantile, c(alpha / 2, 1 - (alpha / 2)))
        ## For each party, is the result contained within this interval?
        calib <- (my_shares > ci[1, ] & my_shares < ci[2, ])
        ## Add the count of parties within the interval to the holder
        calibs <- c(calibs, sum(calib))
    }
    ## Each entry in calibs is a count of "covered" results for one election;
    ## we want the proportion over all party-year combinations
    return(sum(calibs) / (nrow(s) * ncol(s)))
}
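## Example call (illustrative values, not the calibrated ones): the share of
## party-level results covered by 95% intervals when every party's forecast
## variance is 0.001. With nSims = 1000 the figure varies slightly by run.
check_calibration(sigma = 0.001, alpha = 0.05)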
### Let's evaluate this function for a range of values
test.df <- expand.grid(alpha = c(0.05, 0.1, 0.5),
                       sigma = seq(0.0001, 0.005, length.out = 100))
test.df$calib <- apply(test.df, 1, function(x) {
    check_calibration(sigma = x["sigma"],
                      alpha = x["alpha"])
})
plot(test.df$sigma, test.df$calib, type = "n")
with(subset(test.df, alpha == 0.05),
     lines(sigma, calib, lty = 1, col = 'red'))
with(subset(test.df, alpha == 0.1),
     lines(sigma, calib, lty = 2, col = 'darkblue'))
with(subset(test.df, alpha == 0.5),
     lines(sigma, calib, lty = 3, col = 'forestgreen'))
for (a in c(0.5, 0.9, 0.95)) {
    abline(h = a, lty = 1, col = 'grey')
}
legend("bottomright",
       lty = 1:3,
       col = c("red", "darkblue", "forestgreen"),
       legend = c("95%", "90%", "50%"))
### Find out the actual values which ensure coverage
test.df <- test.df[order(test.df$sigma), ]
## Value for alpha = 0.1
sigma1 <- test.df$sigma[which(test.df$calib > 0.9 & test.df$alpha == 0.1)[1]]
## Value for alpha = 0.05
sigma2 <- test.df$sigma[which(test.df$calib > 0.95 & test.df$alpha == 0.05)[1]]
## Take the larger of the two
chosen.sigma <- max(c(sigma1, sigma2))
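## For interpretation (a note of mine, not in the original): sigma is a
## variance on the vote-share scale, so its square root is the implied
## per-party standard deviation; a value of 0.002, say, would imply a
## standard deviation of roughly 4.5 percentage points.
sqrt(chosen.sigma)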
## Now let's use this value to generate some simulations given current
## polling data
wiki_url <- "https://en.m.wikipedia.org/wiki/Opinion_polling_for_the_Swedish_general_election,_2018"
## Read in the polling table
polls <- wiki_url %>%
    read_html() %>%
    html_nodes(xpath = "//table[1]") %>%
    html_table(fill = TRUE)
## Get the most recent six polls, excluding some bumf at the top
polls <- polls[[1]] %>%
    filter(row_number() > 2) %>%
    head(6)
## Convert to numeric
for (i in 3:ncol(polls)) {
    polls[, i] <- as.numeric(as.character(polls[, i]))
}
## Just get the shares for named parties
the.shares <- colMeans(polls[, c("S", "M", "SD", "MP", "C", "V",
                                 "L", "KD", "Fi", "Others")])
the.shares <- the.shares / sum(the.shares)
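## Optional sanity check: the.shares should be a named vector summing to one.
## If any entry is NA, one of the six polls is missing a figure for that
## party; adding na.rm = TRUE to the colMeans() call above would handle that.
round(the.shares, 3)
sum(the.shares)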
## Create the correlation matrix
mat <- outer(the.shares, the.shares, ecm)
diag(mat) <- 1
## Coerce it to the nearest positive definite matrix
cormat <- as.matrix(nearPD(mat)$mat)
diag(cormat) <- 1
## Get the covariance matrix
## Using the previously chosen value of sigma
covmat <- cor2cov(corMat = cormat,
                  varVec = rep(chosen.sigma, ncol(cormat)))
## Simulate some election outcomes
sims <- rmvnorm(n = 1000,
                mean = the.shares,
                sigma = covmat)
## What is the probability of SD > S?
mean(sims[,"SD"] > sims[,"S"])
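## A couple of further summaries of the same simulations (illustrative
## additions, not in the original gist): the probability that each party has
## the largest simulated vote share, and a 90% interval for each party.
table(colnames(sims)[apply(sims, 1, which.max)]) / nrow(sims)
apply(sims, 2, quantile, probs = c(0.05, 0.95))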