chrishanretty/gen_swedish_sims.R

## gen_swedish_sims.R
library(rvest)
library(tidyverse)
library(mvtnorm)
library(Matrix)
library(foreign)
library(reshape2)
library(pander)

## Now we can try and calibrate results according to past results
## To do that, we'll load information on past results
## You'll need to download the data from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/8421DX
load("~/Dropbox/forecasting/common/data/polling/LONG_MI_NATURE_20180111.RData")
hist <- x %>%
    filter(countryid == 36) %>% ## Sweden only
    filter(daysbeforeED == 7) ## One week before

## We'll convert this information into two matrices
## one matrix holding actual election shares
## one holding polling averages
polls <- acast(hist, elecdate ~ partyid, value.var = "poll_")
shares <- acast(hist, elecdate ~ partyid, value.var = "vote_")

polls <- polls / 100
shares <- shares / 100

## There are some missing values we'll fill in
## For missing polling information, we'll divide the left-over percentage equally amongst the parties
for (i in 1:nrow(polls)) {
    left_over <- 1 - sum(polls[i, ], na.rm = TRUE)
    missing_spots <- sum(is.na(polls[i,]))
    polls[i, is.na(polls[i, ])] <- left_over / missing_spots
}

## For missing election outcomes I fill in the results
## For this one would need the codebook
shares["2014-09-14", "4"] <- 0.0457 # kristdemokraterna
shares["2006-09-17", "8"] <- 0.0293 # sd
shares["2002-09-15", "8"] <- 0.0144 # sd
shares["2002-09-15", "7"] <- 0.0465 # miljöp.

## I'll now need some helper functions
##
ecm <- function(x, y) { ## Generate the expected correlation matrix
    ## between two compositional vectors
    -1 * sqrt((x * y) / ((1 - x) * (1 - y)))
}

cor2cov <- function(corMat, varVec) {
    ## Turn a correlation matrix into a scaled covariance matrix
  # test the input
  if (!is.matrix(corMat)) stop("'corMat must be a matrix")
  n = nrow(corMat)
  if (ncol(corMat) != n) stop("'corMat' must be square")
  if (mode(corMat) != "numeric") stop("'corMat must be numeric")
  if (mode(varVec) != "numeric") stop("'varVec must be numeric")
  if (!is.null(dim(varVec))) {
    if (length(dim(varVec)) != 2) stop("'varVec' should be a vector")
    if (any(dim(varVec)==1)) stop("'varVec' cannot be a matrix")
    varVec <- as.numeric(varVec) # convert row or col matrix to a vector
  }
  if (!all(diag(corMat) == 1)) stop("correlation matrices have 1 on the diagonal")
  if (any(corMat < -1 | corMat > +1))
    stop("correlations must be between -1 and 1")
  if (any(varVec <= 0)) stop("variances must be non-negative")
   if (length(varVec) != n) stop("length of 'varMat' does not match 'corMat' size")

  # Compute the covariance
  sdMat = diag(sqrt(varVec))
  rtn = sdMat %*% corMat %*% t(sdMat)
  if (det(rtn)<=0) warning("covariance matrix is not positive definite")
  return(rtn)
}

## I now set up a function to check calibration on other results
## given sigma and some level of coverage (1-alpha)
check_calibration <- function(sigma = 0.01, alpha = 0.05,
                              p = polls, s = shares, nSims = 1000) {

    ## Set up a holder for each election containing calibration outcome
    calibs <- numeric(0)
    for (i in 1:nrow(s)) {
        my_polls <- as.vector(unlist(p[i, ]))
        my_shares <- as.vector(unlist(s[i, ]))
        ## Create the expected correlation matrix
        cormat <- outer(my_shares,
                        my_shares,
                        ecm)
        diag(cormat) <- 1
        ## Fix non-singularities
        cormat <- as.matrix(Matrix::nearPD(cormat)$mat)
        diag(cormat) <- 1
        ## Scale it according to sigma
        Sigma <- cor2cov(corMat = cormat,
                         varVec = rep(sigma, ncol(cormat)))
        ## Generate some election "forecasts" on this basis
        deviates <- rmvnorm(n = nSims,
                            mean = my_polls,
                            sigma = Sigma)
        ## Get the confidence interval
        ci <- apply(deviates, 2, quantile, c(alpha/2, 1 - (alpha/2)))
        ## For each party, is the result contained within this interval?
        calib <- (my_shares > ci[1,] & my_shares < ci[2,])
        ## Add on to the holder the count of parties within the interval
        calibs <- c(calibs, sum(calib))
    }
    ## Now, each entry in calib will be a count of "covered" figures
    ## We want the proportion over all party-year combos
    return(sum(calibs) / (nrow(s) * ncol(s)))
}

### Let's evaluate this function for a range of values
test.df <- expand.grid(alpha = c(0.05, 0.1, 0.5),
                       sigma = seq(0.0001, 0.005,
                                   length.out = 100))
test.df$calib <- apply(test.df, 1, function(x) {
    check_calibration(sigma = x["sigma"],
                      alpha = x["alpha"])
})

plot(test.df$sigma, test.df$calib,
     type = "n")
with(subset(test.df, alpha == 0.05),
     lines(sigma, calib, lty = 1, col = 'red'))
with(subset(test.df, alpha == 0.1),
     lines(sigma, calib, lty = 2, col = 'darkblue'))
with(subset(test.df, alpha == 0.5),
     lines(sigma, calib, lty = 3, col = 'forestgreen'))

for (a in c(0.5, 0.9, 0.95)) {
    abline(h = a, lty = 1, col = 'grey')
}
legend("bottomright",
       lty = 1:3,
       col = c("red", "darkblue", "forestgreen"),
       legend = c("95%", "90%", "50%"))

### Find out the actual values which ensure coverage
test.df <- test.df[order(test.df$sigma), ]
## Value for alpha = 0.1
sigma1 <- test.df$sigma[which(test.df$calib > 0.9 & test.df$alpha == 0.1)[1]]
## Value for alpha = 0.05
sigma2 <- test.df$sigma[which(test.df$calib > 0.95 & test.df$alpha == 0.05)[1]]
## Take the larger of the two
chosen.sigma <- max(c(sigma1, sigma2))

## Now let's use this value to generate some simulations given current
## polling data
wiki_url <- "https://en.m.wikipedia.org/wiki/Opinion_polling_for_the_Swedish_general_election,_2018"

## Read in the polling table
polls <- wiki_url %>%
    read_html() %>%
    html_nodes(xpath="//table[1]") %>%
    html_table(fill = TRUE)

## Get the most recent six polls, excluding some bumf at the top
polls <- polls[[1]] %>%
    filter(row_number() > 2) %>%
    head(6)

## Convert to numeric
for (i in 3:ncol(polls)) {
    polls[,i] <- as.numeric(as.character(polls[,i]))
}

## Just get the shares for named parties
the.shares <- colMeans(polls[,c("S", "M", "SD", "MP", "C", "V",
                                "L", "KD", "Fi", "Others")])

the.shares <- the.shares / sum(the.shares)

## Create the correlation matrix
mat <- outer(the.shares, the.shares, ecm)
diag(mat) <- 1

## Fix the matrix
cormat <- as.matrix(nearPD(mat)$mat)
diag(cormat) <- 1

## Get the covariance matrix
## Using the previously chosen value of sigma
covmat <- cor2cov(corMat = cormat,
                  varVec = rep(chosen.sigma, ncol(cormat)))

## Simulate some election outcomes
sims <- rmvnorm(n = 1000,
                mean = the.shares,
                sigma = covmat)

## What is the probability of SD > S?
mean(sims[,"SD"] > sims[,"S"])
	library(rvest)
	library(tidyverse)
	library(mvtnorm)
	library(Matrix)
	library(foreign)
	library(reshape2)
	library(pander)

	## Now we can try and calibrate results according to past results
	## To do that, we'll load information on past results
	## You'll need to download the data from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/8421DX
	load("~/Dropbox/forecasting/common/data/polling/LONG_MI_NATURE_20180111.RData")
	hist <- x %>%
	filter(countryid == 36) %>% ## Sweden only
	filter(daysbeforeED == 7) ## One week before

	## We'll convert this information into two matrices
	## one matrix holding actual election shares
	## one holding polling averages
	polls <- acast(hist, elecdate ~ partyid, value.var = "poll_")
	shares <- acast(hist, elecdate ~ partyid, value.var = "vote_")

	polls <- polls / 100
	shares <- shares / 100

	## There are some missing values we'll fill in
	## For missing polling information, we'll divide the left-over percentage equally amongst the parties
	for (i in 1:nrow(polls)) {
	left_over <- 1 - sum(polls[i, ], na.rm = TRUE)
	missing_spots <- sum(is.na(polls[i,]))
	polls[i, is.na(polls[i, ])] <- left_over / missing_spots
	}

	## For missing election outcomes I fill in the results
	## For this one would need the codebook
	shares["2014-09-14", "4"] <- 0.0457 # kristdemokraterna
	shares["2006-09-17", "8"] <- 0.0293 # sd
	shares["2002-09-15", "8"] <- 0.0144 # sd
	shares["2002-09-15", "7"] <- 0.0465 # miljöp.

	## I'll now need some helper functions
	##
	ecm <- function(x, y) { ## Generate the expected correlation matrix
	## between two compositional vectors
	-1 * sqrt((x * y) / ((1 - x) * (1 - y)))
	}

	cor2cov <- function(corMat, varVec) {
	## Turn a correlation matrix into a scaled covariance matrix
	# test the input
	if (!is.matrix(corMat)) stop("'corMat must be a matrix")
	n = nrow(corMat)
	if (ncol(corMat) != n) stop("'corMat' must be square")
	if (mode(corMat) != "numeric") stop("'corMat must be numeric")
	if (mode(varVec) != "numeric") stop("'varVec must be numeric")
	if (!is.null(dim(varVec))) {
	if (length(dim(varVec)) != 2) stop("'varVec' should be a vector")
	if (any(dim(varVec)==1)) stop("'varVec' cannot be a matrix")
	varVec <- as.numeric(varVec) # convert row or col matrix to a vector
	}
	if (!all(diag(corMat) == 1)) stop("correlation matrices have 1 on the diagonal")
	if (any(corMat < -1 \| corMat > +1))
	stop("correlations must be between -1 and 1")
	if (any(varVec <= 0)) stop("variances must be non-negative")
	if (length(varVec) != n) stop("length of 'varMat' does not match 'corMat' size")

	# Compute the covariance
	sdMat = diag(sqrt(varVec))
	rtn = sdMat %% corMat %% t(sdMat)
	if (det(rtn)<=0) warning("covariance matrix is not positive definite")
	return(rtn)
	}

	## I now set up a function to check calibration on other results
	## given sigma and some level of coverage (1-alpha)
	check_calibration <- function(sigma = 0.01, alpha = 0.05,
	p = polls, s = shares, nSims = 1000) {

	## Set up a holder for each election containing calibration outcome
	calibs <- numeric(0)
	for (i in 1:nrow(s)) {
	my_polls <- as.vector(unlist(p[i, ]))
	my_shares <- as.vector(unlist(s[i, ]))
	## Create the expected correlation matrix
	cormat <- outer(my_shares,
	my_shares,
	ecm)
	diag(cormat) <- 1
	## Fix non-singularities
	cormat <- as.matrix(Matrix::nearPD(cormat)$mat)
	diag(cormat) <- 1
	## Scale it according to sigma
	Sigma <- cor2cov(corMat = cormat,
	varVec = rep(sigma, ncol(cormat)))
	## Generate some election "forecasts" on this basis
	deviates <- rmvnorm(n = nSims,
	mean = my_polls,
	sigma = Sigma)
	## Get the confidence interval
	ci <- apply(deviates, 2, quantile, c(alpha/2, 1 - (alpha/2)))
	## For each party, is the result contained within this interval?
	calib <- (my_shares > ci[1,] & my_shares < ci[2,])
	## Add on to the holder the count of parties within the interval
	calibs <- c(calibs, sum(calib))
	}
	## Now, each entry in calib will be a count of "covered" figures
	## We want the proportion over all party-year combos
	return(sum(calibs) / (nrow(s) * ncol(s)))
	}

	### Let's evaluate this function for a range of values
	test.df <- expand.grid(alpha = c(0.05, 0.1, 0.5),
	sigma = seq(0.0001, 0.005,
	length.out = 100))
	test.df$calib <- apply(test.df, 1, function(x) {
	check_calibration(sigma = x["sigma"],
	alpha = x["alpha"])
	})

	plot(test.df$sigma, test.df$calib,
	type = "n")
	with(subset(test.df, alpha == 0.05),
	lines(sigma, calib, lty = 1, col = 'red'))
	with(subset(test.df, alpha == 0.1),
	lines(sigma, calib, lty = 2, col = 'darkblue'))
	with(subset(test.df, alpha == 0.5),
	lines(sigma, calib, lty = 3, col = 'forestgreen'))

	for (a in c(0.5, 0.9, 0.95)) {
	abline(h = a, lty = 1, col = 'grey')
	}
	legend("bottomright",
	lty = 1:3,
	col = c("red", "darkblue", "forestgreen"),
	legend = c("95%", "90%", "50%"))

	### Find out the actual values which ensure coverage
	test.df <- test.df[order(test.df$sigma), ]
	## Value for alpha = 0.1
	sigma1 <- test.df$sigma[which(test.df$calib > 0.9 & test.df$alpha == 0.1)[1]]
	## Value for alpha = 0.05
	sigma2 <- test.df$sigma[which(test.df$calib > 0.95 & test.df$alpha == 0.05)[1]]
	## Take the larger of the two
	chosen.sigma <- max(c(sigma1, sigma2))

	## Now let's use this value to generate some simulations given current
	## polling data
	wiki_url <- "https://en.m.wikipedia.org/wiki/Opinion_polling_for_the_Swedish_general_election,_2018"

	## Read in the polling table
	polls <- wiki_url %>%
	read_html() %>%
	html_nodes(xpath="//table[1]") %>%
	html_table(fill = TRUE)

	## Get the most recent six polls, excluding some bumf at the top
	polls <- polls[[1]] %>%
	filter(row_number() > 2) %>%
	head(6)

	## Convert to numeric
	for (i in 3:ncol(polls)) {
	polls[,i] <- as.numeric(as.character(polls[,i]))
	}

	## Just get the shares for named parties
	the.shares <- colMeans(polls[,c("S", "M", "SD", "MP", "C", "V",
	"L", "KD", "Fi", "Others")])

	the.shares <- the.shares / sum(the.shares)

	## Create the correlation matrix
	mat <- outer(the.shares, the.shares, ecm)
	diag(mat) <- 1

	## Fix the matrix
	cormat <- as.matrix(nearPD(mat)$mat)
	diag(cormat) <- 1

	## Get the covariance matrix
	## Using the previously chosen value of sigma
	covmat <- cor2cov(corMat = cormat,
	varVec = rep(chosen.sigma, ncol(cormat)))

	## Simulate some election outcomes
	sims <- rmvnorm(n = 1000,
	mean = the.shares,
	sigma = covmat)

	## What is the probability of SD > S?
	mean(sims[,"SD"] > sims[,"S"])