Skip to content

Instantly share code, notes, and snippets.

@mattblackwell
Created October 15, 2013 01:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mattblackwell/6985383 to your computer and use it in GitHub Desktop.
Save mattblackwell/6985383 to your computer and use it in GitHub Desktop.
MCMC sampler for polling aggregation with covariance smoothing
## Function to implment the MCMC sampler
polling.gibbs.kal.fixedS <- function(y, size, dates, states, big, ## data
F, X=NULL, XL = NULL, Z=NULL, ## covariates
m.0, C.0, b = NULL, B = NULL, ## hyper-parameters
starts, phi, sigma.a, nu, beta, k, ## starting values
tune, A,
iters = 10000, thin = 1, burnin = 0) { ## mcmc stuff
aa <- proc.time()
## pre-gibbs cleaning (calculate variances, setup times, etc)
require(MCMCpack)
require(mvtnorm)
sigma.sq <- (y * (1-y))/size
days <- dates- min(dates) + 1
T <- max(days)
num.states <- length(unique(states))
state.names <- unique(states)
## starting values
## gammas is the matrix (time by state) of state effects
gammas <- matrix(starts, nrow = T, ncol = num.states)
## Create "projection"
S <- exp(-mh(Z, diag(phi, ncol(Z))))
Delta.0 <- sigma.a * S + diag(rep(nu, ncol(S)))
Sigma.a <- riwish(k, Delta.0)
## Calculate number of draws
draws <- (iters-burnin)/thin
## set up acceptance ratios
acc.a <- acc.nu <- 0
acc.phi <- rep(0, length(phi))
## holder matrices
gamma.hold <- array(NA, dim=c(T,num.states,draws))
phi.hold <- matrix(NA, nrow = ncol(Z), ncol = draws)
sigma.a.hold <- rep(NA, times = draws)
nu.hold <- rep(NA, times = draws)
if (!is.null(X)) {
beta.hold <- matrix(NA, nrow = ncol(XL), ncol = draws)
}
for (i in 1:iters) {
## update alpha.0, gamma.s0
a <- matrix(0, nrow = T, ncol = num.states)
m <- matrix(0, nrow = T, ncol = num.states)
C <- array(0, dim = c(num.states,num.states,T))
R <- array(0, dim = c(num.states,num.states,T))
m[1,] <- m.0
C[,,1] <- C.0
## Begin Forward Filtering
for (t in 2:T) {
## y[,t] = F[[t]] %*% gammas[,t] + Sigma.t
## F relates the states (gammas) to the observations
## so we only need the rows that correspond to the
## states actually being used today
a[t,] <- m[t-1,]
R[,,t] <- C[,,t-1] + Sigma.a
## slight modification for missing data.
## if there are no observations, then the posterior
## mean/variance is the prior mean/variance
if (nrow(big[[t]]) > 0) {
f <- F[[t]] %*% a[t,]
if (!is.null(X)) {
e <- big[[t]]$y - X[[t]] %*% beta - f
} else {
e <- big[[t]]$y - f
}
Sigma.t <- diag(big[[t]]$sigma.sq, nrow(big[[t]]))
Qt <- F[[t]] %*% R[, ,t] %*% t(F[[t]]) + Sigma.t
At <- R[,,t] %*% t(F[[t]]) %*% solve(Qt)
m[t,] <- a[t,] + At %*% e
C[,,t] <- R[,,t] - R[,,t] %*% t(F[[t]]) %*% solve(Qt) %*% F[[t]] %*% R[,,t]
} else {
m[t,] <- a[t,]
C[,,t] <- R[,,t]
}
}
## Begin Backward Sampling
gammas[T,] <- rmvnorm(1, m[T,], C[,,T])
if (!is.null(X)) {
beta.resid <- big[[T]]$y - F[[T]] %*% gammas[T,]
}
for (t in (T-1):1) {
B.t <- C[,,t] %*% solve(R[,,t+1])
h.t <- m[t,] + B.t %*% (gammas[t+1,] - a[t+1,])
H.t <- C[,,t] - B.t %*% R[,,t+1] %*% t(B.t)
gammas[t,] <- rmvnorm(1, h.t, H.t)
if (!is.null(X)) {
beta.resid <- c(beta.resid, big[[t]]$y - F[[t]] %*% gammas[t,])
}
}
if (!is.null(X)) {
## update betas (polling house effects)
Sigma.beta.star <- solve(t(XL) %*% diag(1/sigma.sq) %*% XL + B)
beta.star <- Sigma.beta.star %*% (t(XL) %*% diag(1/sigma.sq) %*% beta.resid + B %*% b)
beta <- mvrnorm(1, beta.star, Sigma.beta.star)
## normalize the house effects
beta <- beta - mean(beta)
}
## Inverse Wishart for Delta
devs <- gammas[2:nrow(gammas),]-gammas[1:(nrow(gammas)-1),]
s.cov <- cov(devs)
Delta.n <- Delta.0 + s.cov + (T-1)/(T)*(colMeans(devs)%*%t(colMeans(devs)))
k.n <- k + T - 1
Sigma.a <- riwish(k.n, Delta.n)
## ## MH step for sigma.a
## sigma.a.can <- rlnorm(1, log(sigma.a), sd = tune[1,1])
## Delta.can <- sigma.a.can * S + diag(rep(nu, ncol(S)))
## hold.can <- -dlnorm(sigma.a.can, log(sigma.a), tune[1,1], log = TRUE)
## hold.cur <- -dlnorm(sigma.a, log(sigma.a.can), tune[1,1], log = TRUE)
## hold.can <- hold.can + log(dinvgamma(sigma.a.can, c.0, d.0))
## hold.cur <- hold.cur + log(dinvgamma(sigma.a, c.0, d.0))
## for (t in 2:T) {
## hold.can <- hold.can + dmvnorm(gammas[t,], gammas[t-1,], Delta.can, log = TRUE)
## hold.cur <- hold.cur + dmvnorm(gammas[t,], gammas[t-1,], Delta, log = TRUE)
## }
## r.a <- exp(hold.can - hold.cur)
## if (runif(1) <= r.a) {
## sigma.a <- sigma.a.can
## acc.a <- acc.a + 1
## }
## Delta <- sigma.a * S + diag(rep(nu, ncol(S)))
## ## MH step for nu ##
## nu.can <- rlnorm(1, log(nu), sd = tune[2,2])
## Delta.can <- sigma.a * S + diag(rep(nu.can, ncol(S)))
## hold.can <- -dlnorm(nu.can, log(nu), tune[2, 2], log = TRUE)
## hold.cur <- -dlnorm(nu, log(nu.can), tune[2, 2], log = TRUE)
## hold.can <- hold.can + log(dinvgamma(nu.can, g.0, h.0))
## hold.cur <- hold.cur + log(dinvgamma(nu, g.0, h.0))
## for (t in 2:T) {
## hold.can <- hold.can + dmvnorm(gammas[t,], gammas[t-1,], Delta.can, log = TRUE)
## hold.cur <- hold.cur + dmvnorm(gammas[t,], gammas[t-1,], Delta, log = TRUE)
## }
## r.a <- exp(hold.can - hold.cur)
## if (runif(1) <= r.a) {
## nu <- nu.can
## acc.nu <- acc.nu + 1
## }
## Delta <- sigma.a * S + diag(rep(nu, ncol(S)))
## ## MH for phi
## for (p in 1:length(phi)) {
## newphi <- rlnorm(1, log(phi[p]), sd = tune[p + 2, p + 2])
## phi.can <- phi
## phi.can[p] <- newphi
## S.can <- exp(-mh(Z, diag(c(phi.can))))
## Delta.can <- sigma.a * S.can + diag(rep(nu, ncol(S)))
## ## Jumping disttribution discount ##
## hold.can <- -dlnorm(newphi, log(phi[p]), tune[p + 2, p + 2], log = TRUE)
## hold.cur <- -dlnorm(phi[p], log(newphi), tune[p + 2, p + 2], log = TRUE)
## ## Priors
## hold.can <- hold.can + log(dinvgamma(newphi, e.0, f.0))
## hold.cur <- hold.cur + log(dinvgamma(phi[p], e.0, f.0))
## for (t in 2:T) {
## hold.can <- hold.can + dmvnorm(gammas[t,], gammas[t-1,], Delta.can, log = TRUE)
## hold.cur <- hold.cur + dmvnorm(gammas[t,], gammas[t-1,], Delta, log = TRUE)
## }
## ## Update phi[p] and Delta
## r.a <- exp(hold.can - hold.cur)
## if (runif(1) <= r.a) {
## phi <- phi.can
## acc.phi[p] <- acc.phi[p] + 1
## S <- S.can
## }
## Delta <- sigma.a * S + diag(rep(nu, ncol(S)))
## }
#acc <- c(acc.a, acc.nu, acc.phi)/i
if ( (((i-burnin) %% thin)==0) & (i >burnin)) {
gamma.hold[,,((i-burnin)/thin)] <- gammas
#phi.hold[,((i-burnin)/thin)] <- phi
#sigma.a.hold[((i-burnin)/thin)] <- sigma.a
#nu.hold[((i-burnin)/thin)] <- nu
if (!is.null(X)) {
beta.hold[,((i-burnin)/thin)] <- beta
save(list = c("gamma.hold", "beta.hold"), #, "phi.hold", "sigma.a.hold", "nu.hold", "acc"),
file = "polls-out.RData")
}
}
if ((i %% 10)==0) cat("\n")
if ((i %% 100)==0) cat("Minutes Run: ",((proc.time()-aa)/60)[3],"\n")
cat(i," ")
#cat("\nCandidate: ",sqrt(sigma.a.can), sqrt(nu.can), phi.can, "\n")
#cat("New: ", sqrt(sigma.a), sqrt(nu), phi, "\n")
#cat("Acceptance rate: ", acc, "\n\n")
flush.console()
}
if (!is.null(X)) {
out <- list(gammas=gamma.hold, betas = beta.hold) #, phi = phi.hold,
#sigma.a = sigma.a.hold, nu = nu.hold,
#acc = acc)
} else {
out <- list(gammas=gamma.hold)#, phi = phi.hold,
#sigma.a = sigma.a.hold, nu = nu.hold,
#acc = acc)
}
return(out)
}
## Mahalanobis distance matrix
mh <- function(x, cov) {
S <- matrix(NA, nrow(x), nrow(x))
for (i in 1:nrow(S)) {
for (j in 1:i) {
S[i,j] <- sqrt(mahalanobis(x[i,], x[j,], cov=cov))
}
}
S[upper.tri(S)] <- t(S)[upper.tri(S)]
return(S)
}
dhalfcauchy <- function(x, A, log = FALSE) {
out <- -A - log(1 + x/(A^2))
if (!log) out <- exp(out)
return(out)
}
polls <- read.csv("allpolls.csv")
polls$pollster <- as.character(levels(polls$pollster)[polls$pollster])
polls$pollster <- gsub('^[[:space:]]+', '',polls$pollster)
polls$pollster <- gsub('[[:space:]]+$', '', polls$pollster)
polls$pollster <- as.factor(polls$pollster)
levels(polls$pollster)[levels(polls$pollster)=="CBS/Times"]<- "CBS/NYT"
levels(polls$pollster)[levels(polls$pollster)=="CBS"]<- "CBS/NYT"
levels(polls$pollster)[levels(polls$pollster)=="Reuters/ C-SPAN/ Zogby"]<- "Zogby"
levels(polls$pollster)[levels(polls$pollster)=="Reuters/Zogby"]<- "Zogby"
levels(polls$pollster)[levels(polls$pollster)=="Research 2000/DailyKos.com (D)"]<- "Research 2000"
levels(polls$pollster)[levels(polls$pollster)=="DailyKos.com (D)/Research 2000"]<- "Research 2000"
levels(polls$pollster)[levels(polls$pollster)=="DailyKos.com (D)/Research 2000"]<- "Research 2000"
levels(polls$pollster)[levels(polls$pollster)=="Post-Dispatch/Research 2000"]<- "Research 2000"
levels(polls$pollster)[levels(polls$pollster)=="Herald-Leader/WKYT/Research 2000"]<- "Research 2000"
levels(polls$pollster)[levels(polls$pollster)=="FOX/Rasmussen"]<- "Rasmussen"
levels(polls$pollster)[levels(polls$pollster)=="Fox/Rasmussen"]<- "Rasmussen"
levels(polls$pollster)[levels(polls$pollster)=="Quinn (R-Graham)"]<- "Quinnipiac"
levels(polls$pollster)[levels(polls$pollster)=="Quinnipiac/WSJ/Post"]<- "Quinnipiac"
levels(polls$pollster)[levels(polls$pollster)=="CNN/Time"]<- "CNN"
levels(polls$pollster)[levels(polls$pollster)=="Economist/YouGov"]<- "YouGov/Polimetrix"
levels(polls$pollster)[levels(polls$pollster)=="Fabrizio, McLaughlin (R)"]<- "McLaughlin (R)"
levels(polls$pollster)[levels(polls$pollster)=="Florida CoC/Kitchens (D)"]<- "FL Chamber of Commerce"
levels(polls$pollster)[levels(polls$pollster)=="Franklin and Marshall/Hearst-Argyle"]<- "Franklin and Marshall College"
levels(polls$pollster)[levels(polls$pollster)=="Marist"]<- "Marist College"
levels(polls$pollster)[levels(polls$pollster)=="NBC/Mason-Dixon"]<- "Mason-Dixon"
levels(polls$pollster)[levels(polls$pollster)=="Politico/InsiderAdvantage"]<- "InsiderAdvantage"
levels(polls$pollster)[levels(polls$pollster)=="POS (R-Chambliss)"]<- "Marist College"
levels(polls$pollster)[levels(polls$pollster)=="POS (R-NRSC)"]<- "Public Opinion Strategies (R)"
levels(polls$pollster)[levels(polls$pollster)=="POS (R-Roberts)"]<- "Public Opinion Strategies (R)"
levels(polls$pollster)[levels(polls$pollster)=="StrategicVision (R)"]<- "Strategic Vision (R)"
levels(polls$pollster)[levels(polls$pollster)=="SurveyUSA/POS (R-Roberts)"]<- "Public Opinion Strategies (R)"
levels(polls$pollster)[levels(polls$pollster)=="POS (R-Chambliss)"]<- "Marist College"
polls$date <- as.numeric(format(as.Date(polls$begin,format="%m/%d/%Y"), format = "%j"))
polls$size <- as.numeric(levels(polls$size)[polls$size])
## drop anything before a certain cutoff
cutpoint <- as.Date("2008-07-01")
polls <- polls[as.Date(polls$begin,format="%m/%d/%Y") > cutpoint,]
polls <- polls[!is.na(polls$size),]
polls$state <- as.factor(as.character(polls$state))
##controversial: group all polls under 5
levels(polls$pollster)[which(table(polls$pollster) < 5)] <- "Less than 5"
days <- polls$date-min(polls$date)+1 ## the 1 here is to have a "0" period
T <- max(days)
pollster.contrast <- model.matrix(~pollster-1, data=polls)
y <- polls$obama/(polls$obama + polls$mccain)
polls$y <- y
polls$sigma.sq <- (polls$y * (1 - polls$y)) / polls$size
reg <- read.csv("regvoters.csv")
reg2 <- as.character(reg[,2])
reg2 <- as.numeric(gsub(",","",reg2))
names(reg2) <- reg[,1]
X <- read.csv("state-variables.csv")
rownames(X) <- X$state
X <- X[as.character(unique(polls$state)),]
X <- X[,c("gop2004","black.share","hisp.share")]
X <- rbind(NA, X)
X[1,] <- c(50.7, .135, .148)
rownames(X)[1] <- "US"
X <- as.matrix(X)
X[,"gop2004"] <- X[,"gop2004"]/100
state.vars <- X
X <- X[as.character(polls$state),]
X <- pollster.contrast
## create list for F.t
F <- diag(length(unique(polls$state)))
rownames(F) <- as.character(unique(polls$state))
colnames(F) <- as.character(unique(polls$state))
F.noco <- F
F.t <- list()
big <- list()
X <- list()
for (t in 1:T) {
big[[t]] <- polls[days == t, c("y", "sigma.sq", "state"), drop = FALSE]
X[[t]] <- pollster.contrast[days == t,, drop = FALSE]
Ft <- F[as.character(polls$state[days==t]),,drop=FALSE]
F.t[[t]] <- Ft
}
W <- diag(ncol(F.t[[1]]))
N <- ncol(F.t[[1]])
## first day priors
m.0 <- c(rep(.5, times = N))
C.0 <- diag(c(rep(.2^2, times = N)))
## house effects
b <- rep(0, times = ncol(pollster.contrast))
B <- diag(rep(1/(.1^2), times = ncol(pollster.contrast)))
## common covariance
sigma.a <- 0.1^2
## relative values of sigma.a and nu here give different weights to
## within versus across states. Their sum gives some idea of how much
## overall smoothing there will be.
## nugget
nu <- 0.1^2
# MH weights
Z <- state.vars[colnames(F.t[[1]]),]
Z <- t((t(Z)-colMeans(Z))/apply(Z,2,sd)) ##
phi <- rep(1, times = ncol(Z))
k <- 53
out <- polling.gibbs.kal.fixedS(y = y, dates = polls$date, size = polls$size, big = big,states = polls$state, F = F.t, Z = Z,
X = X, XL = pollster.contrast, b = b, B = B,
m.0 =m.0,C.0 = C.0,
starts = 0, phi = phi, sigma.a = sigma.a, k = k,
beta = rep(0, ncol(pollster.contrast)), nu = nu,
tune = tune, A = 5,
iters = 500, burnin=0,thin=1)
save(out, file = "polls-out-fixeddist.RData")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment