Skip to content

Instantly share code, notes, and snippets.

@khakieconomics
Created March 1, 2017 01:52
Show Gist options
  • Save khakieconomics/34dbffcd1341e3d3f9eef831c036cec5 to your computer and use it in GitHub Desktop.
Save khakieconomics/34dbffcd1341e3d3f9eef831c036cec5 to your computer and use it in GitHub Desktop.
Imputing values that add up to (known) totals when totals are known for all.
# load the Stan library and set it up to use all cores
library(rstan)
options(mc.cores = parallel::detectCores())
# Create some data
softmax <- function(x) exp(x)/sum(exp(x))
N <- 50
P <- 5
theta <- rnorm(P)
props <- softmax(theta)
scale <- 15
# Simulate some proportions
shares <- gtools::rdirichlet(N, props*scale)
# and some totals
totals <- rnorm(N, 1e4, 1000)
# Now create our matrix of accounting inputs that add up to the totals
contributions <- matrix(NA, N, P)
for(i in 1:P) {
contributions[,i] <- shares[,i] * totals
}
# Check we did everything right
all.equal(rowSums(contributions), totals, tol = 1e-6)
# Now make some missing
contributions_2 <- contributions
contributions_2[sample(1:N, 10),] <- NA
# Stan doesn't like NAs so let's convert them all to -9
contributions_2[is.na(contributions_2)] <- -9
# A little Stan model
mod <- "
data {
int N;
int P;
matrix[N, P] contributions;
vector[N] totals;
}
transformed data {
matrix[N, P] shares;
for(n in 1:N) {
if(contributions[n,1]==-9) {
shares[n] = rep_row_vector(-9, P);
} else {
shares[n] = contributions[n]/totals[n];
}
}
}
parameters {
vector[P] theta;
real<lower = 0> scale;
}
model {
theta ~ normal(0, 1);
scale ~ cauchy(0, 2);
for(n in 1:N) {
if(shares[n,1]>0) {
// of course you'd probably have some model for theta
shares[n]' ~ dirichlet(softmax(theta)*scale);
}
}
}
generated quantities {
matrix[N, P] out;
for(n in 1:N) {
if(contributions[n, 1]==-9) {
out[n] = (softmax(theta)*totals[n])';
} else {
out[n] = contributions[n];
}
}
}
"
# Compile the model (normally we'd put the model in a .stan file)
compiled_model <- stan_model(model_code = mod)
# Run MCMC to get samples from the posterior
est_model <- sampling(compiled_model, data = list(N = N, P = P, contributions = contributions_2, totals = totals))
# Get the means of your parameter estimates (since you can only use one value
# in your microsimulation model)
means <- get_posterior_mean(est_model)[,5]
# Extract just the output
outs <- means[grepl("out", names(means))]
out <- matrix(outs, N, P, byrow = T)
# Does is make sense?
all.equal(rowSums(out), totals, tol = 1e-6)
# plot estimate against known
plot(as.vector(out), as.vector(contributions))
# Check that the theta estimated well
theta_est <- means[grepl("theta", names(means))]
plot(theta_est, theta)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment