khakieconomics/adding_up_imputation.R

## adding_up_imputation.R
# load the Stan library and set it up to use all cores
library(rstan)
options(mc.cores = parallel::detectCores())

# Create some data

softmax <- function(x) exp(x)/sum(exp(x))
N <- 50
P <- 5
theta <- rnorm(P)

props <- softmax(theta)

scale <- 15

# Simulate some proportions
shares <- gtools::rdirichlet(N, props*scale)

# and some totals
totals <- rnorm(N, 1e4, 1000)

# Now create our matrix of accounting inputs that add up to the totals
contributions <- matrix(NA, N, P)
for(i in 1:P) {
  contributions[,i] <- shares[,i] * totals
}

# Check we did everything right
all.equal(rowSums(contributions), totals, tol = 1e-6)

# Now make some missing

contributions_2 <- contributions

contributions_2[sample(1:N, 10),] <- NA

# Stan doesn't like NAs so let's convert them all to -9
contributions_2[is.na(contributions_2)] <- -9

# A little Stan model

mod <- "
data {
  int N;
  int P;
  matrix[N, P] contributions;
  vector[N] totals;
}
transformed data {
  matrix[N, P] shares;
  for(n in 1:N) {
    if(contributions[n,1]==-9) {
      shares[n] = rep_row_vector(-9, P);
    } else {
      shares[n] = contributions[n]/totals[n];
    }
  }
}
parameters {
  vector[P] theta;
  real<lower = 0> scale;
}
model {
  theta ~ normal(0, 1);
  scale ~ cauchy(0, 2);

  for(n in 1:N) {
    if(shares[n,1]>0) {
      // of course you'd probably have some model for theta
      shares[n]' ~ dirichlet(softmax(theta)*scale);
    }
  }
}
generated quantities {
  matrix[N, P] out;
  for(n in 1:N) {
    if(contributions[n, 1]==-9) {
      out[n] = (softmax(theta)*totals[n])';
    } else {
      out[n] = contributions[n];
    }

  }
}
"

# Compile the model (normally we'd put the model in a .stan file)
compiled_model <- stan_model(model_code = mod)

# Run MCMC to get samples from the posterior
est_model <- sampling(compiled_model, data = list(N = N, P = P, contributions = contributions_2, totals = totals))

# Get the means of your parameter estimates (since you can only use one value
# in your microsimulation model)
means <- get_posterior_mean(est_model)[,5]

# Extract just the output
outs <- means[grepl("out", names(means))]

out <- matrix(outs, N, P, byrow = T)
# Does is make sense?
all.equal(rowSums(out), totals, tol = 1e-6)
# plot estimate against known
plot(as.vector(out), as.vector(contributions))


# Check that the theta estimated well
theta_est <- means[grepl("theta", names(means))]
plot(theta_est, theta)
	# load the Stan library and set it up to use all cores
	library(rstan)
	options(mc.cores = parallel::detectCores())

	# Create some data

	softmax <- function(x) exp(x)/sum(exp(x))
	N <- 50
	P <- 5
	theta <- rnorm(P)

	props <- softmax(theta)

	scale <- 15

	# Simulate some proportions
	shares <- gtools::rdirichlet(N, props*scale)

	# and some totals
	totals <- rnorm(N, 1e4, 1000)

	# Now create our matrix of accounting inputs that add up to the totals
	contributions <- matrix(NA, N, P)
	for(i in 1:P) {
	contributions[,i] <- shares[,i] * totals
	}

	# Check we did everything right
	all.equal(rowSums(contributions), totals, tol = 1e-6)

	# Now make some missing

	contributions_2 <- contributions

	contributions_2[sample(1:N, 10),] <- NA

	# Stan doesn't like NAs so let's convert them all to -9
	contributions_2[is.na(contributions_2)] <- -9

	# A little Stan model

	mod <- "
	data {
	int N;
	int P;
	matrix[N, P] contributions;
	vector[N] totals;
	}
	transformed data {
	matrix[N, P] shares;
	for(n in 1:N) {
	if(contributions[n,1]==-9) {
	shares[n] = rep_row_vector(-9, P);
	} else {
	shares[n] = contributions[n]/totals[n];
	}
	}
	}
	parameters {
	vector[P] theta;
	real<lower = 0> scale;
	}
	model {
	theta ~ normal(0, 1);
	scale ~ cauchy(0, 2);

	for(n in 1:N) {
	if(shares[n,1]>0) {
	// of course you'd probably have some model for theta
	shares[n]' ~ dirichlet(softmax(theta)*scale);
	}
	}
	}
	generated quantities {
	matrix[N, P] out;
	for(n in 1:N) {
	if(contributions[n, 1]==-9) {
	out[n] = (softmax(theta)*totals[n])';
	} else {
	out[n] = contributions[n];
	}

	}
	}
	"

	# Compile the model (normally we'd put the model in a .stan file)
	compiled_model <- stan_model(model_code = mod)

	# Run MCMC to get samples from the posterior
	est_model <- sampling(compiled_model, data = list(N = N, P = P, contributions = contributions_2, totals = totals))

	# Get the means of your parameter estimates (since you can only use one value
	# in your microsimulation model)
	means <- get_posterior_mean(est_model)[,5]

	# Extract just the output
	outs <- means[grepl("out", names(means))]

	out <- matrix(outs, N, P, byrow = T)
	# Does is make sense?
	all.equal(rowSums(out), totals, tol = 1e-6)
	# plot estimate against known
	plot(as.vector(out), as.vector(contributions))


	# Check that the theta estimated well
	theta_est <- means[grepl("theta", names(means))]
	plot(theta_est, theta)