Skip to content

Instantly share code, notes, and snippets.

@dmontecino
Last active October 20, 2021 17:40
Show Gist options
  • Save dmontecino/b804853e4b36a57990a7108a35201cf5 to your computer and use it in GitHub Desktop.
Save dmontecino/b804853e4b36a57990a7108a35201cf5 to your computer and use it in GitHub Desktop.
STAN Imputation of a categorical covariate when non-observed and modeling of a binary outcome
### Categorical missing data in Stan ###
library(rstan)
N <- 10000
N_missing <- 100
K <- 3 # number of categories
x <- sample( seq(from=1,to=K), size=N, replace=TRUE ) # unordered cateogrical covariate
# Simulating bivariate respose as function of a categorcial variable
y <- rep(NA, N)
for (i in 1:N) {
if (x[i] == 1)
y[i] = rbinom(n = 1, size = 1 , prob = 0.7 )
else if (x[i] == 2)
y[i] = rbinom(n = 1, size = 1 , prob = 0.4 )
else # x = 3
y[i] = rbinom(n = 1, size = 1 , prob = 0.1 )
}
# Following McElreath (https://gist.github.com/rmcelreath/9406643583a8c99304e459e644762f82), simulate missing
i_miss <- sample( 1:N , size=N_missing )
x_obs <- x
x_obs[i_miss] <- (-1) # placeholder, Stan will not accept NA values
x_NA <- x_obs
x_NA[i_miss] <- NA
x_miss <- ifelse( 1:N %in% i_miss , 1 , 0 )
cov_for_imp_x<-NA
cov_for_imp_x[x_obs==1]=rbinom(length(cov_for_imp_x[x_obs==1]), size=1, prob=0.5)
cov_for_imp_x[x_obs==2]=rbinom(length(cov_for_imp_x[x_obs==2]), size=1, prob=0.5)
cov_for_imp_x[x_obs==3]=rbinom(length(cov_for_imp_x[x_obs==3]), size=1, prob=0.5)
cov_for_imp_x[is.na(cov_for_imp_x)]=rbinom(length(cov_for_imp_x[is.na(cov_for_imp_x)]), size=1, prob=0.5)
#covariate does not predict the reproductive season
x_cat_2=ifelse(x_obs==2,1,0)
x_cat_3=ifelse(x_obs==3,1,0)
cov_for_imp_x_for_miss_cat=cov_for_imp_x[x_miss==1]
stan_model <- "
data{
int N; //number of observations
int K; //number of categories
int y[N]; // binary outcome
int x_obs[N]; // categorical variable when observed
int x_miss[N]; // categorical variable when unobserved (index with 1's and zero's). 1 if unobserved
int cov_for_imp_x[N]; // a binary covariate for the prediction of x categorical variable
int x_cat_2[N]; // dummy variable when x has the second level
int x_cat_3[N]; // dummy variable when x has the first level
}
parameters{
real alpha; // coefficient of the binary outcome as a function of the categorical variable level 1 (dummy 1 only zeros)
real beta1; // coefficient of the binary outcome as a function of the categorical variable level 2 (dummy 2)
real beta2; // coefficient of the binary outcome as a function of the categorical variable level 3 (dummy 3)
vector[K] a_imp; // intercept for the imputation model
vector[K] b1_imp; // coefficent for the imputation model
}
model{
// priors
alpha ~ normal(0,1); // explained above
beta1 ~ normal(0,1); // explained above
beta2 ~ normal(0,1); // explained above
a_imp ~ normal(0,1); // explained above
b1_imp2 ~ normal(0,1);
//Data
for (i in 1:N) {
vector[K] p;
vector[K] theta;
p[2] = a_imp[2] + b1_imp[2]*cov_for_imp_x[i]; // modeling the prob 2 as a function of the covariate to model the category
p[3] = a_imp[3] + b1_imp[3]*cov_for_imp_x[i]; // modeling the prob 3 as a function of the covariate to model the category
// x not missing fit the binary variable as a function of the categories
if (x_miss[i] == 0) {
y[i] ~ bernoulli_logit(alpha+
beta1*x_cat_2[i]+
beta2*x_cat_3[i]);
theta[1] = 0;
theta[2] = p[2];
theta[3] = p[3];
x_obs[i] ~ categorical( softmax(theta));
}
// x missing model the category and posteiorly model the binary outcome as a function of the imputed category
else {
vector[n_cat] lp;
theta[1] = 0;
theta[2] = p[2];
theta[3] = p[3];
lp[1] = log_softmax(theta)[1] + bernoulli_logit_lpmf( y[i] | alpha + beta1);
lp[2] = log_softmax(theta)[2] + bernoulli_logit_lpmf( y[i] | alpha);
lp[3] = log_softmax(theta)[3] + bernoulli_logit_lpmf( y[i] | alpha + beta2);
target += log_sum_exp(lp);
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment