dmontecino/gist:b804853e4b36a57990a7108a35201cf5

## gistfile1.txt
### Categorical missing data in Stan ###
library(rstan)


N <- 10000
N_missing <- 100
K <- 3 # number of categories
x <- sample( seq(from=1,to=K), size=N, replace=TRUE ) # unordered cateogrical covariate

# Simulating bivariate respose as function of a categorcial variable
y <- rep(NA, N)
for (i in 1:N) {
  if (x[i] == 1)
    y[i] = rbinom(n = 1, size = 1 , prob = 0.7 )

  else if (x[i] == 2)
    y[i] = rbinom(n = 1, size = 1 , prob = 0.4 )

  else # x = 3
    y[i] = rbinom(n = 1, size = 1 , prob = 0.1 )
}

# Following McElreath (https://gist.github.com/rmcelreath/9406643583a8c99304e459e644762f82), simulate missing
i_miss <- sample( 1:N , size=N_missing )
x_obs <- x
x_obs[i_miss] <- (-1) # placeholder, Stan will not accept NA values
x_NA <- x_obs
x_NA[i_miss] <- NA
x_miss <- ifelse( 1:N %in% i_miss , 1 , 0 )
cov_for_imp_x<-NA
cov_for_imp_x[x_obs==1]=rbinom(length(cov_for_imp_x[x_obs==1]), size=1, prob=0.5)
cov_for_imp_x[x_obs==2]=rbinom(length(cov_for_imp_x[x_obs==2]), size=1, prob=0.5)
cov_for_imp_x[x_obs==3]=rbinom(length(cov_for_imp_x[x_obs==3]), size=1, prob=0.5)
cov_for_imp_x[is.na(cov_for_imp_x)]=rbinom(length(cov_for_imp_x[is.na(cov_for_imp_x)]), size=1, prob=0.5)
#covariate does not predict the reproductive season
x_cat_2=ifelse(x_obs==2,1,0)
x_cat_3=ifelse(x_obs==3,1,0)
cov_for_imp_x_for_miss_cat=cov_for_imp_x[x_miss==1]

stan_model <- "
data{
int N; //number of observations
int K; //number of categories
int y[N]; // binary outcome
int x_obs[N]; // categorical variable when observed
int x_miss[N]; // categorical variable when unobserved (index with 1's and zero's). 1 if unobserved
int cov_for_imp_x[N];  // a binary covariate for the prediction of x categorical variable
int x_cat_2[N]; // dummy variable when x has the second level
int x_cat_3[N]; // dummy variable when x has the first level
}

parameters{
real alpha; // coefficient of the binary outcome as a function of the categorical variable level 1 (dummy 1 only zeros)
real beta1; // coefficient of the binary outcome as a function of the categorical variable level 2 (dummy 2)
real beta2; // coefficient of the binary outcome as a function of the categorical variable level 3 (dummy 3)
vector[K] a_imp; // intercept for the imputation model
vector[K] b1_imp; // coefficent for the imputation model
}


model{
// priors
alpha ~ normal(0,1); // explained above
beta1 ~ normal(0,1); // explained above
beta2 ~ normal(0,1); // explained above
a_imp ~ normal(0,1); // explained above
b1_imp2 ~ normal(0,1);

//Data
for (i in 1:N) {
vector[K] p;
vector[K] theta;

p[2] = a_imp[2] + b1_imp[2]*cov_for_imp_x[i]; // modeling the prob 2 as a function of the covariate to model the category
p[3] = a_imp[3] + b1_imp[3]*cov_for_imp_x[i]; // modeling the prob 3 as a function of the covariate to model the category


// x not missing fit the binary variable as a function of the categories
if (x_miss[i] == 0) {
y[i] ~ bernoulli_logit(alpha+
       beta1*x_cat_2[i]+
       beta2*x_cat_3[i]);

theta[1] = 0;
theta[2] = p[2];
theta[3] = p[3];
x_obs[i] ~ categorical( softmax(theta));

}

// x missing model the category and posteiorly model the binary outcome as a function of the imputed category

else {
vector[n_cat] lp;

theta[1] = 0;
theta[2] = p[2];
theta[3] = p[3];

lp[1] = log_softmax(theta)[1] + bernoulli_logit_lpmf( y[i] | alpha  + beta1);
lp[2] = log_softmax(theta)[2] + bernoulli_logit_lpmf( y[i] | alpha);
lp[3] = log_softmax(theta)[3] + bernoulli_logit_lpmf( y[i] | alpha  + beta2);

target += log_sum_exp(lp);

}
}
}
	### Categorical missing data in Stan ###
	library(rstan)


	N <- 10000
	N_missing <- 100
	K <- 3 # number of categories
	x <- sample( seq(from=1,to=K), size=N, replace=TRUE ) # unordered cateogrical covariate

	# Simulating bivariate respose as function of a categorcial variable
	y <- rep(NA, N)
	for (i in 1:N) {
	if (x[i] == 1)
	y[i] = rbinom(n = 1, size = 1 , prob = 0.7 )

	else if (x[i] == 2)
	y[i] = rbinom(n = 1, size = 1 , prob = 0.4 )

	else # x = 3
	y[i] = rbinom(n = 1, size = 1 , prob = 0.1 )
	}

	# Following McElreath (https://gist.github.com/rmcelreath/9406643583a8c99304e459e644762f82), simulate missing
	i_miss <- sample( 1:N , size=N_missing )
	x_obs <- x
	x_obs[i_miss] <- (-1) # placeholder, Stan will not accept NA values
	x_NA <- x_obs
	x_NA[i_miss] <- NA
	x_miss <- ifelse( 1:N %in% i_miss , 1 , 0 )
	cov_for_imp_x<-NA
	cov_for_imp_x[x_obs==1]=rbinom(length(cov_for_imp_x[x_obs==1]), size=1, prob=0.5)
	cov_for_imp_x[x_obs==2]=rbinom(length(cov_for_imp_x[x_obs==2]), size=1, prob=0.5)
	cov_for_imp_x[x_obs==3]=rbinom(length(cov_for_imp_x[x_obs==3]), size=1, prob=0.5)
	cov_for_imp_x[is.na(cov_for_imp_x)]=rbinom(length(cov_for_imp_x[is.na(cov_for_imp_x)]), size=1, prob=0.5)
	#covariate does not predict the reproductive season
	x_cat_2=ifelse(x_obs==2,1,0)
	x_cat_3=ifelse(x_obs==3,1,0)
	cov_for_imp_x_for_miss_cat=cov_for_imp_x[x_miss==1]

	stan_model <- "
	data{
	int N; //number of observations
	int K; //number of categories
	int y[N]; // binary outcome
	int x_obs[N]; // categorical variable when observed
	int x_miss[N]; // categorical variable when unobserved (index with 1's and zero's). 1 if unobserved
	int cov_for_imp_x[N]; // a binary covariate for the prediction of x categorical variable
	int x_cat_2[N]; // dummy variable when x has the second level
	int x_cat_3[N]; // dummy variable when x has the first level
	}

	parameters{
	real alpha; // coefficient of the binary outcome as a function of the categorical variable level 1 (dummy 1 only zeros)
	real beta1; // coefficient of the binary outcome as a function of the categorical variable level 2 (dummy 2)
	real beta2; // coefficient of the binary outcome as a function of the categorical variable level 3 (dummy 3)
	vector[K] a_imp; // intercept for the imputation model
	vector[K] b1_imp; // coefficent for the imputation model
	}


	model{
	// priors
	alpha ~ normal(0,1); // explained above
	beta1 ~ normal(0,1); // explained above
	beta2 ~ normal(0,1); // explained above
	a_imp ~ normal(0,1); // explained above
	b1_imp2 ~ normal(0,1);

	//Data
	for (i in 1:N) {
	vector[K] p;
	vector[K] theta;

	p[2] = a_imp[2] + b1_imp[2]*cov_for_imp_x[i]; // modeling the prob 2 as a function of the covariate to model the category
	p[3] = a_imp[3] + b1_imp[3]*cov_for_imp_x[i]; // modeling the prob 3 as a function of the covariate to model the category


	// x not missing fit the binary variable as a function of the categories
	if (x_miss[i] == 0) {
	y[i] ~ bernoulli_logit(alpha+
	beta1*x_cat_2[i]+
	beta2*x_cat_3[i]);

	theta[1] = 0;
	theta[2] = p[2];
	theta[3] = p[3];
	x_obs[i] ~ categorical( softmax(theta));

	}

	// x missing model the category and posteiorly model the binary outcome as a function of the imputed category

	else {
	vector[n_cat] lp;

	theta[1] = 0;
	theta[2] = p[2];
	theta[3] = p[3];

	lp[1] = log_softmax(theta)[1] + bernoulli_logit_lpmf( y[i] \| alpha + beta1);
	lp[2] = log_softmax(theta)[2] + bernoulli_logit_lpmf( y[i] \| alpha);
	lp[3] = log_softmax(theta)[3] + bernoulli_logit_lpmf( y[i] \| alpha + beta2);

	target += log_sum_exp(lp);

	}
	}
	}