@mike-lawrence
Created March 21, 2017 15:10
Demo using Stan for Confirmatory Factor Analysis
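# cfa.stan -- the Stan model; the R script below assumes it is saved under
# this file name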
data{
# n_subj: number of subjects
int n_subj ;
# n_y: number of outcomes
int n_y ;
# y: matrix of outcomes
matrix[n_subj,n_y] y ;
# n_fac: number of latent factors
int n_fac ;
# y_fac: list of which factor is associated with each outcome
int<lower=1,upper=n_fac> y_fac[n_y] ;
}
transformed data{
# scaled_y: z-scaled outcomes
matrix[n_subj,n_y] scaled_y ;
for(i in 1:n_y){
scaled_y[,i] = (y[,i]-mean(y[,i])) / sd(y[,i]) ;
}
}
parameters{
# normal01: a helper variable for more efficient non-centered multivariate
# sampling of subj_facs
matrix[n_fac, n_subj] normal01;
# fac_cor_helper: correlations (on the Cholesky factor scale) amongst latent
# factors
cholesky_factor_corr[n_fac] fac_cor_helper ;
# scaled_y_means: means for each outcome
vector[n_y] scaled_y_means ;
# scaled_y_noise: measurement noise for each outcome
vector<lower=0>[n_y] scaled_y_noise ;
# betas: how each factor loads on to each outcome.
# Must be positive for identifiability.
vector<lower=0>[n_y] betas ;
}
transformed parameters{
# subj_facs: matrix of values for each factor associated with each subject
matrix[n_subj,n_fac] subj_facs ;
# the following calculation implies that rows of subj_facs are sampled from
# a multivariate normal distribution with mean of 0 and sd of 1 in all
# dimensions and a correlation matrix of fac_cor.
subj_facs = transpose(fac_cor_helper * normal01) ;
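# (added note, not part of the original model: since normal01 has iid
# normal(0,1) entries and fac_cor_helper is the Cholesky factor L of the
# factor correlation matrix, each column of L * normal01 has covariance
# L * L' = fac_cor with unit variances; transposing yields one such draw
# per row, i.e. per subject.)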
}
model{
# normal01 must have normal(0,1) prior for "non-centered" parameterization
# on the multivariate distribution of latent factors
to_vector(normal01) ~ normal(0,1) ;
# flat prior on correlations
fac_cor_helper ~ lkj_corr_cholesky(1) ;
# normal prior on y means
scaled_y_means ~ normal(0,1) ;
# weibull prior on measurement noise
scaled_y_noise ~ weibull(2,1) ;
# normal(0,1) priors on betas
betas ~ normal(0,1) ;
# model each outcome as normal, influenced by its associated latent factor
for(i in 1:n_y){
scaled_y[,i] ~ normal(
scaled_y_means[i] + subj_facs[,y_fac[i]] * betas[i]
, scaled_y_noise[i]
) ;
}
}
generated quantities{
# fac_cor: correlations amongst the latent factors
corr_matrix[n_fac] fac_cor ;
# y_means: outcome means on the original scale of Y
vector[n_y] y_means ;
# y_noise: outcome noise on the original scale of Y
vector[n_y] y_noise ;
fac_cor = multiply_lower_tri_self_transpose(fac_cor_helper) ;
for(i in 1:n_y){
y_means[i] = scaled_y_means[i]*sd(y[,i]) + mean(y[,i]) ;
y_noise[i] = scaled_y_noise[i]*sd(y[,i]) ;
}
}
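
# --- R script: fits the model above, which is assumed to be saved as
# --- 'cfa.stan' in the working directory (matching the file argument below)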
library(tidyverse)
library(rstan)
rstan_options(auto_write = TRUE)
# get some data from the lavaan package
# install.packages('lavaan')
library(lavaan)
data(HolzingerSwineford1939)
# a visual factor measured by 3 variables: x1, x2 and x3
# a textual factor measured by 3 variables: x4, x5 and x6
# a speed factor measured by 3 variables: x7, x8 and x9
# subset the data down to the nine outcome columns
HolzingerSwineford1939 %>%
dplyr::select(
x1,x2,x3,x4,x5,x6,x7,x8,x9
) -> dat
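# the Stan model assumes complete data, so confirm there are no missing values
# (an added check, not part of the original gist)
stopifnot(!anyNA(dat))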
# prep the list of data to pass to Stan
data_for_stan = list(
# n_y: number of outcomes
n_y = ncol(dat)
# n_subj: number of subjects
, n_subj = nrow(dat)
# y: matrix of outcomes
, y = as.matrix(dat)
# n_fac: number of latent factors
, n_fac = 3
# y_fac: list of which factor is associated with each outcome
, y_fac = c(1,1,1,2,2,2,3,3,3)
)
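# quick consistency checks on the data list before compiling
# (added for illustration; not part of the original workflow)
stopifnot(
	data_for_stan$n_y == length(data_for_stan$y_fac)
	, data_for_stan$n_subj == nrow(data_for_stan$y)
	, all(data_for_stan$y_fac %in% 1:data_for_stan$n_fac)
)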
# Sample the model
post = rstan::stan(
file = 'cfa.stan'
, data = data_for_stan
, seed = 1
, cores = 4
, chains = 4
, iter = 2e3
, pars = c('normal01','fac_cor_helper')
, include = FALSE
)
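# basic convergence checks before interpreting the posterior (an added sketch;
# check_hmc_diagnostics() ships with recent versions of rstan)
rstan::check_hmc_diagnostics(post)
# flag any quantities whose split R-hat is noticeably above 1
fit_summary = summary(post)$summary
print(fit_summary[fit_summary[,'Rhat'] > 1.01, , drop = FALSE])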
# betas: magnitude of the relationship between each outcome & its associated
# latent factor
print(
post
, pars = 'betas'
, probs = c(.025,.975)
, digits = 2
, use_cache = FALSE
)
# fac_cor: correlations amongst latent factors
print(
post
, pars = 'fac_cor'
, probs = c(.025,.975)
, digits = 2
, use_cache = FALSE
)
# y_means: outcome means
print(
post
, pars = 'y_means'
, probs = c(.025,.975)
, digits = 2
, use_cache = FALSE
)
# y_noise: outcome measurement noise on the original scale of Y
print(
post
, pars = 'y_noise'
, probs = c(.025,.975)
, digits = 2
, use_cache = FALSE
)
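# optional cross-check, not part of the original gist: fit the same 3-factor
# CFA with lavaan and compare its standardized loadings against the posterior
# means of betas above (both approaches fix the latent variances to 1)
lavaan_model = '
	visual  =~ x1 + x2 + x3
	textual =~ x4 + x5 + x6
	speed   =~ x7 + x8 + x9
'
lavaan_fit = lavaan::cfa(lavaan_model, data = HolzingerSwineford1939, std.lv = TRUE)
summary(lavaan_fit, standardized = TRUE)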