Last active
January 10, 2024 14:52
-
-
Save mihaiconstantin/1e28fda267438667cad8b71226576048 to your computer and use it in GitHub Desktop.
A step-by-step tutorial for fitting SEM models in base `R`.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load libraries. | |
library(lavaan) | |
# #region Helpers. | |
# These functions exist to make it more convenient for you to carry out the | |
# model specification without dealing with unintuitive `R` syntax and naming | |
# conventions. You can ignore this section and focus on the steps that follow | |
# (i.e., the actual model specification and estimation). | |
# Helper function to make matrices with quoted symbols. | |
model_matrix <- function(dimensions, ...) { | |
# Make model matrix | |
model_matrix <- bquote( | |
matrix( | |
.(substitute(c(...))), | |
nrow = .(dimensions[1]), | |
ncol = .(dimensions[2]), | |
byrow = TRUE | |
) | |
) | |
# Add class to model matrix. | |
class(model_matrix) <- "model_matrix" | |
# Return the model matrix. | |
return(model_matrix) | |
} | |
# Add print method for the model matrices. | |
print.model_matrix <- function(x) { | |
# Obtain the evaluated model matrix. | |
print(eval(x)) | |
# Remain silent. | |
invisible() | |
} | |
# Update the matrix based on parameter values. | |
update <- function(model_matrix, ...) { | |
# Evaluate the model matrix. | |
model_matrix_update <- eval(model_matrix, as.list(...)) | |
# Return the update. | |
return(model_matrix_update) | |
} | |
# Define trace function. | |
trace <- function(matrix) { | |
# Compute the trace. | |
trace_value <- sum(diag(matrix)) | |
# Return. | |
return(trace_value) | |
} | |
# Define matrix inverse. | |
inverse <- solve | |
# Define transpose. | |
transpose <- t | |
# #endregion | |
# #region Model formulation. | |
# Our model for for the vector `y` of random variables is: | |
# | |
# y = Λη + ε, | |
# | |
# where is further modeled as: | |
# | |
# η = (I - B)^-1 ζ, | |
# | |
# with assumptions: | |
# | |
# y ~ N(0, Σ) | |
# ζ ~ N(0, Ψ) | |
# ε ~ N(0, Θ) | |
# #endregion | |
# #region Two observed variables. | |
# Create a subset of the data. | |
data <- PoliticalDemocracy[, c("y1", "y2")] | |
# Determine sample size. | |
n <- nrow(data) | |
# Create the observed covariance matrix. | |
# | |
# Note. | |
# | |
# `R` computes by default the unbiased version of the covariance matrix (i.e., | |
# divided by `n - 1`). However, during the lecture we used the Maximum | |
# Likelihood (i.e., only divided by `n`). In general, `lavaan` uses the unbiased | |
# covariance matrix, while during our computations we use the Maximum Likelihood | |
# version. This distinction is only important in our case to make sure we can | |
# exactly replicate the output of `lavaan` (i.e., it applies the bias correction | |
# by default). | |
# | |
# Unbiased observed covariance matrix. | |
S_unbiased <- cov(data) | |
# Observed covariance matrix. | |
S <- ((n - 1) / n) * S_unbiased | |
# Let's specify a simple regression model in the SEM framework. | |
# | |
# Note. | |
# | |
# Recall slide 52 from lecture 2, where we discussed that observed variables get | |
# "upgraded" to latent variables in the SEM framework. | |
# Create factor loadings matrix `Λ`. | |
Λ <- model_matrix(dimensions = c(2, 2), | |
# η1 # η2 | |
1, 0, | |
0, 1 | |
) | |
# Create latent variables covariance matrix `Ψ`. | |
Ψ <- model_matrix(dimensions = c(2, 2), | |
# ζ1 # ζ2 | |
φ11, 0, | |
0, φ22 | |
) | |
# Create matrix of observed residuals `Θ`. | |
# | |
# Note. | |
# | |
# Since we "specialize" our observed variables to latent variables, we do not | |
# have a measurement model. Instead we use our latent variables as proxies to | |
# explain our observed variables perfectly. As a result, the residual covariance | |
# matrix `Θ` is filled with zeros. | |
Θ <- model_matrix(dimensions = c(2, 2), | |
# ε1 # ε2 | |
0, 0, | |
0, 0 | |
) | |
# Create matrix of structural paths `B`. | |
B <- model_matrix(dimensions = c(2, 2), | |
# η1 # η2 | |
0, 0, | |
β21, 0 | |
) | |
# Create the identity matrix `I`. | |
I <- model_matrix(dimensions = c(2, 2), | |
# η1 # η2 | |
1, 0, | |
0, 1 | |
) | |
# Create a list with starting values for the model parameters specified above. | |
parameters <- c( | |
φ11 = 0.5, | |
φ22 = 0.5, | |
β21 = 0.5 | |
) | |
# Let's take a look at the matrices we have so far. | |
# | |
# Note. | |
# | |
# For the matrices we specified "unknown" model parameters (i.e., the Greek | |
# letters). Therefore, we first need to update the matrices by replacing these | |
# letters with the numerical values in `parameters` vector. Later we will see | |
# that the computer (i.e., actually optimizer) will do this for us repeatedly | |
# until it finds the best set of values for the parameters in the `parameters` | |
# vector. | |
# Print `Λ`. | |
Λ | |
# Update `Ψ`. | |
update(Ψ, parameters) | |
# Print `Θ`. | |
Θ | |
# Update `B`. | |
update(B, parameters) | |
# Print `I`. | |
I | |
# Now, let's create a function to compute our model-implied covariance matrix | |
# `Σ` based on the beautiful full SEM framework formula (i.e., the long and | |
# scary looking one that we derived using covariance algebra). This function | |
# essentially processes the model matrices above according to the formula to | |
# produce the model-implied covariance matrix. | |
# | |
# Recall that `Σ` is a function of model parameters, thereby we also need to | |
# provide some starting values, or guesses for those parameters (i.e., using the | |
# `update` function as we did above). | |
# | |
# Note. | |
# | |
# In `R`, matrix multiplication uses the `%*%` operator, not the `*` one. | |
# Create function to compute `Σ`. | |
calculate_sigma <- function(Λ, Ψ, Θ, B, I, parameters) { | |
# Update model matrices parameters with given values. | |
Λ <- update(Λ, parameters) | |
Ψ <- update(Ψ, parameters) | |
Θ <- update(Θ, parameters) | |
B <- update(B, parameters) | |
I <- update(I, parameters) | |
# Calculate the model-implied covariance matrix `Σ`. | |
Σ <- Λ %*% inverse(I - B) %*% Ψ %*% transpose(inverse(I - B)) %*% transpose(Λ) + Θ | |
# Return `Σ`. | |
return(Σ) | |
} | |
# Calculate `Σ`. | |
Σ <- calculate_sigma(Λ, Ψ, Θ, B, I, parameters) | |
# Compare it to our observed covariance matrix `S`. | |
S | |
# It doesn't look like we are too close. Let's go ahead and write a function to | |
# compute the value of the fit function (i.e., `F_ML`) and quantify how far away | |
# are we from reproducing the observed covariance matrix `S` using our | |
# model-implied covariance matrix `Σ`. | |
# | |
# Note. | |
# - `det` stands for matrix determinant | |
# - `nrow(S)` gives us the number of variables `p` | |
# Calculate function to compute the fit function. | |
calculate_fit <- function(S, Σ) { | |
# Calculate fit function value. | |
fit_function_value <- trace(S %*% inverse(Σ)) - log(det(S %*% inverse(Σ))) - nrow(S) | |
# Return fit function value. | |
return(fit_function_value) | |
} | |
# Calculate the fit function value. | |
calculate_fit(S, Σ) | |
# It looks like our fit function is quite far above zero, indicating that our | |
# set of values for the parameters in the `parameters` vector are not that | |
# great. | |
# You can try to adjust the model parameters manually to improve the fit. | |
parameters <- c( | |
φ11 = 6.87, | |
φ22 = .5, | |
β21 = .80 | |
) | |
# Compute `Σ` given new parameter values. | |
Σ <- calculate_sigma(Λ, Ψ, Θ, B, I, parameters) | |
# Calculate updated fit. | |
calculate_fit(S, Σ) | |
# However, that is a next-to-impossible feat! So, we can instead pass some | |
# information to an optimizer to try values for us until it finds the optimal | |
# (i.e., best) ones using clever strategies. Specifically, we need to pass the | |
# following to the optimizer: | |
# - the model-implied covariance matrix `Σ` | |
# - some initial guesses for the model parameters we specified | |
# - the fit function definition that quantifies the discrepancy between the | |
# observed covariance matrix `S` and the model-implied covariance matrix `Σ` | |
# - essentially, the optimizer (e.g., `nlminb`) repeatedly tries different | |
# values for the model parameters and checks if the fit function value | |
# goes in the right direction (e.g., decreases) | |
# - and it continues this process until it thinks it found the best set of | |
# parameters that produce the smallest value for our fit function. | |
# | |
# Note. | |
# | |
# For convenience, we wrap the computation of the model-implied covariance | |
# matrix `Σ` and fit function into a single function that we call our | |
# "objective". | |
# Define the objective function for the optimizer. | |
objective_function <- function(parameters, S, Λ, Ψ, Θ, B, I) { | |
# Calculate `Σ`. | |
Σ <- calculate_sigma(Λ, Ψ, Θ, B, I, parameters) | |
# Calculate fit function value. | |
fit_value <- calculate_fit(S, Σ) | |
# Return the fit value. | |
return(fit_value) | |
} | |
# Test the objective function and see that we get the same values as above. | |
objective_function(parameters, S, Λ, Ψ, Θ, B, I) | |
# Finally, we can pass it to the optimizer. | |
# | |
# Note. | |
# | |
# We can either use `optim` or, even better, let's use the same one that | |
# `lavaan` uses, which is `nlminb`. | |
solution <- nlminb( | |
start = parameters, | |
objective = objective_function, | |
S = S, Λ = Λ, Ψ = Ψ, Θ = Θ, B = B, I = I | |
) | |
# Let's extract the optimal parameter values from the solution. | |
optimal_parameters <- solution$par | |
# Let's print the optimal parameter values rounded to three decimals. | |
round(optimal_parameters, 3) | |
# Now, let's check if what we did is correct. For this, we can `lavaan` as the | |
# "source of truth". You will see that both specifying the model and estimating | |
# values for the model parameters is drastically easier with `lavaan`... | |
# | |
# But, we don't do these things because it's easy, rather because they are fun! | |
# Specify the model using `lavaan` syntax. | |
model_syntax <- " | |
y2 ~ y1 | |
" | |
# Estimate values for the model parameters using `lavaan`. | |
fit <- lavaan::sem(model_syntax, data = data) | |
# Print the model fit summary. | |
summary(fit) | |
# Let's now print our optimal values. | |
round(optimal_parameters, 3) | |
# Looks like we obtained similar values! | |
# Note. | |
# | |
# For complicated models, the values may not be exactly the same. In reality, | |
# `lavaan` uses clever strategies for selecting starting values and provides the | |
# gradient to the optimizer to aid the parameter search. Instead, we are lazy | |
# and rely on numerical approximations of the gradient. La, la, la... | |
# Let's also compute the model-implied covariance matrix `Σ` and the value of | |
# the fit function using the best set of parameters we found. | |
# Compute `Σ` given the optimal parameter values. | |
Σ <- calculate_sigma(Λ, Ψ, Θ, B, I, optimal_parameters) | |
# Calculate updated fit. | |
calculate_fit(S, Σ) | |
# We can see that the fit function is essentially zero! But why is the fit | |
# function zero? | |
# | |
# Because we have a saturated model and we can reproduce the observed covariance | |
# matrix `S` perfectly. | |
# Congratulations, you just learned the key elements behind fitting SEM models! | |
# You should really be proud of yourself at at this point. I am serious! | |
# #endregion | |
# #region A CFA model with two latent variables. | |
# Are you up for a challenge? See if you can extend the code above to two | |
# exogenous latent variables with three indicators each (i.e., as seen on slide | |
# 58 in lecture two). | |
# Tip. You don't need to update the helper or calculation functions. They are | |
# designed to be general. Instead, you need to update the data and the model | |
# matrices by specifying the correct parameters for which you want to estimate | |
# values | |
# Extend the data. | |
data <- PoliticalDemocracy[, c("y1", "y2", "y3", "x1", "x2", "x3")] | |
# Determine sample size. | |
n <- nrow(data) | |
# Create unbiased observed covariance matrix. | |
S_unbiased <- cov(data) | |
# Observed covariance matrix. | |
S <- ((n - 1) / n) * S_unbiased | |
# Create factor loadings matrix `Λ`. | |
Λ <- model_matrix(dimensions = c(6, 2), | |
# η1 # η1 | |
1, 0, | |
λ21, 0, | |
λ31, 0, | |
0, 1, | |
0, λ52, | |
0, λ62 | |
) | |
# Note. | |
# | |
# A covariance matrix is symmetrical, and that also includes the covariance | |
# matrix `Ψ` for the latent variables `η`. Therefore, we need to use the same | |
# "symbol" (i.e., Greek letter) for the parameters that are the same. Otherwise, | |
# the optimizer will try to find values for parameters that it shouldn't. In | |
# this case, `φ21` and `φ12` should have the same value. For this reason, we can | |
# indicate in the model matrix that `φ12` is the same as `φ21`. | |
# | |
# This is a bit confusing, but this is exactly the reason, and many others, why | |
# `lavaan` is intuitive to use. These things have been well thought through... | |
# But, since this an educational exercise, let's not mind the small | |
# inconsistency. | |
# Create latent variables covariance matrix `Ψ`. | |
Ψ <- model_matrix(dimensions = c(2, 2), | |
# ζ1 # ζ2 | |
φ11, φ21, | |
φ21, φ22 | |
) | |
# Create matrix of observed residuals `Θ`. | |
Θ <- model_matrix(dimensions = c(6, 6), | |
# ε1 # ε2 # ε2 # ε4 # ε5 # ε6 | |
θ11, 0, 0, 0, 0, 0, | |
0, θ22, 0, 0, 0, 0, | |
0, 0, θ33, 0, 0, 0, | |
0, 0, 0, θ44, 0, 0, | |
0, 0, 0, 0, θ55, 0, | |
0, 0, 0, 0, 0, θ66 | |
) | |
# Create matrix of structural paths `B`. | |
B <- model_matrix(dimensions = c(2, 2), | |
# η1 # η2 | |
0, 0, | |
0, 0 | |
) | |
# Create the identity matrix `I`. | |
I <- model_matrix(dimensions = c(2, 2), | |
# η1 # η2 | |
1, 0, | |
0, 1 | |
) | |
# Create a list with starting values for the model parameters specified above. | |
parameters <- c( | |
# Loadings. | |
λ21 = 0.5, | |
λ31 = 0.5, | |
λ52 = 0.5, | |
λ62 = 0.5, | |
# Latent variances. | |
φ11 = 0.5, | |
φ22 = 0.5, | |
φ21 = 0.5, | |
# Residual variances. | |
θ11 = 0.5, | |
θ22 = 0.5, | |
θ33 = 0.5, | |
θ44 = 0.5, | |
θ55 = 0.5, | |
θ66 = 0.5 | |
) | |
# Fit the model using `lavaan`. | |
# Specify the `lavaan` model syntax. | |
model_syntax <- " | |
# Measurement model. | |
η1 =~ y1 + y2 + y3 | |
η2 =~ x1 + x2 + x3 | |
# Covariance between exogenous latent variables. | |
η2 ~~ η1 | |
" | |
# Estimate values for the model parameters using `lavaan`. | |
fit <- lavaan::sem(model_syntax, data = data) | |
# Print the model fit summary. | |
summary(fit) | |
# Fit the model using our knowledge of the SEM framework! | |
# Again, we use the same optimizer as `lavaan`. | |
solution <- nlminb( | |
start = parameters, | |
objective = objective_function, | |
S = S, Λ = Λ, Ψ = Ψ, Θ = Θ, B = B, I = I | |
) | |
# Let's extract the optimal parameter values from the solution. | |
optimal_parameters <- solution$par | |
# Let's print the optimal parameter values rounded to three decimals. | |
round(optimal_parameters, 3) | |
# Now also print the `lavaan` fit for comparison. | |
summary(fit) | |
# We see that, again, we are pretty close! | |
# #endregion | |
# #region Two latent variables and a regression path. | |
# What we just did was a CFA model. But, in this course, we are doing SEM, so | |
# let's go ahead and estimate a structural path (i.e., a regression) between the | |
# latent variables. | |
# | |
# Update the code above such that `η2` is an endogenous variable predicted by | |
# the exogenous `η1`. | |
# Update latent variables covariance matrix `Ψ`. | |
Ψ <- model_matrix(dimensions = c(2, 2), | |
# ζ1 # ζ2 | |
φ11, 0, | |
0, φ22 | |
) | |
# Update matrix of structural paths `B`. | |
B <- model_matrix(dimensions = c(2, 2), | |
# η1 # η2 | |
0, 0, | |
β21, 0 | |
) | |
# Guess some starting values for the optimizer. | |
parameters <- c( | |
# Loadings. | |
λ21 = 0.5, | |
λ31 = 0.5, | |
λ52 = 0.5, | |
λ62 = 0.5, | |
# Latent variances. | |
φ11 = 0.5, | |
φ22 = 0.5, | |
# Regression paths. | |
β21 = 0.5, | |
# Residual variances. | |
θ11 = 0.5, | |
θ22 = 0.5, | |
θ33 = 0.5, | |
θ44 = 0.5, | |
θ55 = 0.5, | |
θ66 = 0.5 | |
) | |
# Again, we use the same optimizer as `lavaan`. | |
solution <- nlminb( | |
start = parameters, | |
objective = objective_function, | |
S = S, Λ = Λ, Ψ = Ψ, Θ = Θ, B = B, I = I | |
) | |
# Let's extract the optimal parameter values from the solution. | |
optimal_parameters <- solution$par | |
# Let's print the optimal parameter values rounded to three decimals. | |
round(optimal_parameters, 3) | |
# Now, let's verify our answer using `lavaan`. | |
# Specify the `lavaan` model syntax. | |
model_syntax <- " | |
# Measurement model. | |
η1 =~ y1 + y2 + y3 | |
η2 =~ x1 + x2 + x3 | |
# Regression path. | |
η2 ~ η1 | |
" | |
# Estimate values for the model parameters using `lavaan`. | |
fit <- lavaan::sem(model_syntax, data = data) | |
# Print the model fit summary. | |
summary(fit) | |
# Print our solution for comparison. | |
round(optimal_parameters, 3) | |
# #endregion | |
# #region Some model fit calculations. | |
# We can also compare our solution to `lavaan`` the as follows. | |
lavInspect(fit, what = "est") | |
list( | |
lambda = update(Λ, optimal_parameters), | |
theta = update(Θ, optimal_parameters), | |
psi = update(Ψ, optimal_parameters), | |
beta = update(B, optimal_parameters) | |
) | |
# Compute `Σ` given the optimal parameter values. | |
Σ <- calculate_sigma(Λ, Ψ, Θ, B, I, optimal_parameters) | |
# Calculate updated fit. | |
F_ML <- calculate_fit(S, Σ) | |
# Compare model-implied covariances. | |
Σ | |
lavInspect(fit, what = "sigma") | |
# Extract fit measures from `lavaan`. | |
fitMeasures(fit)[c("fmin", "chisq")] | |
# Calculate `fmin` using our `F_ML` value. | |
F_ML / 2 | |
# Calculate the test statistic `T`, i.e., the χ^2 test statistic. | |
# Determine sample size. | |
n <- nrow(data) | |
T <- n * F_ML | |
T | |
# Extract degrees of freedom from `lavaan`. | |
fitMeasures(fit)[c("df")] | |
# Calculate our own degrees of freedom. We start with determining how many | |
# unique elements we have in the observed covariance matrix `S`. | |
# | |
# Define the number of variables. | |
p <- nrow(S) | |
# | |
# Define the number of unique elements. | |
k <- (p * (p + 1)) / 2 | |
# | |
# Define number of free model parameters. | |
q <- length(parameters) | |
# | |
# Calculate degrees of freedom. | |
DF <- k - q | |
DF | |
# P value for chi square. | |
pchisq(T, DF, lower.tail = FALSE) | |
# From `lavaan`. | |
fitMeasures(fit)[c("chisq", "pvalue")] | |
# Extract `RMSEA` from `lavaan`. | |
fitMeasures(fit)[c("rmsea")] | |
# Calculate our own `RMSEA`. | |
sqrt((T - DF) / n * DF) | |
# If you reached this point... pat yourself on the back. | |
# What you did is remarkable! Well done! | |
# #endregion |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment