Skip to content

Instantly share code, notes, and snippets.

@mihaiconstantin
Last active January 10, 2024 14:52
Show Gist options
  • Save mihaiconstantin/1e28fda267438667cad8b71226576048 to your computer and use it in GitHub Desktop.
Save mihaiconstantin/1e28fda267438667cad8b71226576048 to your computer and use it in GitHub Desktop.
A step-by-step tutorial for fitting SEM models in base `R`.
# Load libraries.
library(lavaan)
# #region Helpers.
# These functions exist to make it more convenient for you to carry out the
# model specification without dealing with unintuitive `R` syntax and naming
# conventions. You can ignore this section and focus on the steps that follow
# (i.e., the actual model specification and estimation).
# Helper function to make matrices with quoted symbols.
model_matrix <- function(dimensions, ...) {
# Make model matrix
model_matrix <- bquote(
matrix(
.(substitute(c(...))),
nrow = .(dimensions[1]),
ncol = .(dimensions[2]),
byrow = TRUE
)
)
# Add class to model matrix.
class(model_matrix) <- "model_matrix"
# Return the model matrix.
return(model_matrix)
}
# Add print method for the model matrices.
print.model_matrix <- function(x) {
# Obtain the evaluated model matrix.
print(eval(x))
# Remain silent.
invisible()
}
# Update the matrix based on parameter values.
update <- function(model_matrix, ...) {
# Evaluate the model matrix.
model_matrix_update <- eval(model_matrix, as.list(...))
# Return the update.
return(model_matrix_update)
}
# Define trace function.
trace <- function(matrix) {
# Compute the trace.
trace_value <- sum(diag(matrix))
# Return.
return(trace_value)
}
# Define matrix inverse.
inverse <- solve
# Define transpose.
transpose <- t
# #endregion
# #region Model formulation.
# Our model for for the vector `y` of random variables is:
#
# y = Λη + ε,
#
# where is further modeled as:
#
# η = (I - B)^-1 ζ,
#
# with assumptions:
#
# y ~ N(0, Σ)
# ζ ~ N(0, Ψ)
# ε ~ N(0, Θ)
# #endregion
# #region Two observed variables.
# Create a subset of the data.
data <- PoliticalDemocracy[, c("y1", "y2")]
# Determine sample size.
n <- nrow(data)
# Create the observed covariance matrix.
#
# Note.
#
# `R` computes by default the unbiased version of the covariance matrix (i.e.,
# divided by `n - 1`). However, during the lecture we used the Maximum
# Likelihood (i.e., only divided by `n`). In general, `lavaan` uses the unbiased
# covariance matrix, while during our computations we use the Maximum Likelihood
# version. This distinction is only important in our case to make sure we can
# exactly replicate the output of `lavaan` (i.e., it applies the bias correction
# by default).
#
# Unbiased observed covariance matrix.
S_unbiased <- cov(data)
# Observed covariance matrix.
S <- ((n - 1) / n) * S_unbiased
# Let's specify a simple regression model in the SEM framework.
#
# Note.
#
# Recall slide 52 from lecture 2, where we discussed that observed variables get
# "upgraded" to latent variables in the SEM framework.
# Create factor loadings matrix `Λ`.
Λ <- model_matrix(dimensions = c(2, 2),
# η1 # η2
1, 0,
0, 1
)
# Create latent variables covariance matrix `Ψ`.
Ψ <- model_matrix(dimensions = c(2, 2),
# ζ1 # ζ2
φ11, 0,
0, φ22
)
# Create matrix of observed residuals `Θ`.
#
# Note.
#
# Since we "specialize" our observed variables to latent variables, we do not
# have a measurement model. Instead we use our latent variables as proxies to
# explain our observed variables perfectly. As a result, the residual covariance
# matrix `Θ` is filled with zeros.
Θ <- model_matrix(dimensions = c(2, 2),
# ε1 # ε2
0, 0,
0, 0
)
# Create matrix of structural paths `B`.
B <- model_matrix(dimensions = c(2, 2),
# η1 # η2
0, 0,
β21, 0
)
# Create the identity matrix `I`.
I <- model_matrix(dimensions = c(2, 2),
# η1 # η2
1, 0,
0, 1
)
# Create a list with starting values for the model parameters specified above.
parameters <- c(
φ11 = 0.5,
φ22 = 0.5,
β21 = 0.5
)
# Let's take a look at the matrices we have so far.
#
# Note.
#
# For the matrices we specified "unknown" model parameters (i.e., the Greek
# letters). Therefore, we first need to update the matrices by replacing these
# letters with the numerical values in `parameters` vector. Later we will see
# that the computer (i.e., actually optimizer) will do this for us repeatedly
# until it finds the best set of values for the parameters in the `parameters`
# vector.
# Print `Λ`.
Λ
# Update `Ψ`.
update(Ψ, parameters)
# Print `Θ`.
Θ
# Update `B`.
update(B, parameters)
# Print `I`.
I
# Now, let's create a function to compute our model-implied covariance matrix
# `Σ` based on the beautiful full SEM framework formula (i.e., the long and
# scary looking one that we derived using covariance algebra). This function
# essentially processes the model matrices above according to the formula to
# produce the model-implied covariance matrix.
#
# Recall that `Σ` is a function of model parameters, thereby we also need to
# provide some starting values, or guesses for those parameters (i.e., using the
# `update` function as we did above).
#
# Note.
#
# In `R`, matrix multiplication uses the `%*%` operator, not the `*` one.
# Create function to compute `Σ`.
calculate_sigma <- function(Λ, Ψ, Θ, B, I, parameters) {
# Update model matrices parameters with given values.
Λ <- update(Λ, parameters)
Ψ <- update(Ψ, parameters)
Θ <- update(Θ, parameters)
B <- update(B, parameters)
I <- update(I, parameters)
# Calculate the model-implied covariance matrix `Σ`.
Σ <- Λ %*% inverse(I - B) %*% Ψ %*% transpose(inverse(I - B)) %*% transpose(Λ) + Θ
# Return `Σ`.
return(Σ)
}
# Calculate `Σ`.
Σ <- calculate_sigma(Λ, Ψ, Θ, B, I, parameters)
# Compare it to our observed covariance matrix `S`.
S
# It doesn't look like we are too close. Let's go ahead and write a function to
# compute the value of the fit function (i.e., `F_ML`) and quantify how far away
# are we from reproducing the observed covariance matrix `S` using our
# model-implied covariance matrix `Σ`.
#
# Note.
# - `det` stands for matrix determinant
# - `nrow(S)` gives us the number of variables `p`
# Calculate function to compute the fit function.
calculate_fit <- function(S, Σ) {
# Calculate fit function value.
fit_function_value <- trace(S %*% inverse(Σ)) - log(det(S %*% inverse(Σ))) - nrow(S)
# Return fit function value.
return(fit_function_value)
}
# Calculate the fit function value.
calculate_fit(S, Σ)
# It looks like our fit function is quite far above zero, indicating that our
# set of values for the parameters in the `parameters` vector are not that
# great.
# You can try to adjust the model parameters manually to improve the fit.
parameters <- c(
φ11 = 6.87,
φ22 = .5,
β21 = .80
)
# Compute `Σ` given new parameter values.
Σ <- calculate_sigma(Λ, Ψ, Θ, B, I, parameters)
# Calculate updated fit.
calculate_fit(S, Σ)
# However, that is a next-to-impossible feat! So, we can instead pass some
# information to an optimizer to try values for us until it finds the optimal
# (i.e., best) ones using clever strategies. Specifically, we need to pass the
# following to the optimizer:
# - the model-implied covariance matrix `Σ`
# - some initial guesses for the model parameters we specified
# - the fit function definition that quantifies the discrepancy between the
# observed covariance matrix `S` and the model-implied covariance matrix `Σ`
# - essentially, the optimizer (e.g., `nlminb`) repeatedly tries different
# values for the model parameters and checks if the fit function value
# goes in the right direction (e.g., decreases)
# - and it continues this process until it thinks it found the best set of
# parameters that produce the smallest value for our fit function.
#
# Note.
#
# For convenience, we wrap the computation of the model-implied covariance
# matrix `Σ` and fit function into a single function that we call our
# "objective".
# Define the objective function for the optimizer.
objective_function <- function(parameters, S, Λ, Ψ, Θ, B, I) {
# Calculate `Σ`.
Σ <- calculate_sigma(Λ, Ψ, Θ, B, I, parameters)
# Calculate fit function value.
fit_value <- calculate_fit(S, Σ)
# Return the fit value.
return(fit_value)
}
# Test the objective function and see that we get the same values as above.
objective_function(parameters, S, Λ, Ψ, Θ, B, I)
# Finally, we can pass it to the optimizer.
#
# Note.
#
# We can either use `optim` or, even better, let's use the same one that
# `lavaan` uses, which is `nlminb`.
solution <- nlminb(
start = parameters,
objective = objective_function,
S = S, Λ = Λ, Ψ = Ψ, Θ = Θ, B = B, I = I
)
# Let's extract the optimal parameter values from the solution.
optimal_parameters <- solution$par
# Let's print the optimal parameter values rounded to three decimals.
round(optimal_parameters, 3)
# Now, let's check if what we did is correct. For this, we can `lavaan` as the
# "source of truth". You will see that both specifying the model and estimating
# values for the model parameters is drastically easier with `lavaan`...
#
# But, we don't do these things because it's easy, rather because they are fun!
# Specify the model using `lavaan` syntax.
model_syntax <- "
y2 ~ y1
"
# Estimate values for the model parameters using `lavaan`.
fit <- lavaan::sem(model_syntax, data = data)
# Print the model fit summary.
summary(fit)
# Let's now print our optimal values.
round(optimal_parameters, 3)
# Looks like we obtained similar values!
# Note.
#
# For complicated models, the values may not be exactly the same. In reality,
# `lavaan` uses clever strategies for selecting starting values and provides the
# gradient to the optimizer to aid the parameter search. Instead, we are lazy
# and rely on numerical approximations of the gradient. La, la, la...
# Let's also compute the model-implied covariance matrix `Σ` and the value of
# the fit function using the best set of parameters we found.
# Compute `Σ` given the optimal parameter values.
Σ <- calculate_sigma(Λ, Ψ, Θ, B, I, optimal_parameters)
# Calculate updated fit.
calculate_fit(S, Σ)
# We can see that the fit function is essentially zero! But why is the fit
# function zero?
#
# Because we have a saturated model and we can reproduce the observed covariance
# matrix `S` perfectly.
# Congratulations, you just learned the key elements behind fitting SEM models!
# You should really be proud of yourself at at this point. I am serious!
# #endregion
# #region A CFA model with two latent variables.
# Are you up for a challenge? See if you can extend the code above to two
# exogenous latent variables with three indicators each (i.e., as seen on slide
# 58 in lecture two).
# Tip. You don't need to update the helper or calculation functions. They are
# designed to be general. Instead, you need to update the data and the model
# matrices by specifying the correct parameters for which you want to estimate
# values
# Extend the data.
data <- PoliticalDemocracy[, c("y1", "y2", "y3", "x1", "x2", "x3")]
# Determine sample size.
n <- nrow(data)
# Create unbiased observed covariance matrix.
S_unbiased <- cov(data)
# Observed covariance matrix.
S <- ((n - 1) / n) * S_unbiased
# Create factor loadings matrix `Λ`.
Λ <- model_matrix(dimensions = c(6, 2),
# η1 # η1
1, 0,
λ21, 0,
λ31, 0,
0, 1,
0, λ52,
0, λ62
)
# Note.
#
# A covariance matrix is symmetrical, and that also includes the covariance
# matrix `Ψ` for the latent variables `η`. Therefore, we need to use the same
# "symbol" (i.e., Greek letter) for the parameters that are the same. Otherwise,
# the optimizer will try to find values for parameters that it shouldn't. In
# this case, `φ21` and `φ12` should have the same value. For this reason, we can
# indicate in the model matrix that `φ12` is the same as `φ21`.
#
# This is a bit confusing, but this is exactly the reason, and many others, why
# `lavaan` is intuitive to use. These things have been well thought through...
# But, since this an educational exercise, let's not mind the small
# inconsistency.
# Create latent variables covariance matrix `Ψ`.
Ψ <- model_matrix(dimensions = c(2, 2),
# ζ1 # ζ2
φ11, φ21,
φ21, φ22
)
# Create matrix of observed residuals `Θ`.
Θ <- model_matrix(dimensions = c(6, 6),
# ε1 # ε2 # ε2 # ε4 # ε5 # ε6
θ11, 0, 0, 0, 0, 0,
0, θ22, 0, 0, 0, 0,
0, 0, θ33, 0, 0, 0,
0, 0, 0, θ44, 0, 0,
0, 0, 0, 0, θ55, 0,
0, 0, 0, 0, 0, θ66
)
# Create matrix of structural paths `B`.
B <- model_matrix(dimensions = c(2, 2),
# η1 # η2
0, 0,
0, 0
)
# Create the identity matrix `I`.
I <- model_matrix(dimensions = c(2, 2),
# η1 # η2
1, 0,
0, 1
)
# Create a list with starting values for the model parameters specified above.
parameters <- c(
# Loadings.
λ21 = 0.5,
λ31 = 0.5,
λ52 = 0.5,
λ62 = 0.5,
# Latent variances.
φ11 = 0.5,
φ22 = 0.5,
φ21 = 0.5,
# Residual variances.
θ11 = 0.5,
θ22 = 0.5,
θ33 = 0.5,
θ44 = 0.5,
θ55 = 0.5,
θ66 = 0.5
)
# Fit the model using `lavaan`.
# Specify the `lavaan` model syntax.
model_syntax <- "
# Measurement model.
η1 =~ y1 + y2 + y3
η2 =~ x1 + x2 + x3
# Covariance between exogenous latent variables.
η2 ~~ η1
"
# Estimate values for the model parameters using `lavaan`.
fit <- lavaan::sem(model_syntax, data = data)
# Print the model fit summary.
summary(fit)
# Fit the model using our knowledge of the SEM framework!
# Again, we use the same optimizer as `lavaan`.
solution <- nlminb(
start = parameters,
objective = objective_function,
S = S, Λ = Λ, Ψ = Ψ, Θ = Θ, B = B, I = I
)
# Let's extract the optimal parameter values from the solution.
optimal_parameters <- solution$par
# Let's print the optimal parameter values rounded to three decimals.
round(optimal_parameters, 3)
# Now also print the `lavaan` fit for comparison.
summary(fit)
# We see that, again, we are pretty close!
# #endregion
# #region Two latent variables and a regression path.
# What we just did was a CFA model. But, in this course, we are doing SEM, so
# let's go ahead and estimate a structural path (i.e., a regression) between the
# latent variables.
#
# Update the code above such that `η2` is an endogenous variable predicted by
# the exogenous `η1`.
# Update latent variables covariance matrix `Ψ`.
Ψ <- model_matrix(dimensions = c(2, 2),
# ζ1 # ζ2
φ11, 0,
0, φ22
)
# Update matrix of structural paths `B`.
B <- model_matrix(dimensions = c(2, 2),
# η1 # η2
0, 0,
β21, 0
)
# Guess some starting values for the optimizer.
parameters <- c(
# Loadings.
λ21 = 0.5,
λ31 = 0.5,
λ52 = 0.5,
λ62 = 0.5,
# Latent variances.
φ11 = 0.5,
φ22 = 0.5,
# Regression paths.
β21 = 0.5,
# Residual variances.
θ11 = 0.5,
θ22 = 0.5,
θ33 = 0.5,
θ44 = 0.5,
θ55 = 0.5,
θ66 = 0.5
)
# Again, we use the same optimizer as `lavaan`.
solution <- nlminb(
start = parameters,
objective = objective_function,
S = S, Λ = Λ, Ψ = Ψ, Θ = Θ, B = B, I = I
)
# Let's extract the optimal parameter values from the solution.
optimal_parameters <- solution$par
# Let's print the optimal parameter values rounded to three decimals.
round(optimal_parameters, 3)
# Now, let's verify our answer using `lavaan`.
# Specify the `lavaan` model syntax.
model_syntax <- "
# Measurement model.
η1 =~ y1 + y2 + y3
η2 =~ x1 + x2 + x3
# Regression path.
η2 ~ η1
"
# Estimate values for the model parameters using `lavaan`.
fit <- lavaan::sem(model_syntax, data = data)
# Print the model fit summary.
summary(fit)
# Print our solution for comparison.
round(optimal_parameters, 3)
# #endregion
# #region Some model fit calculations.
# We can also compare our solution to `lavaan`` the as follows.
lavInspect(fit, what = "est")
list(
lambda = update(Λ, optimal_parameters),
theta = update(Θ, optimal_parameters),
psi = update(Ψ, optimal_parameters),
beta = update(B, optimal_parameters)
)
# Compute `Σ` given the optimal parameter values.
Σ <- calculate_sigma(Λ, Ψ, Θ, B, I, optimal_parameters)
# Calculate updated fit.
F_ML <- calculate_fit(S, Σ)
# Compare model-implied covariances.
Σ
lavInspect(fit, what = "sigma")
# Extract fit measures from `lavaan`.
fitMeasures(fit)[c("fmin", "chisq")]
# Calculate `fmin` using our `F_ML` value.
F_ML / 2
# Calculate the test statistic `T`, i.e., the χ^2 test statistic.
# Determine sample size.
n <- nrow(data)
T <- n * F_ML
T
# Extract degrees of freedom from `lavaan`.
fitMeasures(fit)[c("df")]
# Calculate our own degrees of freedom. We start with determining how many
# unique elements we have in the observed covariance matrix `S`.
#
# Define the number of variables.
p <- nrow(S)
#
# Define the number of unique elements.
k <- (p * (p + 1)) / 2
#
# Define number of free model parameters.
q <- length(parameters)
#
# Calculate degrees of freedom.
DF <- k - q
DF
# P value for chi square.
pchisq(T, DF, lower.tail = FALSE)
# From `lavaan`.
fitMeasures(fit)[c("chisq", "pvalue")]
# Extract `RMSEA` from `lavaan`.
fitMeasures(fit)[c("rmsea")]
# Calculate our own `RMSEA`.
sqrt((T - DF) / n * DF)
# If you reached this point... pat yourself on the back.
# What you did is remarkable! Well done!
# #endregion
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment