SachaEpskamp/rivm_lgc.R

## rivm_lgc.R
# This script contains some functions to automate things:

cat("THIS FUNCTION IS OUTDATED, PLEASE SEE https://github.com/SachaEpskamp/RIVMgrowth")

# Function to automize dummy encoding:
rivm_lgc_dummy <- function(
  data, # Dataset
  design, # Design matrix, as in psychonetrics
  type = c("non-linear","linear"), # Type of analysis to do
  predictor_var, # Variable name of predictor (only one supported at the moment)
  time # time vector encoding distances in time
){
  library("dplyr")

  # Check data:
  stopifnot(is.data.frame(data))

  # Check design matrix:
  allVars <- unlist(design)
  allVars <- allVars[!is.na(allVars)]
  if (!all(allVars %in% names(data))){
    stop("Not all names in the design matrix correspond to variables in the dataset.")
  }

  # Extract from design:
  if (!is.matrix(design)){
    design <- t(design)
  }
  nVar <- nrow(design)
  nWave <- ncol(design)

  # Names of design:
  if (is.null(rownames(design))){
    rownames(design) <- paste0("Var_",seq_len(nVar))
  }
  variables <- rownames(design)

  # Time vector:
  if (missing(time)){
    time <- seq_len(nWave)
  }

  # Type of analysis:
  type <- match.arg(type)

  # Check if predictor is present:
  use_predictor <- !missing(predictor_var)

  # Setup dummy:
  if (use_predictor){
    # Check data:
    if (!predictor_var %in% names(data)){
      stop("predictor_var does not correspond to a variable in the dataset")
    }

    # Make factor if needed:
    if (!is.factor(data[[predictor_var]])){
      data[[predictor_var]] <- as.factor(data[[predictor_var]])
    }

    # Make dummy variables:
    predictor_levels <- levels(data[[predictor_var]])
    nlevels_predictor <- length(predictor_levels)
    nDummy <- nlevels_predictor-1

    # Dummy coding:
    dummyvars <- paste0("DUMMY_",seq_len(nDummy))
    for (d in seq_len(nDummy)){
      data[[dummyvars[[d]]]] <- 1* (data[[predictor_var]] == predictor_levels[d+1])
    }

  } else {
    nDummy <- 0
    dummyvars <- character(0)
  }

  # Names of parameters needed in the model:
  intercepts <- paste0("int_",variables)
  slopes <- paste0("slope_",variables)
  residual <- paste0("resid_",variables)

  # Slope parameters:
  if (type == "non-linear"){
    slope_pars <- lapply(variables,function(x){
      c(0,1,paste0("slope_",x,"_",3:nWave))
    })
    names(slope_pars) <- variables
  } else {
    slope_pars <- lapply(variables,function(x){
      time - time[1]
    })
    names(slope_pars) <- variables
  }

  # Intercept mean parameters:
  int_mean_pars <- lapply(variables,function(x){
    paste0("intercept_model_",x,"_",seq_len(nDummy+1)-1)
  })
  names(int_mean_pars) <- variables

  # Slope mean parameters:
  slope_mean_pars <- lapply(variables,function(x){
    paste0("slope_model_",x,"_",seq_len(nDummy+1)-1)
  })
  names(slope_mean_pars) <- variables

  # Start forming the model:
  mod <- character(0)
  cur_line <- 0

  # loop over variables:
  for (v in seq_len(nVar)){
    # Variables from the design matrix:
    curvars <- design[v,]

    # Which are not NA?
    incl <- !is.na(curvars)
    inclvars <- which(incl)

    # Cut out NA:
    curvars <- curvars[!is.na(curvars)]

    # Update the line:
    cur_line <- cur_line + 2

    # Header:
    mod[cur_line] <- paste0("# Intercepts for variable ",variables[v])

    # Update the line:
    cur_line <- cur_line + 1

    # Write intercepts:
    mod[cur_line] <- paste0(intercepts[v], " =~ ", paste0("1 * ",curvars, collapse = " + "))

    # Update the line:
    cur_line <- cur_line + 1

    # Now model the intercept:
    mod[cur_line] <- paste0(intercepts[v], " ~ ", paste0(int_mean_pars[[v]],"*",c("1", dummyvars), collapse = " + "))

    # Update the line:
    cur_line <- cur_line + 2

    # Header:
    mod[cur_line] <- paste0("# Slopes for variable ",variables[v])

    # Update the line:
    cur_line <- cur_line + 1

    # Write slopes:
    mod[cur_line] <- paste0(slopes[v], " =~ ", paste0(slope_pars[[v]][incl]," * ",curvars, collapse = " + "))

    # Update the line:
    cur_line <- cur_line + 1

    # Now model the slopes:
    mod[cur_line] <- paste0(slopes[v], " ~ ", paste0(slope_mean_pars[[v]],"*",c("1", dummyvars), collapse = " + "))

    # Update the line:
    cur_line <- cur_line + 2

    # Header:
    mod[cur_line] <- paste0("# Residuals for variable ",variables[v])

    # Write residuals:
    for (i in seq_along(curvars)){
      # Update the line:
      cur_line <- cur_line + 1

      mod[cur_line] <- paste0(curvars[i], " ~~ ", residual[v]," * ",curvars[i])
    }

  }


  # Now add all expected values of all groups at all waves for all variables:
  if (use_predictor){
    ev_df <- expand.grid(
      predictor = seq_along(predictor_levels),
      wave = seq_len(nWave),
      var = variables,
      stringsAsFactors = FALSE
    )
    names(ev_df) <- c("predictor","wave","var")

    # Add dummies:
    for (d in seq_len(nDummy)){
      ev_df[[dummyvars[[d]]]] <- 1* (predictor_levels[ev_df$predictor] == predictor_levels[d+1])
    }

    # Parameter name:
    ev_df$par <-  paste0(ev_df$var,"_wave",ev_df$wave,"_",ev_df$predictor)


  } else {
    ev_df <- expand.grid(
      wave = seq_len(nWave),
      var = variables,
      stringsAsFactors = FALSE
    )

    # Parameter name:
    ev_df$par <-  paste0(ev_df$var,"_wave",ev_df$wave)

  }

  # Update the line:
  cur_line <- cur_line + 2

  # Header:
  mod[cur_line] <- "# Dummy encoding"


  # Loop over these to add:
  for (i in seq_len(nrow(ev_df))){
    var <- ev_df$var[i]
    wave <- ev_df$wave[i]
    dummies <- unlist(ev_df[i,dummyvars])


    # Current parameter:
    curpar <- ev_df$par[i]

    # Update the line:
    cur_line <- cur_line + 1

    # Write the line:
    mod[cur_line] <-  paste0(
      curpar, " := ", paste0(
        int_mean_pars[[var]], "*", c("1",dummies),
        collapse = " + "
      ), " + ",
      paste0(
        slope_mean_pars[[var]], "*", c("1",dummies), " * ", slope_pars[[v]][wave],
        collapse = " + "
      )
    )
  }

  # Fix empty lines:
  mod[is.na(mod)] <- ""

  # Aggregate:
  model <- paste0(mod, collapse = "\n")


  # Run lavaan model:
  fit <- growth(model, data, missing = "fiml")

  # Connect the parameter estimates to the expected value table:
  parTable <- parameterEstimates(fit, standardized = TRUE)

  # Join with the expected values:
  ev_df <- ev_df %>% left_join(parTable, by = c("par" = "label"))

  # Add time:
  ev_df$time_of_wave <- time[ev_df$wave]

  # Fix predictor:
  if (use_predictor){
    ev_df$predictor <- predictor_levels[ev_df$predictor]
  }

  # Select columns:
  if (use_predictor){
    ev_df <- ev_df[,c("var","wave","time_of_wave","predictor","est","se","ci.lower","ci.upper")]
  } else {
    ev_df <- ev_df[,c("var","wave","time_of_wave","est","se","ci.lower","ci.upper")]
  }


  # Significance of the regressions:
  reg_df <- parTable %>% filter(op %in% c("~1", "~"), lhs %in% c(intercepts, slopes)) %>%
    select(-label)

  # Correlations between slopes:
  cor_df <- parTable %>% filter(op == "~~", lhs %in% c(intercepts, slopes), rhs %in% c(intercepts, slopes)) %>%
    select(lhs, op, rhs, est, se, z, pvalue, ci.lower, ci.upper, std.all)

  # Make list:
  results <- list(
    lavresult = fit,
    expected_values = ev_df,
    regressions = reg_df,
    correlations = cor_df,
    time = time,
    use_predictor = use_predictor
  )
  class(results) <- "rivm_lgc_dummy"


  # Return:
  return(results)
}

# Print method:
print.rivm_lgc_dummy <- function(x){
  cat("=== RIVM Latent Growth Curve Analysis ===","\n\n")

  cat("Lavaan fit result: \n")
  print(x$lavresult)

  cat("\n\nLavaan fit measures: \n")
  fit <- round(fitMeasures(x$lavresult)[c("chisq", "df", "pvalue", "cfi", "tli", "nnfi", "rfi",
                            "nfi", "pnfi", "ifi", "rni", "rmsea", "rmsea.ci.lower", "rmsea.ci.upper",
                            "rmsea.pvalue", "srmr", "gfi", "agfi")], 2)

  df <- data.frame(
    index = names(fit),
    value = fit
  )
  rownames(df) <- NULL
  print(df, digits = 2)

  cat("\n\nRegression coefficients: \n")
  print(x$regressions, digits = 2)

  cat("\n\n(co)variance and correlation coefficients: \n")
  print(x$correlations, digits = 2)

  cat("\n\nExpected values: \n")
  print(x$expected_values, digits = 2)

  cat("\n\n=========================================")
}


# Plot method:
plot.rivm_lgc_dummy <- function(x, predictor_label = "Predictor", wave_label = "Wave "){
  library("ggplot2")

  if (x$use_predictor){
    p <- ggplot(x$expected_values,

                # With aesthetics:
                aes(
                  x = time_of_wave, # This is the variable that enocdes the time of the wave.
                  y = est, # The estimate
                  ymin = ci.lower, # If we want CIs
                  ymax = ci.upper, # If we want CIs
                  colour = predictor,
                  fill = predictor
                )) +

      # Nicer legend (and colorblind theme):
      scale_colour_manual(predictor_label,
                          values = qgraph:::colorblind(length(unique(x$expected_values$predictor))))


  } else {
    p <- ggplot(x$expected_values,

                # With aesthetics:
                aes(
                  x = time_of_wave, # This is the variable that enocdes the time of the wave.
                  y = est, # The estimate
                  ymin = ci.lower, # If we want CIs
                  ymax = ci.upper # If we want CIs
                ))
  }

  p <- p +

    # Seperate plot per variable:
    facet_grid(var ~ ., scales = "free_y") +

    # Add confidence region (uncomment to remove):
    geom_ribbon(alpha = 0.1, show.legend = FALSE, colour = NA) +

    # Add lines:
    geom_line(lwd=1.2) +

    # Add points:
    geom_point(cex = 3, pch = 21, fill = "white", stroke = 1.5) +

    # Nicer theme:
    theme_bw() +

    # Set x axis scale to have waves instead of numbers:
    scale_x_continuous(breaks = sort(unique(x$expected_values$time_of_wave)),
                       labels = paste0(wave_label," ",sort(unique(x$expected_values$wave)))) +

    # No labels:
    xlab("") +
    ylab("") +

    # Larger fonts:
    theme(text = element_text(size = 20), axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))


  return(p)
}


# Correlation plot:
corplot <- function(x, legend.position = "topright"){
  library("dplyr")
  library("qgraph")

  cors <- x$correlations
  cors <- cors %>% filter(lhs != rhs) %>% select(
    lhs = lhs, rhs = rhs, weight = std.all, pvalue = pvalue
  ) %>%
    mutate(
      lhs = gsub("\\_","\n",lhs),
      rhs = gsub("\\_","\n",rhs)
    ) %>% mutate(
      lhs = gsub("int\n","intercept\n",lhs),
      rhs = gsub("int\n","intercept\n",rhs)
    )

  cors$lty <- 3
  cors$lty[cors$pvalue < 0.05] <- 2
  cors$lty[cors$pvalue < 0.01] <- 1

  qgraph(cors[,1:3], directed = FALSE, theme = "colorblind",
         edge.labels = TRUE, lty = cors$lty, vsize = 14)

  legend(legend.position, lty = c(1,2,3),legend = c("p < 0.01","p < 0.05","p > 0.05"),
         bty = "n", cex = 1.5, lwd = 3)

}
	# This script contains some functions to automate things:

	cat("THIS FUNCTION IS OUTDATED, PLEASE SEE https://github.com/SachaEpskamp/RIVMgrowth")

	# Function to automize dummy encoding:
	rivm_lgc_dummy <- function(
	data, # Dataset
	design, # Design matrix, as in psychonetrics
	type = c("non-linear","linear"), # Type of analysis to do
	predictor_var, # Variable name of predictor (only one supported at the moment)
	time # time vector encoding distances in time
	){
	library("dplyr")

	# Check data:
	stopifnot(is.data.frame(data))

	# Check design matrix:
	allVars <- unlist(design)
	allVars <- allVars[!is.na(allVars)]
	if (!all(allVars %in% names(data))){
	stop("Not all names in the design matrix correspond to variables in the dataset.")
	}

	# Extract from design:
	if (!is.matrix(design)){
	design <- t(design)
	}
	nVar <- nrow(design)
	nWave <- ncol(design)

	# Names of design:
	if (is.null(rownames(design))){
	rownames(design) <- paste0("Var_",seq_len(nVar))
	}
	variables <- rownames(design)

	# Time vector:
	if (missing(time)){
	time <- seq_len(nWave)
	}

	# Type of analysis:
	type <- match.arg(type)

	# Check if predictor is present:
	use_predictor <- !missing(predictor_var)

	# Setup dummy:
	if (use_predictor){
	# Check data:
	if (!predictor_var %in% names(data)){
	stop("predictor_var does not correspond to a variable in the dataset")
	}

	# Make factor if needed:
	if (!is.factor(data[[predictor_var]])){
	data[[predictor_var]] <- as.factor(data[[predictor_var]])
	}

	# Make dummy variables:
	predictor_levels <- levels(data[[predictor_var]])
	nlevels_predictor <- length(predictor_levels)
	nDummy <- nlevels_predictor-1

	# Dummy coding:
	dummyvars <- paste0("DUMMY_",seq_len(nDummy))
	for (d in seq_len(nDummy)){
	data[[dummyvars[[d]]]] <- 1* (data[[predictor_var]] == predictor_levels[d+1])
	}

	} else {
	nDummy <- 0
	dummyvars <- character(0)
	}

	# Names of parameters needed in the model:
	intercepts <- paste0("int_",variables)
	slopes <- paste0("slope_",variables)
	residual <- paste0("resid_",variables)

	# Slope parameters:
	if (type == "non-linear"){
	slope_pars <- lapply(variables,function(x){
	c(0,1,paste0("slope_",x,"_",3:nWave))
	})
	names(slope_pars) <- variables
	} else {
	slope_pars <- lapply(variables,function(x){
	time - time[1]
	})
	names(slope_pars) <- variables
	}

	# Intercept mean parameters:
	int_mean_pars <- lapply(variables,function(x){
	paste0("intercept_model_",x,"_",seq_len(nDummy+1)-1)
	})
	names(int_mean_pars) <- variables

	# Slope mean parameters:
	slope_mean_pars <- lapply(variables,function(x){
	paste0("slope_model_",x,"_",seq_len(nDummy+1)-1)
	})
	names(slope_mean_pars) <- variables

	# Start forming the model:
	mod <- character(0)
	cur_line <- 0

	# loop over variables:
	for (v in seq_len(nVar)){
	# Variables from the design matrix:
	curvars <- design[v,]

	# Which are not NA?
	incl <- !is.na(curvars)
	inclvars <- which(incl)

	# Cut out NA:
	curvars <- curvars[!is.na(curvars)]

	# Update the line:
	cur_line <- cur_line + 2

	# Header:
	mod[cur_line] <- paste0("# Intercepts for variable ",variables[v])

	# Update the line:
	cur_line <- cur_line + 1

	# Write intercepts:
	mod[cur_line] <- paste0(intercepts[v], " =~ ", paste0("1 * ",curvars, collapse = " + "))

	# Update the line:
	cur_line <- cur_line + 1

	# Now model the intercept:
	mod[cur_line] <- paste0(intercepts[v], " ~ ", paste0(int_mean_pars[[v]],"*",c("1", dummyvars), collapse = " + "))

	# Update the line:
	cur_line <- cur_line + 2

	# Header:
	mod[cur_line] <- paste0("# Slopes for variable ",variables[v])

	# Update the line:
	cur_line <- cur_line + 1

	# Write slopes:
	mod[cur_line] <- paste0(slopes[v], " =~ ", paste0(slope_pars[[v]][incl]," * ",curvars, collapse = " + "))

	# Update the line:
	cur_line <- cur_line + 1

	# Now model the slopes:
	mod[cur_line] <- paste0(slopes[v], " ~ ", paste0(slope_mean_pars[[v]],"*",c("1", dummyvars), collapse = " + "))

	# Update the line:
	cur_line <- cur_line + 2

	# Header:
	mod[cur_line] <- paste0("# Residuals for variable ",variables[v])

	# Write residuals:
	for (i in seq_along(curvars)){
	# Update the line:
	cur_line <- cur_line + 1

	mod[cur_line] <- paste0(curvars[i], " ~~ ", residual[v]," * ",curvars[i])
	}

	}


	# Now add all expected values of all groups at all waves for all variables:
	if (use_predictor){
	ev_df <- expand.grid(
	predictor = seq_along(predictor_levels),
	wave = seq_len(nWave),
	var = variables,
	stringsAsFactors = FALSE
	)
	names(ev_df) <- c("predictor","wave","var")

	# Add dummies:
	for (d in seq_len(nDummy)){
	ev_df[[dummyvars[[d]]]] <- 1* (predictor_levels[ev_df$predictor] == predictor_levels[d+1])
	}

	# Parameter name:
	ev_df$par <- paste0(ev_df$var,"_wave",ev_df$wave,"_",ev_df$predictor)


	} else {
	ev_df <- expand.grid(
	wave = seq_len(nWave),
	var = variables,
	stringsAsFactors = FALSE
	)

	# Parameter name:
	ev_df$par <- paste0(ev_df$var,"_wave",ev_df$wave)

	}

	# Update the line:
	cur_line <- cur_line + 2

	# Header:
	mod[cur_line] <- "# Dummy encoding"


	# Loop over these to add:
	for (i in seq_len(nrow(ev_df))){
	var <- ev_df$var[i]
	wave <- ev_df$wave[i]
	dummies <- unlist(ev_df[i,dummyvars])


	# Current parameter:
	curpar <- ev_df$par[i]

	# Update the line:
	cur_line <- cur_line + 1

	# Write the line:
	mod[cur_line] <- paste0(
	curpar, " := ", paste0(
	int_mean_pars[[var]], "*", c("1",dummies),
	collapse = " + "
	), " + ",
	paste0(
	slope_mean_pars[[var]], "", c("1",dummies), " ", slope_pars[[v]][wave],
	collapse = " + "
	)
	)
	}

	# Fix empty lines:
	mod[is.na(mod)] <- ""

	# Aggregate:
	model <- paste0(mod, collapse = "\n")



	# Run lavaan model:
	fit <- growth(model, data, missing = "fiml")

	# Connect the parameter estimates to the expected value table:
	parTable <- parameterEstimates(fit, standardized = TRUE)

	# Join with the expected values:
	ev_df <- ev_df %>% left_join(parTable, by = c("par" = "label"))

	# Add time:
	ev_df$time_of_wave <- time[ev_df$wave]

	# Fix predictor:
	if (use_predictor){
	ev_df$predictor <- predictor_levels[ev_df$predictor]
	}

	# Select columns:
	if (use_predictor){
	ev_df <- ev_df[,c("var","wave","time_of_wave","predictor","est","se","ci.lower","ci.upper")]
	} else {
	ev_df <- ev_df[,c("var","wave","time_of_wave","est","se","ci.lower","ci.upper")]
	}



	# Significance of the regressions:
	reg_df <- parTable %>% filter(op %in% c("~1", "~"), lhs %in% c(intercepts, slopes)) %>%
	select(-label)

	# Correlations between slopes:
	cor_df <- parTable %>% filter(op == "~~", lhs %in% c(intercepts, slopes), rhs %in% c(intercepts, slopes)) %>%
	select(lhs, op, rhs, est, se, z, pvalue, ci.lower, ci.upper, std.all)

	# Make list:
	results <- list(
	lavresult = fit,
	expected_values = ev_df,
	regressions = reg_df,
	correlations = cor_df,
	time = time,
	use_predictor = use_predictor
	)
	class(results) <- "rivm_lgc_dummy"


	# Return:
	return(results)
	}

	# Print method:
	print.rivm_lgc_dummy <- function(x){
	cat("=== RIVM Latent Growth Curve Analysis ===","\n\n")

	cat("Lavaan fit result: \n")
	print(x$lavresult)

	cat("\n\nLavaan fit measures: \n")
	fit <- round(fitMeasures(x$lavresult)[c("chisq", "df", "pvalue", "cfi", "tli", "nnfi", "rfi",
	"nfi", "pnfi", "ifi", "rni", "rmsea", "rmsea.ci.lower", "rmsea.ci.upper",
	"rmsea.pvalue", "srmr", "gfi", "agfi")], 2)

	df <- data.frame(
	index = names(fit),
	value = fit
	)
	rownames(df) <- NULL
	print(df, digits = 2)

	cat("\n\nRegression coefficients: \n")
	print(x$regressions, digits = 2)

	cat("\n\n(co)variance and correlation coefficients: \n")
	print(x$correlations, digits = 2)

	cat("\n\nExpected values: \n")
	print(x$expected_values, digits = 2)

	cat("\n\n=========================================")
	}


	# Plot method:
	plot.rivm_lgc_dummy <- function(x, predictor_label = "Predictor", wave_label = "Wave "){
	library("ggplot2")

	if (x$use_predictor){
	p <- ggplot(x$expected_values,

	# With aesthetics:
	aes(
	x = time_of_wave, # This is the variable that enocdes the time of the wave.
	y = est, # The estimate
	ymin = ci.lower, # If we want CIs
	ymax = ci.upper, # If we want CIs
	colour = predictor,
	fill = predictor
	)) +

	# Nicer legend (and colorblind theme):
	scale_colour_manual(predictor_label,
	values = qgraph:::colorblind(length(unique(x$expected_values$predictor))))


	} else {
	p <- ggplot(x$expected_values,

	# With aesthetics:
	aes(
	x = time_of_wave, # This is the variable that enocdes the time of the wave.
	y = est, # The estimate
	ymin = ci.lower, # If we want CIs
	ymax = ci.upper # If we want CIs
	))
	}

	p <- p +

	# Seperate plot per variable:
	facet_grid(var ~ ., scales = "free_y") +

	# Add confidence region (uncomment to remove):
	geom_ribbon(alpha = 0.1, show.legend = FALSE, colour = NA) +

	# Add lines:
	geom_line(lwd=1.2) +

	# Add points:
	geom_point(cex = 3, pch = 21, fill = "white", stroke = 1.5) +

	# Nicer theme:
	theme_bw() +

	# Set x axis scale to have waves instead of numbers:
	scale_x_continuous(breaks = sort(unique(x$expected_values$time_of_wave)),
	labels = paste0(wave_label," ",sort(unique(x$expected_values$wave)))) +

	# No labels:
	xlab("") +
	ylab("") +

	# Larger fonts:
	theme(text = element_text(size = 20), axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))


	return(p)
	}


	# Correlation plot:
	corplot <- function(x, legend.position = "topright"){
	library("dplyr")
	library("qgraph")

	cors <- x$correlations
	cors <- cors %>% filter(lhs != rhs) %>% select(
	lhs = lhs, rhs = rhs, weight = std.all, pvalue = pvalue
	) %>%
	mutate(
	lhs = gsub("\\_","\n",lhs),
	rhs = gsub("\\_","\n",rhs)
	) %>% mutate(
	lhs = gsub("int\n","intercept\n",lhs),
	rhs = gsub("int\n","intercept\n",rhs)
	)

	cors$lty <- 3
	cors$lty[cors$pvalue < 0.05] <- 2
	cors$lty[cors$pvalue < 0.01] <- 1

	qgraph(cors[,1:3], directed = FALSE, theme = "colorblind",
	edge.labels = TRUE, lty = cors$lty, vsize = 14)

	legend(legend.position, lty = c(1,2,3),legend = c("p < 0.01","p < 0.05","p > 0.05"),
	bty = "n", cex = 1.5, lwd = 3)

	}