stonegold546/sum_s.R

## sum_s.R
library(lavaan)
library(psych)

sim.fun <- function (lv, lambda, nrep = 2e3) {
  np <- nrow(lv)
  t(replicate(nrep, {
    X <- lv %*% lambda +
      matrix(rnorm(np * length(lambda), 0, sqrt(1 - lambda ^ 2)), np, byrow = TRUE)
    # summary(cfa(paste("F =~", paste0("V", 1:length(lambda), collapse = " + ")), X, std.lv = TRUE))

    ss <- rowSums(X)
    ss.fit <- unname(fitmeasures(
      cfa(paste(paste("F =~ a *", paste0("V", 1:length(lambda), collapse = " + a * ")),
                paste0("V", 1:length(lambda), " ~~ b * V", 1:length(lambda), collapse = "\n"),
                sep = "\n"), X),
      c("chisq", "df", "pvalue")))
    fs <- predict(cfa(paste("F =~", paste0("V", 1:length(lambda), collapse = " + ")), X))

    c(alpha = psych::alpha(X)$total$raw_alpha, ss = cor(lv, ss), fs = cor(lv, fs),
      chisq = ss.fit[1], df = ss.fit[2], pvalue = ss.fit[3])
  }))
}

# c(.7, .8, .6, .7, .75, .5, .85, .55, .65)
np <- 6e1
lat <- matrix(qnorm((1:np - .5) / np))

(loadings.1 <- seq(.9, .6, length.out = 9)) ^ 2
res.1 <- sim.fun(lat, loadings.1)
colMeans(res.1)
mean(res.1[, "pvalue"] < .05)
apply(res.1[, 1:3], 2, function (x) mean((x - 1) ^ 2))

(loadings.2 <- seq(.9, .6, length.out = 3)) ^ 2
res.2 <- sim.fun(lat, loadings.2)
colMeans(res.2)
mean(res.2[, "pvalue"] < .05)
apply(res.2[, 1:3], 2, function (x) mean((x - 1) ^ 2))

(loadings.3 <- seq(.5, .3, length.out = 9)) ^ 2
res.3 <- sim.fun(lat, loadings.3)
colMeans(res.3)
mean(res.3[, "pvalue"] < .05)
apply(res.3[, 1:3], 2, function (x) mean((x - 1) ^ 2))

(loadings.4 <- seq(.5, .3, length.out = 3)) ^ 2
res.4 <- sim.fun(lat, loadings.4)
colMeans(res.4)
mean(res.4[, "pvalue"] < .05)
apply(res.4[, 1:3], 2, function (x) mean((x - 1) ^ 2))

(loadings.5 <- seq(.9, .3, length.out = 9)) ^ 2
res.5 <- sim.fun(lat, loadings.5)
colMeans(res.5)
mean(res.5[, "pvalue"] < .05)
apply(res.5[, 1:3], 2, function (x) mean((x - 1) ^ 2))

library(dplyr)
library(ggplot2)
library(scales)
library(directlabels)

theme_set(theme_classic())

res.all <- bind_rows(lapply(list(res.1, res.2, res.3, res.4, res.5), as.data.frame), .id = "design") %>%
  mutate(design = case_when(
    design == 1 ~ "9 items strong",
    design == 2 ~ "3 items strong",
    design == 3 ~ "9 items weak",
    design == 4 ~ "3 items weak",
    design == 5 ~ "9 items hetero"
  ), design = factor(design, levels = c("3 items weak", "9 items weak", "3 items strong",
                                        "9 items strong", "9 items hetero")))

cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

res.all %>%
  reshape(direction = "long", varying = 3:4, times = c("ss", "fs"), v.names = "correlation") %>%
  ggplot(aes(abs(correlation), fill = time)) + geom_density(alpha = .5) +
  geom_rug(aes(col = time), alpha = .25) +
  facet_wrap(~ design, scales = "free") +
  scale_fill_manual(values = cbbPalette, labels = c("Factor score", "Sum score")) +
  scale_color_manual(values = cbbPalette, labels = c("Factor score", "Sum score")) +
  theme(legend.position = c(.8, .3)) +
  labs(x = "", fill = "Approach", col = "Approach",
       subtitle = "Distribution of correlation to latent variable",
       caption = "The axes are different for each panel")
ggsave("cor.pdf", height = 5, width = 7)
ggsave("cor.png", height = 5, width = 7)

res.all %>%
  group_by(design) %>% mutate(power = mean(pvalue < .05)) %>%
  ggplot(aes(x = design, y = power)) +
  geom_segment(aes(xend = design, y = 0, yend = power), alpha = .00625) +
  geom_rug(aes(y = pvalue), x = NA, length = unit(.25, "npc"), alpha = .0625) +
  facet_wrap(~ reorder(design, -power), ncol = 1, scales = "free_y") +
  geom_point(shape = 1, alpha = .00625) + coord_flip() +
  geom_text(aes(label = percent(power, .1)), vjust = -1, alpha = .00625) +
  scale_y_continuous(labels = percent_format()) +
  labs(y = "Statistical power to detect misspecification of sum score model; alpha = .05", x = "",
       caption = "Vertical strips represent distribution of p-values") +
  theme(axis.line.y = element_blank(), axis.ticks.y = element_blank(),
        strip.background = element_blank(), strip.text = element_blank())
ggsave("power_misspec.pdf", height = 4, width = 7)
ggsave("power_misspec.png", height = 4, width = 7)

(tmp <- res.all %>%
    group_by(design) %>%
    summarise(alpha = mean(alpha), ss = mean((ss - 1) ^ 2),
              fs = mean((abs(fs) - 1) ^ 2),
              ratio = number(fs / ss, .01)))
tmp %>%
  ggplot(aes(ss, fs)) + geom_point(aes(size = ratio), shape = 1) + coord_fixed() +
  geom_dl(aes(label = design), method = "smart.grid") +
  scale_y_continuous(trans = log_trans(), breaks = round(tmp$fs, 4)) +
  scale_x_continuous(trans = log_trans(), breaks = round(tmp$ss, 4)) +
  scale_size_discrete(guide = guide_legend(reverse = TRUE)) +
  theme(legend.box.just = 0:1, legend.position = c(.25, .75)) +
  labs(x = "Sum MSE", y = "Factor MSE", size = "Factor MSE / Sum MSE",
       subtitle = "Mean squared error of correlation to latent variable",
       caption = "Axes are on natural log scale")
ggsave("mse.pdf", height = 5.5, width = 5)
ggsave("mse.png", height = 5.5, width = 5)

lmat <- data.frame(loadings = c(loadings.1, loadings.2, loadings.3, loadings.4, loadings.5),
                   design = c(rep("9 items strong", 9), rep("3 items strong", 3), rep("9 items weak", 9),
                              rep("3 items weak", 3), rep("9 items hetero", 9))) %>%
  mutate(design = factor(design, levels = rev(c("9 items strong", "3 items strong", "9 items weak",
                                                "3 items weak", "9 items hetero"))))

lmat %>%
  ggplot(aes(design, loadings)) + geom_point(shape = 1) + coord_flip() +
  scale_y_continuous(breaks = seq(.3, .9, .05),
                     sec.axis = sec_axis(trans = ~ . ^ 2, breaks = seq(.3, .9, .075) ^ 2,
                                         labels = percent_format(), name = "R-square")) +
  labs(x = "Loading set", y = "Loadings")
ggsave("loadings.pdf", height = 3, width = 7)
ggsave("loadings.png", height = 3, width = 7)
	library(lavaan)
	library(psych)

	sim.fun <- function (lv, lambda, nrep = 2e3) {
	np <- nrow(lv)
	t(replicate(nrep, {
	X <- lv %*% lambda +
	matrix(rnorm(np * length(lambda), 0, sqrt(1 - lambda ^ 2)), np, byrow = TRUE)
	# summary(cfa(paste("F =~", paste0("V", 1:length(lambda), collapse = " + ")), X, std.lv = TRUE))

	ss <- rowSums(X)
	ss.fit <- unname(fitmeasures(
	cfa(paste(paste("F =~ a ", paste0("V", 1:length(lambda), collapse = " + a ")),
	paste0("V", 1:length(lambda), " ~~ b * V", 1:length(lambda), collapse = "\n"),
	sep = "\n"), X),
	c("chisq", "df", "pvalue")))
	fs <- predict(cfa(paste("F =~", paste0("V", 1:length(lambda), collapse = " + ")), X))

	c(alpha = psych::alpha(X)$total$raw_alpha, ss = cor(lv, ss), fs = cor(lv, fs),
	chisq = ss.fit[1], df = ss.fit[2], pvalue = ss.fit[3])
	}))
	}

	# c(.7, .8, .6, .7, .75, .5, .85, .55, .65)
	np <- 6e1
	lat <- matrix(qnorm((1:np - .5) / np))

	(loadings.1 <- seq(.9, .6, length.out = 9)) ^ 2
	res.1 <- sim.fun(lat, loadings.1)
	colMeans(res.1)
	mean(res.1[, "pvalue"] < .05)
	apply(res.1[, 1:3], 2, function (x) mean((x - 1) ^ 2))

	(loadings.2 <- seq(.9, .6, length.out = 3)) ^ 2
	res.2 <- sim.fun(lat, loadings.2)
	colMeans(res.2)
	mean(res.2[, "pvalue"] < .05)
	apply(res.2[, 1:3], 2, function (x) mean((x - 1) ^ 2))

	(loadings.3 <- seq(.5, .3, length.out = 9)) ^ 2
	res.3 <- sim.fun(lat, loadings.3)
	colMeans(res.3)
	mean(res.3[, "pvalue"] < .05)
	apply(res.3[, 1:3], 2, function (x) mean((x - 1) ^ 2))

	(loadings.4 <- seq(.5, .3, length.out = 3)) ^ 2
	res.4 <- sim.fun(lat, loadings.4)
	colMeans(res.4)
	mean(res.4[, "pvalue"] < .05)
	apply(res.4[, 1:3], 2, function (x) mean((x - 1) ^ 2))

	(loadings.5 <- seq(.9, .3, length.out = 9)) ^ 2
	res.5 <- sim.fun(lat, loadings.5)
	colMeans(res.5)
	mean(res.5[, "pvalue"] < .05)
	apply(res.5[, 1:3], 2, function (x) mean((x - 1) ^ 2))

	library(dplyr)
	library(ggplot2)
	library(scales)
	library(directlabels)

	theme_set(theme_classic())

	res.all <- bind_rows(lapply(list(res.1, res.2, res.3, res.4, res.5), as.data.frame), .id = "design") %>%
	mutate(design = case_when(
	design == 1 ~ "9 items strong",
	design == 2 ~ "3 items strong",
	design == 3 ~ "9 items weak",
	design == 4 ~ "3 items weak",
	design == 5 ~ "9 items hetero"
	), design = factor(design, levels = c("3 items weak", "9 items weak", "3 items strong",
	"9 items strong", "9 items hetero")))

	cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

	res.all %>%
	reshape(direction = "long", varying = 3:4, times = c("ss", "fs"), v.names = "correlation") %>%
	ggplot(aes(abs(correlation), fill = time)) + geom_density(alpha = .5) +
	geom_rug(aes(col = time), alpha = .25) +
	facet_wrap(~ design, scales = "free") +
	scale_fill_manual(values = cbbPalette, labels = c("Factor score", "Sum score")) +
	scale_color_manual(values = cbbPalette, labels = c("Factor score", "Sum score")) +
	theme(legend.position = c(.8, .3)) +
	labs(x = "", fill = "Approach", col = "Approach",
	subtitle = "Distribution of correlation to latent variable",
	caption = "The axes are different for each panel")
	ggsave("cor.pdf", height = 5, width = 7)
	ggsave("cor.png", height = 5, width = 7)

	res.all %>%
	group_by(design) %>% mutate(power = mean(pvalue < .05)) %>%
	ggplot(aes(x = design, y = power)) +
	geom_segment(aes(xend = design, y = 0, yend = power), alpha = .00625) +
	geom_rug(aes(y = pvalue), x = NA, length = unit(.25, "npc"), alpha = .0625) +
	facet_wrap(~ reorder(design, -power), ncol = 1, scales = "free_y") +
	geom_point(shape = 1, alpha = .00625) + coord_flip() +
	geom_text(aes(label = percent(power, .1)), vjust = -1, alpha = .00625) +
	scale_y_continuous(labels = percent_format()) +
	labs(y = "Statistical power to detect misspecification of sum score model; alpha = .05", x = "",
	caption = "Vertical strips represent distribution of p-values") +
	theme(axis.line.y = element_blank(), axis.ticks.y = element_blank(),
	strip.background = element_blank(), strip.text = element_blank())
	ggsave("power_misspec.pdf", height = 4, width = 7)
	ggsave("power_misspec.png", height = 4, width = 7)

	(tmp <- res.all %>%
	group_by(design) %>%
	summarise(alpha = mean(alpha), ss = mean((ss - 1) ^ 2),
	fs = mean((abs(fs) - 1) ^ 2),
	ratio = number(fs / ss, .01)))
	tmp %>%
	ggplot(aes(ss, fs)) + geom_point(aes(size = ratio), shape = 1) + coord_fixed() +
	geom_dl(aes(label = design), method = "smart.grid") +
	scale_y_continuous(trans = log_trans(), breaks = round(tmp$fs, 4)) +
	scale_x_continuous(trans = log_trans(), breaks = round(tmp$ss, 4)) +
	scale_size_discrete(guide = guide_legend(reverse = TRUE)) +
	theme(legend.box.just = 0:1, legend.position = c(.25, .75)) +
	labs(x = "Sum MSE", y = "Factor MSE", size = "Factor MSE / Sum MSE",
	subtitle = "Mean squared error of correlation to latent variable",
	caption = "Axes are on natural log scale")
	ggsave("mse.pdf", height = 5.5, width = 5)
	ggsave("mse.png", height = 5.5, width = 5)

	lmat <- data.frame(loadings = c(loadings.1, loadings.2, loadings.3, loadings.4, loadings.5),
	design = c(rep("9 items strong", 9), rep("3 items strong", 3), rep("9 items weak", 9),
	rep("3 items weak", 3), rep("9 items hetero", 9))) %>%
	mutate(design = factor(design, levels = rev(c("9 items strong", "3 items strong", "9 items weak",
	"3 items weak", "9 items hetero"))))

	lmat %>%
	ggplot(aes(design, loadings)) + geom_point(shape = 1) + coord_flip() +
	scale_y_continuous(breaks = seq(.3, .9, .05),
	sec.axis = sec_axis(trans = ~ . ^ 2, breaks = seq(.3, .9, .075) ^ 2,
	labels = percent_format(), name = "R-square")) +
	labs(x = "Loading set", y = "Loadings")
	ggsave("loadings.pdf", height = 3, width = 7)
	ggsave("loadings.png", height = 3, width = 7)