ito4303/unequal_variance.R

## unequal_variance.R
## ANOVA and Kruskal-Wallis test for data with unequal variances

library(ggplot2)
library(purrr)
library(dplyr)

# Normal distribution
test_norm <- function(R = 2000, N = c(30, 30, 30),
                      mu = c(0, 0, 0), sigma = c(1, 1, 1)) {
  y <- purrr::map(seq_len(R), function(i) {
    x <- c(rnorm(N[1], mean = mu[1], sd = sigma[1]),
           rnorm(N[2], mean = mu[2], sd = sigma[2]),
           rnorm(N[3], mean = mu[3], sd = sigma[3]))
    g <- c(rep("g1", N[1]), rep("g2", N[2]), rep("g3", N[3]))
    d <- data.frame(g, x)
    an <- summary(aov(x ~ g, data = d))[[1]]["Pr(>F)"][1, ]
    ks <- kruskal.test(x ~ g, data = d)$p.value
    c(an, ks)
  })
  df <- data.frame(p_value = purrr::simplify(y),
                   method = rep(c("ANOVA", "Kruskal-Wallis"), R))
  plot <- ggplot(df, aes(p_value, fill = method)) +
    geom_histogram(binwidth = 0.05, boundary = -0.5,
                   position = "dodge")
  print(plot)
  summary <- df %>%
    dplyr::mutate(lt5 = if_else(p_value < 0.05, 1 / (n() / 2), 0)) %>%
    dplyr::group_by(method) %>%
    dplyr::summarize_at("lt5", sum)
  print(summary)
}

set.seed(20180415)
test_norm(R = 10000, N = c(15, 30, 45), mu = c(0, 0, 0), sigma = c(1, 1, 1))
test_norm(R = 10000, N = c(30, 30, 30), mu = c(0, 0, 0), sigma = c(1, 1, 1))
test_norm(R = 10000, N = c(45, 30, 15), mu = c(0, 0, 0), sigma = c(1, 1, 1))

set.seed(20180416)
test_norm(R = 10000, N = c(15, 30, 45), mu = c(0, 0, 0), sigma = c(1, 2, 4))
test_norm(R = 10000, N = c(30, 30, 30), mu = c(0, 0, 0), sigma = c(1, 2, 4))
test_norm(R = 10000, N = c(45, 30, 15), mu = c(0, 0, 0), sigma = c(1, 2, 4))

set.seed(20180417)
test_norm(R = 10000, N = c(30, 60, 90), mu = c(0, 0, 0), sigma = c(1, 2, 4))
test_norm(R = 10000, N = c(60, 60, 60), mu = c(0, 0, 0), sigma = c(1, 2, 4))
test_norm(R = 10000, N = c(90, 60, 30), mu = c(0, 0, 0), sigma = c(1, 2, 4))
	## ANOVA and Kruskal-Wallis test for data with unequal variances

	library(ggplot2)
	library(purrr)
	library(dplyr)

	# Normal distribution
	test_norm <- function(R = 2000, N = c(30, 30, 30),
	mu = c(0, 0, 0), sigma = c(1, 1, 1)) {
	y <- purrr::map(seq_len(R), function(i) {
	x <- c(rnorm(N[1], mean = mu[1], sd = sigma[1]),
	rnorm(N[2], mean = mu[2], sd = sigma[2]),
	rnorm(N[3], mean = mu[3], sd = sigma[3]))
	g <- c(rep("g1", N[1]), rep("g2", N[2]), rep("g3", N[3]))
	d <- data.frame(g, x)
	an <- summary(aov(x ~ g, data = d))[[1]]["Pr(>F)"][1, ]
	ks <- kruskal.test(x ~ g, data = d)$p.value
	c(an, ks)
	})
	df <- data.frame(p_value = purrr::simplify(y),
	method = rep(c("ANOVA", "Kruskal-Wallis"), R))
	plot <- ggplot(df, aes(p_value, fill = method)) +
	geom_histogram(binwidth = 0.05, boundary = -0.5,
	position = "dodge")
	print(plot)
	summary <- df %>%
	dplyr::mutate(lt5 = if_else(p_value < 0.05, 1 / (n() / 2), 0)) %>%
	dplyr::group_by(method) %>%
	dplyr::summarize_at("lt5", sum)
	print(summary)
	}

	set.seed(20180415)
	test_norm(R = 10000, N = c(15, 30, 45), mu = c(0, 0, 0), sigma = c(1, 1, 1))
	test_norm(R = 10000, N = c(30, 30, 30), mu = c(0, 0, 0), sigma = c(1, 1, 1))
	test_norm(R = 10000, N = c(45, 30, 15), mu = c(0, 0, 0), sigma = c(1, 1, 1))

	set.seed(20180416)
	test_norm(R = 10000, N = c(15, 30, 45), mu = c(0, 0, 0), sigma = c(1, 2, 4))
	test_norm(R = 10000, N = c(30, 30, 30), mu = c(0, 0, 0), sigma = c(1, 2, 4))
	test_norm(R = 10000, N = c(45, 30, 15), mu = c(0, 0, 0), sigma = c(1, 2, 4))

	set.seed(20180417)
	test_norm(R = 10000, N = c(30, 60, 90), mu = c(0, 0, 0), sigma = c(1, 2, 4))
	test_norm(R = 10000, N = c(60, 60, 60), mu = c(0, 0, 0), sigma = c(1, 2, 4))
	test_norm(R = 10000, N = c(90, 60, 30), mu = c(0, 0, 0), sigma = c(1, 2, 4))