tomhopper/central_limit_theorem.R

## central_limit_theorem.R
## Demonstration of central limit theorem
## Based on code in an anonymous comment to the blog post at \url{https://sas-and-r.blogspot.com/2012/01/example-919-demonstrating-central-limit.html}

## Libraries ####
library(nortest)
library(dplyr)
library(ggplot2)

## Data used ####
# right-triangle distribution (peak at 0; minimum at 1)
data_vec <- rbeta(10000, shape1 = 1, shape2 = 2);

## Function ####
#' @title Multiple comparisons of a given data vector to the normal distribution
#' @param data_vec Data to sample for calculating means
#' @param sample_size Sample size to use for calculating means
#' @return A number representing the proportion of p-values < 0.05
#' @description Given a data vector and a sample size, computes 200 means
#'   based on \code{sample_size}, then compares the means to the normal distribution
#'  on the hypothesis H0: data_vec is normal using \code{sf.test()} from
#'  \code{library(nortest)}. Finally, computes and returns the fraction of p-values
#'  that reject the null hypothesis. Given a non-normal distribution, the Central
#'  Limit Theorem tells us that this proportion
#'  should decrease as \code{sample_size} increases.
clt <- function(data_vec, sample_size) {

  means<-array(0, 200)
  pvals <- array(0, 200)

  for(i in 1:200) {

    for(j in i:200) { means[j] <- mean(sample(data, sample_size)) }
    pvals[i] <- sf.test(means)$p.val
  }
  length(pvals[pvals<0.05])/200
}

## Run \code{clt()} multiple times for a range of sample sizes. ####
sample_size <- c(rep(2, times = 100),
                 rep(10, times = 100),
                 rep(100, times = 100),
                 rep(1000, times = 100))

prop_pvalue <- data_frame(size = sample_size,
                          pvalue = rep(NA, length(sample_size)))
for(x in 1:length(sample_size)) {
  prop_pvalue[x, 2] <- clt(data = data_vec, sample_size = sample_size[x])
}

## Graph results ####
prop_pvalue %>%
  mutate(size = as.factor(size)) %>%
  ggplot(aes(x = size, y = pvalue)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = "Central Limit Theorem",
       x = expression(paste("Sample size, ", italic("n"))),
       y = "Proportion of non-normal distributions") +
  theme_minimal()
	## Demonstration of central limit theorem
	## Based on code in an anonymous comment to the blog post at \url{https://sas-and-r.blogspot.com/2012/01/example-919-demonstrating-central-limit.html}

	## Libraries ####
	library(nortest)
	library(dplyr)
	library(ggplot2)

	## Data used ####
	# right-triangle distribution (peak at 0; minimum at 1)
	data_vec <- rbeta(10000, shape1 = 1, shape2 = 2);

	## Function ####
	#' @title Multiple comparisons of a given data vector to the normal distribution
	#' @param data_vec Data to sample for calculating means
	#' @param sample_size Sample size to use for calculating means
	#' @return A number representing the proportion of p-values < 0.05
	#' @description Given a data vector and a sample size, computes 200 means
	#' based on \code{sample_size}, then compares the means to the normal distribution
	#' on the hypothesis H0: data_vec is normal using \code{sf.test()} from
	#' \code{library(nortest)}. Finally, computes and returns the fraction of p-values
	#' that reject the null hypothesis. Given a non-normal distribution, the Central
	#' Limit Theorem tells us that this proportion
	#' should decrease as \code{sample_size} increases.
	clt <- function(data_vec, sample_size) {

	means<-array(0, 200)
	pvals <- array(0, 200)

	for(i in 1:200) {

	for(j in i:200) { means[j] <- mean(sample(data, sample_size)) }
	pvals[i] <- sf.test(means)$p.val
	}
	length(pvals[pvals<0.05])/200
	}

	## Run \code{clt()} multiple times for a range of sample sizes. ####
	sample_size <- c(rep(2, times = 100),
	rep(10, times = 100),
	rep(100, times = 100),
	rep(1000, times = 100))

	prop_pvalue <- data_frame(size = sample_size,
	pvalue = rep(NA, length(sample_size)))
	for(x in 1:length(sample_size)) {
	prop_pvalue[x, 2] <- clt(data = data_vec, sample_size = sample_size[x])
	}

	## Graph results ####
	prop_pvalue %>%
	mutate(size = as.factor(size)) %>%
	ggplot(aes(x = size, y = pvalue)) +
	geom_boxplot() +
	coord_flip() +
	labs(title = "Central Limit Theorem",
	x = expression(paste("Sample size, ", italic("n"))),
	y = "Proportion of non-normal distributions") +
	theme_minimal()