t-redactyl/bootstrap_plot_1.R

## bootstrap_plot_1.R
require(ggplot2); require(gridExtra)

# Set the colours for the graphs
barfill <- "#4271AE"
barlines <- "#1F3552"
line1 <- "black"
line2 <- "#FF3721"

# Plotting histogram of sample of daily page views
g1 <- ggplot(data=as.data.frame(sample), aes(sample)) +
        geom_histogram(aes(y = ..density..), binwidth = 6,
                 col = barlines,
                 fill = barfill) +
        xlab("Number of page views per day") +
        ylab("Density") +
        theme_bw() +
        ggtitle("Sample distribution of page views over 30 days") +
        theme(plot.title = element_text(lineheight=.8, face="bold"))

print(g1)

## bootstrapped_standard_errors.R
# Plotting a histogram with the +/- 1 and 2 standard error intervals.
sem1 <- data.frame(SEMs="+/- 1 SEM",
                   vals = c(mean(mn_vector) - sd(mn_vector), mean(mn_vector) + sd(mn_vector)))
sem2 <- data.frame(SEMs="+/- 2 SEMs",
                    vals = c(mean(mn_vector) - 2 * sd(mn_vector), mean(mn_vector) + 2 * sd(mn_vector)))
sems <- rbind(sem1, sem2)

g1 <- ggplot(data=as.data.frame(mn_vector), aes(mn_vector)) +
        geom_histogram(aes(y = ..density..), binwidth = 1,
                 col = barlines,
                 fill = barfill) +
        xlab("Mean of each resample") +
        ylab("Density") +
        theme_bw() +
        ggtitle("Distribution of Means of 1,000 Resamples") +
        theme(plot.title = element_text(lineheight=.8, face="bold")) +
        geom_vline(data=sems, aes(xintercept=vals, linetype = SEMs,
                            colour = SEMs), size = 1, show_guide = TRUE) +
        scale_color_manual(values=c("+/- 1 SEM" = line1,
                                    "+/- 2 SEMs" = line2))

print(g1)

## bootstrapping_resamples_from_original_sample.R
# Setting seed
set.seed(567)

# Generate the mean of each resample and store in a vector, and store each resample in a dataframe
mn_vector <- NULL
resample_frame <- data.frame(row.names = seq(from = 1, to = 30, by = 1))
for (i in 1 : 1000) {
    s <- sample(sample, 30, replace = TRUE)
    resample_frame <- cbind(resample_frame, s)
    mn_vector <- c(mn_vector, mean(s))
}

# Name the columns in the resample dataframe
names(resample_frame) <- paste0("n", seq(from = 1, to = 1000, by = 1))

## calculating_bootstrapped_and_formula_sem.R
# Setting seed and drawing non-Poisson sample
set.seed(567)
sample_np <- c(rpois(10, lambda = 220), runif(20, min = 180, max = 245))

# Generate the mean of each resample and store in a vector
mn_vector_np <- NULL
for (i in 1 : 1000) {
    s <- sample(sample_np, 30, replace = TRUE)
    mn_vector_np <- c(mn_vector_np, mean(s))
}

# Generate bootstrapped SEM
b_sem <- sd(mn_vector_np)

# Generate formula-derived SEM
f_sem <- sqrt(mean(sample_np) / 30)

## calculating_sem_formula.R
# Defining lambda and n
lambda <- mean(sample)
n <- 30

# Calculating SEM
sem <- sqrt(lambda / n)

## comparing_bootstrapped_resamples.R
mean1 <- data.frame(Means="Population mean", vals = 220)
mean2 <- data.frame(Means="Sample mean", vals = mean(resample_frame$n1))
means <- rbind(mean1, mean2)

g1 <- ggplot(data=resample_frame, aes(resample_frame$n1)) +
        geom_histogram(aes(y = ..density..),
                       binwidth = 5, fill = barfill, colour = barlines) +
        xlab("Daily page views") +
        ylab("Density") +
        theme_bw() +
        ggtitle("Sample 1") +
        theme(plot.title = element_text(lineheight=1.1, face="bold")) +
        geom_vline(data=means, aes(xintercept=vals, linetype = Means,
                             colour = Means), size = 1, show_guide = TRUE) +
         scale_color_manual(values=c("Population mean" = line1, "Sample mean" = line2))

mean1 <- data.frame(Means="Population mean", vals = 220)
mean2 <- data.frame(Means="Sample mean", vals = mean(resample_frame$n2))
means <- rbind(mean1, mean2)

g2 <- ggplot(data=resample_frame, aes(resample_frame$n2)) +
        geom_histogram(aes(y = ..density..),
                       binwidth = 5, fill = barfill, colour = barlines) +
        xlab("Daily page views") +
        ylab("Density") +
        theme_bw() +
        ggtitle("Sample 2") +
        theme(plot.title = element_text(lineheight=1.1, face="bold")) +
        geom_vline(data=means, aes(xintercept=vals, linetype = Means,
                             colour = Means), size = 1, show_guide = TRUE) +
        scale_color_manual(values=c("Population mean" = line1, "Sample mean" = line2))

grid.arrange(g1, g2, nrow = 1, ncol = 2)

## generating_original_sample.R
# Setting seed and generating a single sample
set.seed(567)
sample <- rpois(30, lambda = 220)

## plotting_bootstrapped_and_formula_sems.R
# Plotting a histogram with the +/- 1 standard error intervals.
sem1 <- data.frame(SEMs="+/- 1 SEM (bootstrap)",
                   vals = c(mean(mn_vector_np) - b_sem, mean(mn_vector_np) + b_sem))
sem2 <- data.frame(SEMs="+/- 1 SEM (formula)",
                    vals = c(mean(mn_vector_np) - f_sem, mean(mn_vector_np) + f_sem))
sems <- rbind(sem1, sem2)

g1 <- ggplot(data=as.data.frame(mn_vector_np), aes(mn_vector_np)) +
        geom_histogram(aes(y = ..density..), binwidth = 1,
                 col = barlines,
                 fill = barfill) +
        xlab("Mean of each resample") +
        ylab("Density") +
        theme_bw() +
        ggtitle("Distribution of Means of 1,000 Resamples") +
        theme(plot.title = element_text(lineheight=.8, face="bold")) +
        geom_vline(data=sems, aes(xintercept=vals, linetype = SEMs,
                            colour = SEMs), size = 1, show_guide = TRUE) +
        scale_color_manual(values=c("+/- 1 SEM (bootstrap)" = line1,
                                    "+/- 1 SEM (formula)" = line2))

print(g1)
	require(ggplot2); require(gridExtra)

	# Set the colours for the graphs
	barfill <- "#4271AE"
	barlines <- "#1F3552"
	line1 <- "black"
	line2 <- "#FF3721"

	# Plotting histogram of sample of daily page views
	g1 <- ggplot(data=as.data.frame(sample), aes(sample)) +
	geom_histogram(aes(y = ..density..), binwidth = 6,
	col = barlines,
	fill = barfill) +
	xlab("Number of page views per day") +
	ylab("Density") +
	theme_bw() +
	ggtitle("Sample distribution of page views over 30 days") +
	theme(plot.title = element_text(lineheight=.8, face="bold"))

	print(g1)
	# Plotting a histogram with the +/- 1 and 2 standard error intervals.
	sem1 <- data.frame(SEMs="+/- 1 SEM",
	vals = c(mean(mn_vector) - sd(mn_vector), mean(mn_vector) + sd(mn_vector)))
	sem2 <- data.frame(SEMs="+/- 2 SEMs",
	vals = c(mean(mn_vector) - 2 * sd(mn_vector), mean(mn_vector) + 2 * sd(mn_vector)))
	sems <- rbind(sem1, sem2)

	g1 <- ggplot(data=as.data.frame(mn_vector), aes(mn_vector)) +
	geom_histogram(aes(y = ..density..), binwidth = 1,
	col = barlines,
	fill = barfill) +
	xlab("Mean of each resample") +
	ylab("Density") +
	theme_bw() +
	ggtitle("Distribution of Means of 1,000 Resamples") +
	theme(plot.title = element_text(lineheight=.8, face="bold")) +
	geom_vline(data=sems, aes(xintercept=vals, linetype = SEMs,
	colour = SEMs), size = 1, show_guide = TRUE) +
	scale_color_manual(values=c("+/- 1 SEM" = line1,
	"+/- 2 SEMs" = line2))

	print(g1)
	# Setting seed
	set.seed(567)

	# Generate the mean of each resample and store in a vector, and store each resample in a dataframe
	mn_vector <- NULL
	resample_frame <- data.frame(row.names = seq(from = 1, to = 30, by = 1))
	for (i in 1 : 1000) {
	s <- sample(sample, 30, replace = TRUE)
	resample_frame <- cbind(resample_frame, s)
	mn_vector <- c(mn_vector, mean(s))
	}

	# Name the columns in the resample dataframe
	names(resample_frame) <- paste0("n", seq(from = 1, to = 1000, by = 1))
	# Setting seed and drawing non-Poisson sample
	set.seed(567)
	sample_np <- c(rpois(10, lambda = 220), runif(20, min = 180, max = 245))

	# Generate the mean of each resample and store in a vector
	mn_vector_np <- NULL
	for (i in 1 : 1000) {
	s <- sample(sample_np, 30, replace = TRUE)
	mn_vector_np <- c(mn_vector_np, mean(s))
	}

	# Generate bootstrapped SEM
	b_sem <- sd(mn_vector_np)

	# Generate formula-derived SEM
	f_sem <- sqrt(mean(sample_np) / 30)
	# Defining lambda and n
	lambda <- mean(sample)
	n <- 30

	# Calculating SEM
	sem <- sqrt(lambda / n)
	mean1 <- data.frame(Means="Population mean", vals = 220)
	mean2 <- data.frame(Means="Sample mean", vals = mean(resample_frame$n1))
	means <- rbind(mean1, mean2)

	g1 <- ggplot(data=resample_frame, aes(resample_frame$n1)) +
	geom_histogram(aes(y = ..density..),
	binwidth = 5, fill = barfill, colour = barlines) +
	xlab("Daily page views") +
	ylab("Density") +
	theme_bw() +
	ggtitle("Sample 1") +
	theme(plot.title = element_text(lineheight=1.1, face="bold")) +
	geom_vline(data=means, aes(xintercept=vals, linetype = Means,
	colour = Means), size = 1, show_guide = TRUE) +
	scale_color_manual(values=c("Population mean" = line1, "Sample mean" = line2))

	mean1 <- data.frame(Means="Population mean", vals = 220)
	mean2 <- data.frame(Means="Sample mean", vals = mean(resample_frame$n2))
	means <- rbind(mean1, mean2)

	g2 <- ggplot(data=resample_frame, aes(resample_frame$n2)) +
	geom_histogram(aes(y = ..density..),
	binwidth = 5, fill = barfill, colour = barlines) +
	xlab("Daily page views") +
	ylab("Density") +
	theme_bw() +
	ggtitle("Sample 2") +
	theme(plot.title = element_text(lineheight=1.1, face="bold")) +
	geom_vline(data=means, aes(xintercept=vals, linetype = Means,
	colour = Means), size = 1, show_guide = TRUE) +
	scale_color_manual(values=c("Population mean" = line1, "Sample mean" = line2))

	grid.arrange(g1, g2, nrow = 1, ncol = 2)
	# Setting seed and generating a single sample
	set.seed(567)
	sample <- rpois(30, lambda = 220)
	# Plotting a histogram with the +/- 1 standard error intervals.
	sem1 <- data.frame(SEMs="+/- 1 SEM (bootstrap)",
	vals = c(mean(mn_vector_np) - b_sem, mean(mn_vector_np) + b_sem))
	sem2 <- data.frame(SEMs="+/- 1 SEM (formula)",
	vals = c(mean(mn_vector_np) - f_sem, mean(mn_vector_np) + f_sem))
	sems <- rbind(sem1, sem2)

	g1 <- ggplot(data=as.data.frame(mn_vector_np), aes(mn_vector_np)) +
	geom_histogram(aes(y = ..density..), binwidth = 1,
	col = barlines,
	fill = barfill) +
	xlab("Mean of each resample") +
	ylab("Density") +
	theme_bw() +
	ggtitle("Distribution of Means of 1,000 Resamples") +
	theme(plot.title = element_text(lineheight=.8, face="bold")) +
	geom_vline(data=sems, aes(xintercept=vals, linetype = SEMs,
	colour = SEMs), size = 1, show_guide = TRUE) +
	scale_color_manual(values=c("+/- 1 SEM (bootstrap)" = line1,
	"+/- 1 SEM (formula)" = line2))

	print(g1)