Skip to content

Instantly share code, notes, and snippets.

@t-redactyl
Last active September 27, 2017 05:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save t-redactyl/5ffef9200ea51cc81510 to your computer and use it in GitHub Desktop.
Save t-redactyl/5ffef9200ea51cc81510 to your computer and use it in GitHub Desktop.
require(ggplot2); require(gridExtra)
# Set the colours for the graphs
barfill <- "#4271AE"
barlines <- "#1F3552"
line1 <- "black"
line2 <- "#FF3721"
# Plotting histogram of sample of daily page views
g1 <- ggplot(data=as.data.frame(sample), aes(sample)) +
geom_histogram(aes(y = ..density..), binwidth = 6,
col = barlines,
fill = barfill) +
xlab("Number of page views per day") +
ylab("Density") +
theme_bw() +
ggtitle("Sample distribution of page views over 30 days") +
theme(plot.title = element_text(lineheight=.8, face="bold"))
print(g1)
# Plotting a histogram with the +/- 1 and 2 standard error intervals.
sem1 <- data.frame(SEMs="+/- 1 SEM",
vals = c(mean(mn_vector) - sd(mn_vector), mean(mn_vector) + sd(mn_vector)))
sem2 <- data.frame(SEMs="+/- 2 SEMs",
vals = c(mean(mn_vector) - 2 * sd(mn_vector), mean(mn_vector) + 2 * sd(mn_vector)))
sems <- rbind(sem1, sem2)
g1 <- ggplot(data=as.data.frame(mn_vector), aes(mn_vector)) +
geom_histogram(aes(y = ..density..), binwidth = 1,
col = barlines,
fill = barfill) +
xlab("Mean of each resample") +
ylab("Density") +
theme_bw() +
ggtitle("Distribution of Means of 1,000 Resamples") +
theme(plot.title = element_text(lineheight=.8, face="bold")) +
geom_vline(data=sems, aes(xintercept=vals, linetype = SEMs,
colour = SEMs), size = 1, show_guide = TRUE) +
scale_color_manual(values=c("+/- 1 SEM" = line1,
"+/- 2 SEMs" = line2))
print(g1)
# Setting seed
set.seed(567)
# Generate the mean of each resample and store in a vector, and store each resample in a dataframe
mn_vector <- NULL
resample_frame <- data.frame(row.names = seq(from = 1, to = 30, by = 1))
for (i in 1 : 1000) {
s <- sample(sample, 30, replace = TRUE)
resample_frame <- cbind(resample_frame, s)
mn_vector <- c(mn_vector, mean(s))
}
# Name the columns in the resample dataframe
names(resample_frame) <- paste0("n", seq(from = 1, to = 1000, by = 1))
# Setting seed and drawing non-Poisson sample
set.seed(567)
sample_np <- c(rpois(10, lambda = 220), runif(20, min = 180, max = 245))
# Generate the mean of each resample and store in a vector
mn_vector_np <- NULL
for (i in 1 : 1000) {
s <- sample(sample_np, 30, replace = TRUE)
mn_vector_np <- c(mn_vector_np, mean(s))
}
# Generate bootstrapped SEM
b_sem <- sd(mn_vector_np)
# Generate formula-derived SEM
f_sem <- sqrt(mean(sample_np) / 30)
# Defining lambda and n
lambda <- mean(sample)
n <- 30
# Calculating SEM
sem <- sqrt(lambda / n)
mean1 <- data.frame(Means="Population mean", vals = 220)
mean2 <- data.frame(Means="Sample mean", vals = mean(resample_frame$n1))
means <- rbind(mean1, mean2)
g1 <- ggplot(data=resample_frame, aes(resample_frame$n1)) +
geom_histogram(aes(y = ..density..),
binwidth = 5, fill = barfill, colour = barlines) +
xlab("Daily page views") +
ylab("Density") +
theme_bw() +
ggtitle("Sample 1") +
theme(plot.title = element_text(lineheight=1.1, face="bold")) +
geom_vline(data=means, aes(xintercept=vals, linetype = Means,
colour = Means), size = 1, show_guide = TRUE) +
scale_color_manual(values=c("Population mean" = line1, "Sample mean" = line2))
mean1 <- data.frame(Means="Population mean", vals = 220)
mean2 <- data.frame(Means="Sample mean", vals = mean(resample_frame$n2))
means <- rbind(mean1, mean2)
g2 <- ggplot(data=resample_frame, aes(resample_frame$n2)) +
geom_histogram(aes(y = ..density..),
binwidth = 5, fill = barfill, colour = barlines) +
xlab("Daily page views") +
ylab("Density") +
theme_bw() +
ggtitle("Sample 2") +
theme(plot.title = element_text(lineheight=1.1, face="bold")) +
geom_vline(data=means, aes(xintercept=vals, linetype = Means,
colour = Means), size = 1, show_guide = TRUE) +
scale_color_manual(values=c("Population mean" = line1, "Sample mean" = line2))
grid.arrange(g1, g2, nrow = 1, ncol = 2)
# Setting seed and generating a single sample
set.seed(567)
sample <- rpois(30, lambda = 220)
# Plotting a histogram with the +/- 1 standard error intervals.
sem1 <- data.frame(SEMs="+/- 1 SEM (bootstrap)",
vals = c(mean(mn_vector_np) - b_sem, mean(mn_vector_np) + b_sem))
sem2 <- data.frame(SEMs="+/- 1 SEM (formula)",
vals = c(mean(mn_vector_np) - f_sem, mean(mn_vector_np) + f_sem))
sems <- rbind(sem1, sem2)
g1 <- ggplot(data=as.data.frame(mn_vector_np), aes(mn_vector_np)) +
geom_histogram(aes(y = ..density..), binwidth = 1,
col = barlines,
fill = barfill) +
xlab("Mean of each resample") +
ylab("Density") +
theme_bw() +
ggtitle("Distribution of Means of 1,000 Resamples") +
theme(plot.title = element_text(lineheight=.8, face="bold")) +
geom_vline(data=sems, aes(xintercept=vals, linetype = SEMs,
colour = SEMs), size = 1, show_guide = TRUE) +
scale_color_manual(values=c("+/- 1 SEM (bootstrap)" = line1,
"+/- 1 SEM (formula)" = line2))
print(g1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment