Skip to content

Instantly share code, notes, and snippets.

@graebnerc
Last active June 23, 2022 13:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save graebnerc/822ddd3e0d316aae9dda7b5afbe11685 to your computer and use it in GitHub Desktop.
Save graebnerc/822ddd3e0d316aae9dda7b5afbe11685 to your computer and use it in GitHub Desktop.
Lecture script and solutions for T14 on sampling
Lecture script and solutions for T14 on sampling. Note that we did not do the exercises in class, treat them as additional info.
here::i_am("R/T14-Exercise-1-Solution.R")
library(here)
library(tibble)
library(dplyr)
library(purrr)
library(ggplot2)
library(icaeDesign)
# remotes::install_github("graebnerc/DataScienceExercises")
library(DataScienceExercises)
reps <- 1000
student_pop <- DataScienceExercises::EUFstudents
# Alternative 1: draw samples using purrr-------------
set.seed(123)
samp_size10 <- map_dbl(
.x = seq_len(reps),
.f = ~mean(sample(x = student_pop[["Height"]],
size = 10,
replace = FALSE))
)
samp_size50 <- map_dbl(
.x = seq_len(reps),
.f = ~mean(sample(x = student_pop[["Height"]],
size = 50,
replace = FALSE)))
full_samples <- rbind(
tibble("samplesize"="Sample size: 10", "mean"=samp_size10),
tibble("samplesize"="Sample size: 50", "mean"=samp_size50)
)
# Alternative 2: draw samples using for loop----------
set.seed(123)
samp_size10_loop <- rep(NA, reps)
samp_size50_loop <- rep(NA, reps)
for (i in seq_len(reps)){
sample_small <- sample(
x = student_pop[["Height"]],
size = 10, replace = FALSE)
sample_small_mean <- mean(sample_small)
samp_size10_loop[i] <- sample_small_mean
sample_large <- sample(
x = student_pop[["Height"]],
size = 10, replace = FALSE)
sample_large_mean <- mean(sample_large)
samp_size50_loop[i] <- sample_large_mean
}
# Visualization------------------------
sampling_plot <- ggplot(data = full_samples, aes_string(x="mean")) +
geom_histogram(alpha=0.5, color="#00395B", fill="#00395B") +
scale_y_continuous(expand = expansion(add = c(0, 10))) +
scale_x_continuous(limits = c(158, 178), expand = expansion()) +
facet_wrap(~samplesize, nrow = 2) +
labs(
x = "Sample means",
y = "Count",
title = "The sampling distributions") +
theme_icae()
sampling_plot
ggsave(plot = sampling_plot,
filename = here("output/T14/EufPopPlotSamples.pdf"),
width = 4, height = 3)
# Information about the population-----
true_vals <- student_pop %>%
group_by(Gender) %>%
summarise(
Mean=mean(Height), SD=sd(Height)
)
true_vals2 <- student_pop %>%
summarise(
Mean=mean(Height), SD=sd(Height)
) %>%
dplyr::mutate(Gender="total") %>%
dplyr::select(Gender, Mean, SD)
true_vals <- rbind(true_vals, true_vals2)
true_vals
sample_props <- full_samples %>%
group_by(samplesize) %>%
summarise(`Mean`=mean(mean),
`Variation`=sd(mean)) %>%
rename(` `=samplesize)
sample_props
here::i_am("R/T14-Exercise-2-Solution.R")
library(here)
library(purrr)
library(tibble)
library(data.table)
library(ggplot2)
library(ggpubr)
library(icaeDesign)
set.seed(123)
# Population with 5000 exponentially distributed values
N <- 5000
exp_par <- 2
# Visualization of the 'true population':
# MCS
iterations <- 500
sample_sizes <- c(5, 10, 20, 50)
population <- rexp(n = N, rate = exp_par)
population_tab <- tibble("pop_n" = population)
true_mean <- mean(population)
mean_dist_list <- list()
for (i in seq_along(sample_sizes)){
sample_size <- sample_sizes[i]
sample_means <- purrr::map_dbl(
.x = 1:iterations,
.f = ~mean(sample(population, size = sample_size, replace = FALSE)))
sample_means_tab <- tibble(
"sample_size"=sample_size,
"sample_means"=sample_means
)
mean_dist_list[[sample_size]] <- sample_means_tab
}
full_results <- rbindlist(mean_dist_list)
population_plot <- ggplot(data = population_tab, aes(x = pop_n)) +
geom_histogram(aes(y = ..density..)) +
scale_y_continuous(expand = expansion(add = c(0, 0.05))) +
labs(title = "The population", y = "Density") +
stat_function(fun = dexp, args = list(rate = exp_par)) +
theme_icae() +
theme(axis.title.x = element_blank())
population_plot
hist_plot <- ggplot(data = full_results, aes(x=sample_means)) +
geom_histogram(aes(y=..density..), alpha=0.5, color="#00395B", fill="#00395B") +
scale_y_continuous(expand = expansion(add = c(0, 0.05))) +
scale_x_continuous(limits = c(-0.2, 1.2), expand = expansion()) +
facet_wrap(~sample_size, ncol = 4) +
stat_function(
data = dplyr::filter(full_results, sample_size==5),
fun = dnorm,
args = list(
mean = mean(dplyr::filter(full_results, sample_size==5)$sample_means),
sd = sd(dplyr::filter(full_results, sample_size==5)$sample_means))
) +
stat_function(
data = dplyr::filter(full_results, sample_size==10),
fun = dnorm,
args = list(
mean = mean(dplyr::filter(full_results, sample_size==10)$sample_means),
sd = sd(dplyr::filter(full_results, sample_size==10)$sample_means))
) +
stat_function(
data = dplyr::filter(full_results, sample_size==20),
fun = dnorm,
args = list(
mean = mean(dplyr::filter(full_results, sample_size==20)$sample_means),
sd = sd(dplyr::filter(full_results, sample_size==20)$sample_means))
) +
stat_function(
data = dplyr::filter(full_results, sample_size==50),
fun = dnorm,
args = list(
mean = mean(dplyr::filter(full_results, sample_size==50)$sample_means),
sd = sd(dplyr::filter(full_results, sample_size==50)$sample_means))
) +
labs(x = "Sample means", y = "Density", title = "The sampling distributions") +
theme_icae()
full_plot <- ggarrange(population_plot, hist_plot, nrow = 2)
ggsave(plot = full_plot, filename = here("output/T14/clt-example.pdf"),
width = 4, height = 3)
# A for loop
for (i in seq_len(10)){
print(i)
}
# Below we illustrate two techniques for iteration using a simple example:
# simulate throwing a dice
# 1st step: define the random process and define general parameters
nb_iterations <- 10
sample(1:6, size = 1)
# 2nd step: iterate the process (variant: for-loop)
result_container <- rep(NA, nb_iterations)
set.seed(123)
for (i in seq_len(nb_iterations)){
result_container[i] <- sample(1:6, size = 1)
}
result_container
result_tab <- tibble("draw"=factor(result_container, levels = 1:6))
# 2nd step: iterate the process (variant: purrr package)
library(purrr)
set.seed(123)
result_container_purrr <- map_dbl(
.x = seq_len(nb_iterations),
.f = ~sample(1:6, size = 1)
)
result_container_purrr
# 3rd step: visualize and analyze the result
ggplot(data = result_tab, mapping = aes(x=draw)) +
geom_bar() +
scale_y_continuous(expand = expansion()) +
scale_x_discrete(drop=FALSE) + # To show also numbers that were not thrown
theme_bw()
library(purrr)
library(tibble)
library(dplyr)
library(ggplot2)
# Create the ball pid population
ball_pid_size <- 5000
ball_pid_share_white <- 0.65
white_balls <- as.integer(ball_pid_share_white*ball_pid_size)
grey_balls <- ball_pid_size - white_balls
ball_pid_colors <- c(rep("white", white_balls), rep("grey", grey_balls))
ball_pid <- tibble::tibble(
id = seq(1, ball_pid_size),
color = sample(ball_pid_colors)
)
# Sampling of the balls
# 1st step: define the random process and set general parameters
sample_size <- 20
nb_iterations <- 400
#' Function that takes a sample of balls and computes share of white balls
sample_balls <- function(bowl, sample_size, replace_balls=FALSE){
sample_obtained <- sample(
x = bowl, size = sample_size, replace = replace_balls)
sum(sample_obtained == "white") / sample_size
}
# 2nd step: iterate the process
set.seed(123)
sampling_df <- tibble(
"sample_size" = sample_size,
"sample_id" = seq_len(nb_iterations),
"share_white" = map_dbl(
.x = seq_len(nb_iterations),
.f = ~sample_balls(ball_pid$color, sample_size = sample_size))
)
# Alternative formulation using a for-loop:
result_container <- rep(NA, nb_iterations)
set.seed(123)
for (i in seq_len(nb_iterations)){
# print(i)
sample_drawm <- sample(
x = ball_pid[["color"]],
size = sample_size,
replace = FALSE)
share_white <- sum(sample_drawm=="white") / length(sample_drawm)
share_white
result_container[i] <- share_white
}
# 3rd step: visualize and analyze the result
sample_plot <- ggplot(data = sampling_df, aes(x=share_white)) +
geom_histogram(binwidth = 0.02, fill="#00395B") +
theme_bw()
sample_plot
# Compare the effect of different sample sizes-------------
# 1. Simulate data for different sample sizes:
set.seed(123)
sample_size <- 10
sampling_df_10 <- tibble(
"sample_size" = sample_size,
"share_white" = map_dbl(
.x = seq_len(nb_iterations),
.f = ~sample_balls(ball_pid$color, sample_size = sample_size))
)
set.seed(123)
sample_size <- 50
sampling_df_50 <- tibble(
"sample_size" = sample_size,
"share_white" = map_dbl(
.x = seq_len(nb_iterations),
.f = ~sample_balls(ball_pid$color, sample_size = sample_size))
)
set.seed(123)
sample_size <- 500
sampling_df_100 <- tibble(
"sample_size" = sample_size,
"share_white" = map_dbl(
.x = seq_len(nb_iterations),
.f = ~sample_balls(ball_pid$color, sample_size = sample_size))
)
# 2. Analyse the results quantitatively:
# Merge the tibbles:
sample_df_full <- rbind(sampling_df_10, sampling_df_50, sampling_df_100)
# Compute mean and standard deviation:
true_white_share <- sum(ball_pid[["color"]]=="white") / nrow(ball_pid)
sample_df_full %>%
group_by(sample_size) %>%
summarise(
mean_share = mean(share_white),
diff_true_val = abs(mean_share - true_white_share),
sd_share = sd(share_white)
)
# Visualize the distributions:
ggplot(data = sample_df_full, mapping = aes(x=share_white)) +
geom_histogram(binwidth = 0.025) +
geom_vline(xintercept = true_white_share) + # to add a line with true value
scale_x_continuous(limits = c(0, 1.2), expand = expansion()) +
scale_y_continuous(expand = expansion(add = c(0, 10))) +
facet_wrap(~sample_size, ncol = 3) +
theme_bw()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment