Last active
June 23, 2022 13:41
-
-
Save graebnerc/822ddd3e0d316aae9dda7b5afbe11685 to your computer and use it in GitHub Desktop.
Lecture script and solutions for T14 on sampling
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Lecture script and solutions for T14 on sampling. Note that we did not do the exercises in class, treat them as additional info. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
here::i_am("R/T14-Exercise-1-Solution.R") | |
library(here) | |
library(tibble) | |
library(dplyr) | |
library(purrr) | |
library(ggplot2) | |
library(icaeDesign) | |
# remotes::install_github("graebnerc/DataScienceExercises") | |
library(DataScienceExercises) | |
reps <- 1000 | |
student_pop <- DataScienceExercises::EUFstudents | |
# Alternative 1: draw samples using purrr------------- | |
set.seed(123) | |
samp_size10 <- map_dbl( | |
.x = seq_len(reps), | |
.f = ~mean(sample(x = student_pop[["Height"]], | |
size = 10, | |
replace = FALSE)) | |
) | |
samp_size50 <- map_dbl( | |
.x = seq_len(reps), | |
.f = ~mean(sample(x = student_pop[["Height"]], | |
size = 50, | |
replace = FALSE))) | |
full_samples <- rbind( | |
tibble("samplesize"="Sample size: 10", "mean"=samp_size10), | |
tibble("samplesize"="Sample size: 50", "mean"=samp_size50) | |
) | |
# Alternative 2: draw samples using for loop---------- | |
set.seed(123) | |
samp_size10_loop <- rep(NA, reps) | |
samp_size50_loop <- rep(NA, reps) | |
for (i in seq_len(reps)){ | |
sample_small <- sample( | |
x = student_pop[["Height"]], | |
size = 10, replace = FALSE) | |
sample_small_mean <- mean(sample_small) | |
samp_size10_loop[i] <- sample_small_mean | |
sample_large <- sample( | |
x = student_pop[["Height"]], | |
size = 10, replace = FALSE) | |
sample_large_mean <- mean(sample_large) | |
samp_size50_loop[i] <- sample_large_mean | |
} | |
# Visualization------------------------ | |
sampling_plot <- ggplot(data = full_samples, aes_string(x="mean")) + | |
geom_histogram(alpha=0.5, color="#00395B", fill="#00395B") + | |
scale_y_continuous(expand = expansion(add = c(0, 10))) + | |
scale_x_continuous(limits = c(158, 178), expand = expansion()) + | |
facet_wrap(~samplesize, nrow = 2) + | |
labs( | |
x = "Sample means", | |
y = "Count", | |
title = "The sampling distributions") + | |
theme_icae() | |
sampling_plot | |
ggsave(plot = sampling_plot, | |
filename = here("output/T14/EufPopPlotSamples.pdf"), | |
width = 4, height = 3) | |
# Information about the population----- | |
true_vals <- student_pop %>% | |
group_by(Gender) %>% | |
summarise( | |
Mean=mean(Height), SD=sd(Height) | |
) | |
true_vals2 <- student_pop %>% | |
summarise( | |
Mean=mean(Height), SD=sd(Height) | |
) %>% | |
dplyr::mutate(Gender="total") %>% | |
dplyr::select(Gender, Mean, SD) | |
true_vals <- rbind(true_vals, true_vals2) | |
true_vals | |
sample_props <- full_samples %>% | |
group_by(samplesize) %>% | |
summarise(`Mean`=mean(mean), | |
`Variation`=sd(mean)) %>% | |
rename(` `=samplesize) | |
sample_props |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
here::i_am("R/T14-Exercise-2-Solution.R") | |
library(here) | |
library(purrr) | |
library(tibble) | |
library(data.table) | |
library(ggplot2) | |
library(ggpubr) | |
library(icaeDesign) | |
set.seed(123) | |
# Population with 5000 exponentially distributed values | |
N <- 5000 | |
exp_par <- 2 | |
# Visualization of the 'true population': | |
# MCS | |
iterations <- 500 | |
sample_sizes <- c(5, 10, 20, 50) | |
population <- rexp(n = N, rate = exp_par) | |
population_tab <- tibble("pop_n" = population) | |
true_mean <- mean(population) | |
mean_dist_list <- list() | |
for (i in seq_along(sample_sizes)){ | |
sample_size <- sample_sizes[i] | |
sample_means <- purrr::map_dbl( | |
.x = 1:iterations, | |
.f = ~mean(sample(population, size = sample_size, replace = FALSE))) | |
sample_means_tab <- tibble( | |
"sample_size"=sample_size, | |
"sample_means"=sample_means | |
) | |
mean_dist_list[[sample_size]] <- sample_means_tab | |
} | |
full_results <- rbindlist(mean_dist_list) | |
population_plot <- ggplot(data = population_tab, aes(x = pop_n)) + | |
geom_histogram(aes(y = ..density..)) + | |
scale_y_continuous(expand = expansion(add = c(0, 0.05))) + | |
labs(title = "The population", y = "Density") + | |
stat_function(fun = dexp, args = list(rate = exp_par)) + | |
theme_icae() + | |
theme(axis.title.x = element_blank()) | |
population_plot | |
hist_plot <- ggplot(data = full_results, aes(x=sample_means)) + | |
geom_histogram(aes(y=..density..), alpha=0.5, color="#00395B", fill="#00395B") + | |
scale_y_continuous(expand = expansion(add = c(0, 0.05))) + | |
scale_x_continuous(limits = c(-0.2, 1.2), expand = expansion()) + | |
facet_wrap(~sample_size, ncol = 4) + | |
stat_function( | |
data = dplyr::filter(full_results, sample_size==5), | |
fun = dnorm, | |
args = list( | |
mean = mean(dplyr::filter(full_results, sample_size==5)$sample_means), | |
sd = sd(dplyr::filter(full_results, sample_size==5)$sample_means)) | |
) + | |
stat_function( | |
data = dplyr::filter(full_results, sample_size==10), | |
fun = dnorm, | |
args = list( | |
mean = mean(dplyr::filter(full_results, sample_size==10)$sample_means), | |
sd = sd(dplyr::filter(full_results, sample_size==10)$sample_means)) | |
) + | |
stat_function( | |
data = dplyr::filter(full_results, sample_size==20), | |
fun = dnorm, | |
args = list( | |
mean = mean(dplyr::filter(full_results, sample_size==20)$sample_means), | |
sd = sd(dplyr::filter(full_results, sample_size==20)$sample_means)) | |
) + | |
stat_function( | |
data = dplyr::filter(full_results, sample_size==50), | |
fun = dnorm, | |
args = list( | |
mean = mean(dplyr::filter(full_results, sample_size==50)$sample_means), | |
sd = sd(dplyr::filter(full_results, sample_size==50)$sample_means)) | |
) + | |
labs(x = "Sample means", y = "Density", title = "The sampling distributions") + | |
theme_icae() | |
full_plot <- ggarrange(population_plot, hist_plot, nrow = 2) | |
ggsave(plot = full_plot, filename = here("output/T14/clt-example.pdf"), | |
width = 4, height = 3) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A for loop | |
for (i in seq_len(10)){ | |
print(i) | |
} | |
# Below we illustrate two techniques for iteration using a simple example: | |
# simulate throwing a dice | |
# 1st step: define the random process and define general parameters | |
nb_iterations <- 10 | |
sample(1:6, size = 1) | |
# 2nd step: iterate the process (variant: for-loop) | |
result_container <- rep(NA, nb_iterations) | |
set.seed(123) | |
for (i in seq_len(nb_iterations)){ | |
result_container[i] <- sample(1:6, size = 1) | |
} | |
result_container | |
result_tab <- tibble("draw"=factor(result_container, levels = 1:6)) | |
# 2nd step: iterate the process (variant: purrr package) | |
library(purrr) | |
set.seed(123) | |
result_container_purrr <- map_dbl( | |
.x = seq_len(nb_iterations), | |
.f = ~sample(1:6, size = 1) | |
) | |
result_container_purrr | |
# 3rd step: visualize and analyze the result | |
ggplot(data = result_tab, mapping = aes(x=draw)) + | |
geom_bar() + | |
scale_y_continuous(expand = expansion()) + | |
scale_x_discrete(drop=FALSE) + # To show also numbers that were not thrown | |
theme_bw() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(purrr) | |
library(tibble) | |
library(dplyr) | |
library(ggplot2) | |
# Create the ball pid population | |
ball_pid_size <- 5000 | |
ball_pid_share_white <- 0.65 | |
white_balls <- as.integer(ball_pid_share_white*ball_pid_size) | |
grey_balls <- ball_pid_size - white_balls | |
ball_pid_colors <- c(rep("white", white_balls), rep("grey", grey_balls)) | |
ball_pid <- tibble::tibble( | |
id = seq(1, ball_pid_size), | |
color = sample(ball_pid_colors) | |
) | |
# Sampling of the balls | |
# 1st step: define the random process and set general parameters | |
sample_size <- 20 | |
nb_iterations <- 400 | |
#' Function that takes a sample of balls and computes share of white balls | |
sample_balls <- function(bowl, sample_size, replace_balls=FALSE){ | |
sample_obtained <- sample( | |
x = bowl, size = sample_size, replace = replace_balls) | |
sum(sample_obtained == "white") / sample_size | |
} | |
# 2nd step: iterate the process | |
set.seed(123) | |
sampling_df <- tibble( | |
"sample_size" = sample_size, | |
"sample_id" = seq_len(nb_iterations), | |
"share_white" = map_dbl( | |
.x = seq_len(nb_iterations), | |
.f = ~sample_balls(ball_pid$color, sample_size = sample_size)) | |
) | |
# Alternative formulation using a for-loop: | |
result_container <- rep(NA, nb_iterations) | |
set.seed(123) | |
for (i in seq_len(nb_iterations)){ | |
# print(i) | |
sample_drawm <- sample( | |
x = ball_pid[["color"]], | |
size = sample_size, | |
replace = FALSE) | |
share_white <- sum(sample_drawm=="white") / length(sample_drawm) | |
share_white | |
result_container[i] <- share_white | |
} | |
# 3rd step: visualize and analyze the result | |
sample_plot <- ggplot(data = sampling_df, aes(x=share_white)) + | |
geom_histogram(binwidth = 0.02, fill="#00395B") + | |
theme_bw() | |
sample_plot | |
# Compare the effect of different sample sizes------------- | |
# 1. Simulate data for different sample sizes: | |
set.seed(123) | |
sample_size <- 10 | |
sampling_df_10 <- tibble( | |
"sample_size" = sample_size, | |
"share_white" = map_dbl( | |
.x = seq_len(nb_iterations), | |
.f = ~sample_balls(ball_pid$color, sample_size = sample_size)) | |
) | |
set.seed(123) | |
sample_size <- 50 | |
sampling_df_50 <- tibble( | |
"sample_size" = sample_size, | |
"share_white" = map_dbl( | |
.x = seq_len(nb_iterations), | |
.f = ~sample_balls(ball_pid$color, sample_size = sample_size)) | |
) | |
set.seed(123) | |
sample_size <- 500 | |
sampling_df_100 <- tibble( | |
"sample_size" = sample_size, | |
"share_white" = map_dbl( | |
.x = seq_len(nb_iterations), | |
.f = ~sample_balls(ball_pid$color, sample_size = sample_size)) | |
) | |
# 2. Analyse the results quantitatively: | |
# Merge the tibbles: | |
sample_df_full <- rbind(sampling_df_10, sampling_df_50, sampling_df_100) | |
# Compute mean and standard deviation: | |
true_white_share <- sum(ball_pid[["color"]]=="white") / nrow(ball_pid) | |
sample_df_full %>% | |
group_by(sample_size) %>% | |
summarise( | |
mean_share = mean(share_white), | |
diff_true_val = abs(mean_share - true_white_share), | |
sd_share = sd(share_white) | |
) | |
# Visualize the distributions: | |
ggplot(data = sample_df_full, mapping = aes(x=share_white)) + | |
geom_histogram(binwidth = 0.025) + | |
geom_vline(xintercept = true_white_share) + # to add a line with true value | |
scale_x_continuous(limits = c(0, 1.2), expand = expansion()) + | |
scale_y_continuous(expand = expansion(add = c(0, 10))) + | |
facet_wrap(~sample_size, ncol = 3) + | |
theme_bw() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment