Last active
June 2, 2023 09:32
-
-
Save graebnerc/ad69c3d843b8e20da23f99752082ae43 to your computer and use it in GitHub Desktop.
The script used during session 14 on sampling theory, as well as possible solutions to the exercises.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This is the script we used during the lecture. For a version that is better documented but essentially covers the same issues, see the tutorial on Monte Carlo Simulations: | |
https://euf-datascience-spring23.netlify.app/tutorial/mcs/pubdir/onlinecontent.html |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
here::i_am("CentralLimit.R") | |
library(here) | |
library(purrr) | |
library(tibble) | |
library(data.table) | |
library(ggplot2) | |
library(ggpubr) | |
library(icaeDesign) | |
set.seed(123) | |
# Population with 5000 exponentially distributed values | |
N <- 5000 | |
exp_par <- 2 | |
population <- rexp(n = N, rate = exp_par) | |
population_tab <- tibble("pop_n" = population) | |
true_mean <- mean(population) | |
population_plot <- ggplot(data = population_tab, aes(x = pop_n)) + | |
geom_histogram(aes(y = after_stat(density))) + | |
scale_y_continuous(expand = expansion(add = c(0, 0.05))) + | |
labs(title = "The population", y = "Density") + | |
theme_icae() + | |
theme(axis.title.x = element_blank()) | |
population_plot | |
# Visualization of the 'true population': | |
# MCS | |
iterations <- 500 | |
sample_sizes <- c(5, 10, 20, 50) | |
mean_dist_list <- list() | |
for (i in seq_along(sample_sizes)){ | |
sample_size <- sample_sizes[i] | |
sample_means <- rep(NA, iterations) | |
for (j in seq_len(iterations)){ | |
sample_drawn <- sample(population, size = sample_size, replace = FALSE) | |
sample_means[[j]] <- mean(sample_drawn) | |
} | |
sample_means_tab <- tibble( | |
"sample_size"=sample_size, | |
"sample_means"=sample_means | |
) | |
mean_dist_list[[sample_size]] <- sample_means_tab | |
} | |
full_results <- data.table::rbindlist(mean_dist_list) | |
hist_plot <- ggplot(data = full_results, aes(x=sample_means)) + | |
geom_histogram(aes(y=after_stat(density)), alpha=0.5, color="#00395B", fill="#00395B") + | |
scale_y_continuous(expand = expansion(add = c(0, 0.05))) + | |
scale_x_continuous(limits = c(-0.2, 1.2), expand = expansion()) + | |
facet_wrap(~sample_size, ncol = 4) + | |
stat_function( | |
data = dplyr::filter(full_results, sample_size==5), | |
fun = dnorm, | |
args = list( | |
mean = mean(dplyr::filter(full_results, sample_size==5)$sample_means), | |
sd = sd(dplyr::filter(full_results, sample_size==5)$sample_means)) | |
) + | |
stat_function( | |
data = dplyr::filter(full_results, sample_size==10), | |
fun = dnorm, | |
args = list( | |
mean = mean(dplyr::filter(full_results, sample_size==10)$sample_means), | |
sd = sd(dplyr::filter(full_results, sample_size==10)$sample_means)) | |
) + | |
stat_function( | |
data = dplyr::filter(full_results, sample_size==20), | |
fun = dnorm, | |
args = list( | |
mean = mean(dplyr::filter(full_results, sample_size==20)$sample_means), | |
sd = sd(dplyr::filter(full_results, sample_size==20)$sample_means)) | |
) + | |
stat_function( | |
data = dplyr::filter(full_results, sample_size==50), | |
fun = dnorm, | |
args = list( | |
mean = mean(dplyr::filter(full_results, sample_size==50)$sample_means), | |
sd = sd(dplyr::filter(full_results, sample_size==50)$sample_means)) | |
) + | |
labs(x = "Sample means", y = "Density", title = "The sampling distributions") + | |
theme_icae() | |
hist_plot |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "Exercise 2" | |
author: "Claudius Gräbner-Radkowitsch" | |
date: "June 2, 2023" | |
format: html | |
execute: | |
echo: false | |
warning: false | |
messsage: false | |
--- | |
```{r} | |
library(purrr) | |
library(dplyr) | |
library(tidyr) | |
library(ggplot2) | |
library(kableExtra) | |
library(DataScienceExercises) | |
library(icaeDesign) | |
student_pop <- DataScienceExercises::EUFstudents | |
true_vals <- student_pop %>% | |
group_by(Gender) %>% | |
summarise( | |
Mean=mean(Height), SD=sd(Height) | |
) | |
``` | |
# Task | |
*Consider the previous example where you studied the height of selected EUF students to make a statement about average height of all EUF students. Describe the various elements using the terminology we have introduced above. Make sure you make use of the following concepts:* | |
- *Population Sample and sample size* | |
- *Point estimate* | |
- *Sampling distribution* | |
- *Standard error* | |
- *Properties of the sample and inference* | |
# Possible solution | |
The *population* is the set of all students enrolled at the EUF | |
($N=$ `r nrow(student_pop)`). We are interested in the | |
*population parameter* 'average height'. | |
The *sample* is set of randomly drawn students from the population. | |
We consider two cases, one with $n=10$ and one with $n=20$. We can assume | |
the sample is a viable random sample, i.e. that it is unbiased and | |
representative of the population. | |
The *point estimate* is the sample mean, which is used to estimate the | |
population parameter of interest. This is viable since, given our | |
assumptions, statements about the sample can be generalised to the population. | |
The sampling distribution visualizes the sampling variation, i.e. the | |
variation of the point estimate obtained from different random samples. | |
While in reality the sampling distribution remains unknown due to the fact that | |
only one sample can be drawn, in the present case we used an artificial | |
population and a Monte Carlo Simulation to characterize the sampling | |
distribution as follows: | |
```{r} | |
#| echo: false | |
#| include: true | |
set.seed(123) | |
reps <- 1000 | |
samp_size10 <- map_dbl( | |
.x = seq_len(reps), | |
.f = ~mean(sample(x = student_pop[["Height"]], | |
size = 10, | |
replace = FALSE)) | |
) | |
samp_size50 <- map_dbl( | |
.x = seq_len(reps), | |
.f = ~mean(sample(x = student_pop[["Height"]], | |
size = 50, | |
replace = FALSE))) | |
full_samples <- rbind( | |
tibble("samplesize"="Sample size: 10", "mean"=samp_size10), | |
tibble("samplesize"="Sample size: 50", "mean"=samp_size50) | |
) | |
sampling_plot <- ggplot(data = full_samples, aes(x=mean)) + | |
geom_histogram(alpha=0.5, color="#00395B", fill="#00395B", binwidth = 0.5) + | |
scale_y_continuous(expand = expansion(add = c(0, 10))) + | |
scale_x_continuous(limits = c(158, 178), expand = expansion()) + | |
geom_vline(xintercept = filter(true_vals, Gender=="total") %>% pull(Mean)) + | |
facet_wrap(~samplesize, nrow = 2) + | |
labs( | |
x = "Sample means", | |
y = "Count", | |
title = "The sampling distributions") + | |
theme_icae() | |
sampling_plot | |
``` | |
The *standard error* corresponds to the standard deviation of the sampling | |
distributions and, due to the MCS framework, can be computed directly: | |
```{r} | |
sample_props <- full_samples %>% | |
group_by(samplesize) %>% | |
summarise(`Mean`=mean(mean), | |
`Variation`=sd(mean)) %>% | |
rename(` `=samplesize) %>% | |
kable(digits = 3) | |
sample_props | |
``` | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Possible solution for exercise 1: Monte Carlo Simulation | |
here::here("R/exercise_EUFstudents.R") | |
library(here) | |
library(dplyr) | |
library(tidyr) | |
library(ggplot2) | |
library(ggpubr) | |
library(purrr) | |
# remotes::install_github("graebnerc/DataScienceExercises") | |
library(DataScienceExercises) | |
# remotes::install_github("graebnerc/icaeDesign") | |
library(icaeDesign) | |
student_pop <- DataScienceExercises::EUFstudents | |
# 1. Look up true values------------------ | |
# While not asked explicitly, we also look at different values for the two | |
# genders considered in the data: | |
true_vals <- student_pop %>% | |
group_by(Gender) %>% | |
summarise( | |
Mean=mean(Height), SD=sd(Height) | |
) | |
true_vals2 <- student_pop %>% | |
summarise( | |
Mean=mean(Height), SD=sd(Height) | |
) %>% | |
dplyr::mutate(Gender="total") %>% | |
dplyr::select(Gender, Mean, SD) | |
true_vals <- rbind(true_vals, true_vals2) | |
true_vals | |
# 2. Study sampling distribution | |
reps <- 1000 | |
## 2a) Alternative 1: draw samples using purrr------------- | |
set.seed(123) | |
samp_size10 <- map_dbl( | |
.x = seq_len(reps), | |
.f = ~mean(sample(x = student_pop[["Height"]], | |
size = 10, | |
replace = FALSE)) | |
) | |
samp_size50 <- map_dbl( | |
.x = seq_len(reps), | |
.f = ~mean(sample(x = student_pop[["Height"]], | |
size = 50, | |
replace = FALSE))) | |
full_samples <- rbind( | |
tibble("samplesize"="Sample size: 10", "mean"=samp_size10), | |
tibble("samplesize"="Sample size: 50", "mean"=samp_size50) | |
) | |
## 2b) Alternative 2: draw samples using for loop---------- | |
set.seed(123) | |
samp_size10_loop <- rep(NA, reps) | |
samp_size50_loop <- rep(NA, reps) | |
for (i in seq_len(reps)){ | |
sample_small <- sample( | |
x = student_pop[["Height"]], | |
size = 10, replace = FALSE) | |
sample_small_mean <- mean(sample_small) | |
samp_size10_loop[i] <- sample_small_mean | |
sample_large <- sample( | |
x = student_pop[["Height"]], | |
size = 10, replace = FALSE) | |
sample_large_mean <- mean(sample_large) | |
samp_size50_loop[i] <- sample_large_mean | |
} | |
# 3. Visualization------------------------ | |
sampling_plot <- ggplot(data = full_samples, aes(x=mean)) + | |
geom_histogram(alpha=0.5, color="#00395B", fill="#00395B", binwidth = 0.5) + | |
scale_y_continuous(expand = expansion(add = c(0, 10))) + | |
scale_x_continuous(limits = c(158, 178), expand = expansion()) + | |
geom_vline(xintercept = filter(true_vals, Gender=="total") %>% pull(Mean)) + | |
facet_wrap(~samplesize, nrow = 2) + | |
labs( | |
x = "Sample means", | |
y = "Count", | |
title = "The sampling distributions") + | |
theme_icae() | |
sampling_plot | |
# 5. Quantitative results-------------- | |
sample_props <- full_samples %>% | |
group_by(samplesize) %>% | |
summarise(`Mean`=mean(mean), | |
`Variation`=sd(mean)) %>% | |
rename(` `=samplesize) | |
sample_props | |
# Optional: visualization of the population---------------- | |
population_plot_sep <- ggplot(data = student_pop, aes(x = Height)) + | |
geom_histogram( | |
data = filter(student_pop, Gender=="female"), aes(y = after_stat(density)), | |
color="black", fill=icaeDesign::get_euf_colors("red") , alpha=0.5) + | |
geom_histogram( | |
data = filter(student_pop, Gender=="male"), aes(y = after_stat(density)), | |
color="black", fill=icaeDesign::get_euf_colors("lightblue"), alpha=0.5) + | |
scale_y_continuous(expand = expansion(), limits = c(0, 0.1)) + | |
labs(title = "Gender differentiated", y = "Density") + | |
stat_function(fun = dnorm, color=icaeDesign::get_euf_colors("red"), | |
args = list( | |
mean = true_vals[["Mean"]][1], | |
sd = true_vals[["SD"]][1]) | |
) + | |
stat_function(fun = dnorm, color=icaeDesign::get_euf_colors("lightblue"), | |
args = list( | |
mean = true_vals[["Mean"]][2], | |
sd = true_vals[["SD"]][2]) | |
) + | |
theme_icae() + | |
theme(axis.title.x = element_blank()) | |
population_plot_sep | |
population_plot_full <- ggplot(data = student_pop, aes(x = Height)) + | |
geom_histogram(aes(y = after_stat(density)), | |
color="black", fill=icaeDesign::get_euf_colors("blue"), alpha=0.5) + | |
scale_y_continuous(expand = expansion(), limits = c(0, 0.06)) + | |
labs(title = "Complete", y = "Density") + | |
theme_icae() + | |
theme(axis.title.x = element_blank()) | |
population_plot_full | |
population_full <- ggarrange( | |
population_plot_full, population_plot_sep, nrow = 2) | |
population_full <- annotate_figure( | |
population_full, top = "The population of EUF students") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is the script we used during the lecture. For a version that | |
# is better documented but essentially covers the same issues, see the | |
# tutorial on Monte Carlo Simulations: | |
# https://euf-datascience-spring23.netlify.app/tutorial/mcs/pubdir/onlinecontent.html | |
here::i_am("R/lecture_script.R") | |
library(here) | |
library(tidyr) | |
library(data.table) | |
library(dplyr) | |
library(ggplot2) | |
library(scales) | |
library(purrr) # | |
library(icaeDesign) | |
# 1. Example for a for-loop--------------- | |
base_list <- list( | |
"element_1" = c(1, 4, 5, 6), | |
"element_2" = c(9, 2, 2, 8), | |
"element_3" = c(4, 3, 0, 7) | |
) | |
result <- rep(NA, length(base_list)) # The output container | |
print(result) | |
for (i in seq_along(base_list)) { # The looping sequence | |
result[[i]] <- mean(base_list[[i]]) # The action body | |
} | |
print(result) | |
# Note: I strongly recommend to use the functions seq_len() and seq_along() | |
# when specifying the looping sequence; while you could also iterate over | |
# names(base_list) directly, this is less flexible and more prone to errors. | |
# 2. The MCS workflow--------------------- | |
# 2a) If necessary: create the artificial population | |
# 2b) Define the random process to be analysed | |
# 2c) Repeat the random process many times | |
# 2d) Summarize the simulation results | |
## 2.a) Create the population--------------- | |
# Ball pid of 5000 balls with 65% white balls | |
size_population <- 5000 | |
white_balls <- as.integer(size_population*0.65) | |
grey_balls <- size_population - white_balls | |
population_tib <- tibble( | |
"ball_id" = seq(1, 5000), | |
"ball_color" = c(rep("white", white_balls), rep("grey", grey_balls) ) | |
) | |
## 2.b) Define the random process----------- | |
# Draw a sample and compute the share of white balls | |
# First, make sure you know how to specify the random process: | |
sample_taken <- sample( | |
x = population_tib[["ball_color"]], | |
size = 50, replace = TRUE) | |
share_white <- sum(sample_taken == "white") / length(sample_taken) | |
# Then define a function based on the previous code: | |
#' Draw a sample and compute share of white balls | |
#' @param sample_size The size of the sample to be drawn | |
#' @returns A double with the share of white balls in the sample | |
draw_sample <- function(sample_size){ | |
sample_taken <- sample( | |
x = population_tib[["ball_color"]], | |
size = sample_size, replace = TRUE) | |
share_white <- sum(sample_taken == "white") / length(sample_taken) | |
return(share_white) | |
} | |
draw_sample(50) | |
## 2.c) Conduct the MCS--------------------- | |
# 1000 samples | |
# Sample sizes 20, 50 and 1000 | |
n_iterations <- 10000 # When developing the loop, start with 3; once it works, | |
# you can increase the number | |
sample_size_used <- 100 | |
simulation_container <- rep(NA, n_iterations) | |
simulation_container | |
set.seed(123) # To make the simulation reproducible | |
for (i in seq_len(n_iterations)){ | |
simulation_container[i] <- draw_sample(sample_size = sample_size_used) | |
} | |
simulation_container | |
# With all cases: | |
n_samples <- 1000 | |
results_n20 <- rep(NA, n_samples) | |
results_n50 <- rep(NA, n_samples) | |
results_n100 <- rep(NA, n_samples) | |
set.seed(123) | |
for (i in seq_len(n_samples)){ | |
results_n20[i] <- draw_sample(sample_size = 20) | |
results_n50[i] <- draw_sample(sample_size = 50) | |
results_n100[i] <- draw_sample(sample_size = 100) | |
} | |
result_table <- tibble( | |
sample_size20 = results_n20, | |
sample_size50 = results_n50, | |
sample_size100 = results_n100 | |
) | |
## 2d) Analyze results ----------------------- | |
### Quantitative:------- | |
# The quantitative results (of interest definitely the mean and standard | |
#. deviation of the sampling distribution) | |
result_table %>% | |
pivot_longer(cols = everything(), | |
names_to = "Sample size", | |
values_to = "Share of white balls") %>% | |
mutate( | |
`Sample size` = as.integer( | |
stringr::str_remove_all(`Sample size`, "[^0-9.-]")) | |
) %>% | |
group_by(`Sample size`) %>% | |
summarise( | |
`Mean share` = mean(`Share of white balls`), | |
`Standard deviation` = sd(`Share of white balls`) | |
) | |
### Visual:------- | |
# A quick-and-dirty-way: | |
hist(result_table$sample_size20) | |
# A bit nicer: | |
ggplot(data = result_table, aes(x=sample_size20)) + | |
geom_histogram(binwidth = 0.05) + theme_bw() + | |
scale_x_continuous(limits = c(0, 1)) + | |
geom_vline(xintercept = 0.65) | |
# A good visualization: | |
hist_visualization <- result_table %>% | |
pivot_longer(cols = everything(), | |
names_to = "Sample size", | |
values_to = "Share of white balls") %>% | |
mutate( | |
`Sample size` = as.integer( | |
stringr::str_remove_all(`Sample size`, "[^0-9.-]")) | |
) %>% | |
mutate( | |
`Sample size` = paste0("Sample size: ", as.character(`Sample size`)), | |
`Sample size` = factor( | |
`Sample size`, | |
levels=c("Sample size: 20", "Sample size: 50", "Sample size: 100"), | |
ordered = TRUE), | |
) %>% | |
ggplot(data = ., aes(x=`Share of white balls`)) + | |
geom_histogram(binwidth = 0.02, fill="#00395B") + | |
scale_y_continuous(expand = expansion(add = c(0, 1))) + | |
scale_x_continuous(labels = percent_format()) + | |
labs( | |
x = "Share of white balls", | |
y = "Number of samples", | |
title = "True share: 65%") + | |
geom_vline(xintercept = 0.65) + | |
facet_wrap(~`Sample size`, scales = "fixed") + | |
theme_icae() + | |
theme( | |
panel.grid.minor.x = element_blank(), | |
panel.grid.minor.y = element_blank()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment