graebnerc/#T14: Lecture script

## #T14: Lecture script
This is the script we used during the lecture. For a version that  is better documented but essentially covers the same issues, see the tutorial on Monte Carlo Simulations:
https://euf-datascience-spring23.netlify.app/tutorial/mcs/pubdir/onlinecontent.html

## exercise_CLT.R
here::i_am("CentralLimit.R")
library(here)
library(purrr)
library(tibble)
library(data.table)
library(ggplot2)
library(ggpubr)
library(icaeDesign)

set.seed(123)
# Population with 5000 exponentially distributed values
N <- 5000
exp_par <- 2

population <- rexp(n = N, rate = exp_par)
population_tab <- tibble("pop_n" = population)
true_mean <- mean(population)

population_plot <- ggplot(data = population_tab, aes(x = pop_n)) +
  geom_histogram(aes(y = after_stat(density))) +
  scale_y_continuous(expand = expansion(add = c(0, 0.05))) +
  labs(title = "The population", y = "Density") +
  theme_icae() +
  theme(axis.title.x = element_blank())
population_plot


# Visualization of the 'true population':

# MCS
iterations <- 500
sample_sizes <- c(5, 10, 20, 50)
mean_dist_list <- list()

for (i in seq_along(sample_sizes)){
  sample_size <- sample_sizes[i]

  sample_means <- rep(NA, iterations)

  for (j in seq_len(iterations)){
    sample_drawn <- sample(population, size = sample_size, replace = FALSE)
    sample_means[[j]] <- mean(sample_drawn)
  }

  sample_means_tab <- tibble(
    "sample_size"=sample_size,
    "sample_means"=sample_means
  )
  mean_dist_list[[sample_size]] <- sample_means_tab
}

full_results <- data.table::rbindlist(mean_dist_list)

hist_plot <- ggplot(data = full_results, aes(x=sample_means)) +
  geom_histogram(aes(y=after_stat(density)), alpha=0.5, color="#00395B", fill="#00395B") +
  scale_y_continuous(expand = expansion(add = c(0, 0.05))) +
  scale_x_continuous(limits = c(-0.2, 1.2), expand = expansion()) +
  facet_wrap(~sample_size, ncol = 4) +
  stat_function(
    data = dplyr::filter(full_results, sample_size==5),
    fun = dnorm,
    args = list(
      mean = mean(dplyr::filter(full_results, sample_size==5)$sample_means),
      sd = sd(dplyr::filter(full_results, sample_size==5)$sample_means))
  ) +
  stat_function(
    data = dplyr::filter(full_results, sample_size==10),
    fun = dnorm,
    args = list(
      mean = mean(dplyr::filter(full_results, sample_size==10)$sample_means),
      sd = sd(dplyr::filter(full_results, sample_size==10)$sample_means))
  ) +
  stat_function(
    data = dplyr::filter(full_results, sample_size==20),
    fun = dnorm,
    args = list(
      mean = mean(dplyr::filter(full_results, sample_size==20)$sample_means),
      sd = sd(dplyr::filter(full_results, sample_size==20)$sample_means))
  ) +
  stat_function(
    data = dplyr::filter(full_results, sample_size==50),
    fun = dnorm,
    args = list(
      mean = mean(dplyr::filter(full_results, sample_size==50)$sample_means),
      sd = sd(dplyr::filter(full_results, sample_size==50)$sample_means))
  ) +
  labs(x = "Sample means", y = "Density", title = "The sampling distributions") +
  theme_icae()
hist_plot

## exercise_EUFstudents-Terminology.qmd
---
title: "Exercise 2"
author: "Claudius Gräbner-Radkowitsch"
date: "June 2, 2023"
format: html
execute:
  echo: false
  warning: false
  messsage: false

---

```{r}
library(purrr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(kableExtra)
library(DataScienceExercises)
library(icaeDesign)
student_pop <- DataScienceExercises::EUFstudents
true_vals <- student_pop %>%
  group_by(Gender) %>%
  summarise(
    Mean=mean(Height), SD=sd(Height)
  )
```


# Task

*Consider the previous example where you studied the height of selected EUF students to make a statement about average height of all EUF students. Describe the various elements using the terminology we have introduced above. Make sure you make use of the following concepts:*

-   *Population Sample and sample size*

-   *Point estimate*

-   *Sampling distribution*

-   *Standard error*

-   *Properties of the sample and inference*

# Possible solution

The *population* is the set of all students enrolled at the EUF
($N=$ `r nrow(student_pop)`). We are interested in the
*population parameter* 'average height'.

The *sample* is set of randomly drawn students from the population.
We consider two cases, one with $n=10$ and one with $n=20$. We can assume
the sample is a viable random sample, i.e. that it is unbiased and
representative of the population.

The *point estimate* is the sample mean, which is used to estimate the
population parameter of interest. This is viable since, given our
assumptions, statements about the sample can be generalised to the population.

The sampling distribution visualizes the sampling variation, i.e. the
variation of the point estimate obtained from different random samples.
While in reality the sampling distribution remains unknown due to the fact that
only one sample can be drawn, in the present case we used an artificial
population and a Monte Carlo Simulation to characterize the sampling
distribution as follows:

```{r}
#| echo: false
#| include: true
set.seed(123)
reps <- 1000
samp_size10 <- map_dbl(
  .x = seq_len(reps),
  .f = ~mean(sample(x = student_pop[["Height"]],
                    size = 10,
                    replace = FALSE))
)
samp_size50 <- map_dbl(
  .x = seq_len(reps),
  .f = ~mean(sample(x = student_pop[["Height"]],
                    size = 50,
                    replace = FALSE)))

full_samples <- rbind(
  tibble("samplesize"="Sample size: 10", "mean"=samp_size10),
  tibble("samplesize"="Sample size: 50", "mean"=samp_size50)
)

sampling_plot <- ggplot(data = full_samples, aes(x=mean)) +
  geom_histogram(alpha=0.5, color="#00395B", fill="#00395B", binwidth = 0.5) +
  scale_y_continuous(expand = expansion(add = c(0, 10))) +
  scale_x_continuous(limits = c(158, 178), expand = expansion()) +
  geom_vline(xintercept = filter(true_vals, Gender=="total") %>% pull(Mean)) +
  facet_wrap(~samplesize, nrow = 2) +
  labs(
    x = "Sample means",
    y = "Count",
    title = "The sampling distributions") +
  theme_icae()
sampling_plot
```

The *standard error* corresponds to the standard deviation of the sampling
distributions and, due to the MCS framework, can be computed directly:

```{r}
sample_props <- full_samples %>%
  group_by(samplesize) %>%
  summarise(`Mean`=mean(mean),
            `Variation`=sd(mean)) %>%
  rename(` `=samplesize) %>%
  kable(digits = 3)
sample_props
```


## exercise_EUFstudents.R
# Possible solution for exercise 1: Monte Carlo Simulation
here::here("R/exercise_EUFstudents.R")
library(here)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggpubr)
library(purrr)
# remotes::install_github("graebnerc/DataScienceExercises")
library(DataScienceExercises)
# remotes::install_github("graebnerc/icaeDesign")
library(icaeDesign)

student_pop <- DataScienceExercises::EUFstudents

# 1. Look up true values------------------
# While not asked explicitly, we also look at different values for the two
#  genders considered in the data:
true_vals <- student_pop %>%
  group_by(Gender) %>%
  summarise(
    Mean=mean(Height), SD=sd(Height)
  )

true_vals2 <- student_pop %>%
  summarise(
    Mean=mean(Height), SD=sd(Height)
  ) %>%
  dplyr::mutate(Gender="total") %>%
  dplyr::select(Gender, Mean, SD)

true_vals <- rbind(true_vals, true_vals2)
true_vals

# 2. Study sampling distribution
reps <- 1000

## 2a) Alternative 1: draw samples using purrr-------------
set.seed(123)
samp_size10 <- map_dbl(
  .x = seq_len(reps),
  .f = ~mean(sample(x = student_pop[["Height"]],
                    size = 10,
                    replace = FALSE))
)
samp_size50 <- map_dbl(
  .x = seq_len(reps),
  .f = ~mean(sample(x = student_pop[["Height"]],
                    size = 50,
                    replace = FALSE)))

full_samples <- rbind(
  tibble("samplesize"="Sample size: 10", "mean"=samp_size10),
  tibble("samplesize"="Sample size: 50", "mean"=samp_size50)
)

## 2b) Alternative 2: draw samples using for loop----------
set.seed(123)
samp_size10_loop <- rep(NA, reps)
samp_size50_loop <- rep(NA, reps)

for (i in seq_len(reps)){
  sample_small <- sample(
    x = student_pop[["Height"]],
    size = 10, replace = FALSE)
  sample_small_mean <- mean(sample_small)
  samp_size10_loop[i] <- sample_small_mean

  sample_large <- sample(
    x = student_pop[["Height"]],
    size = 10, replace = FALSE)
  sample_large_mean <- mean(sample_large)
  samp_size50_loop[i] <- sample_large_mean
}

# 3. Visualization------------------------
sampling_plot <- ggplot(data = full_samples, aes(x=mean)) +
  geom_histogram(alpha=0.5, color="#00395B", fill="#00395B", binwidth = 0.5) +
  scale_y_continuous(expand = expansion(add = c(0, 10))) +
  scale_x_continuous(limits = c(158, 178), expand = expansion()) +
  geom_vline(xintercept = filter(true_vals, Gender=="total") %>% pull(Mean)) +
  facet_wrap(~samplesize, nrow = 2) +
  labs(
    x = "Sample means",
    y = "Count",
    title = "The sampling distributions") +
  theme_icae()
sampling_plot

# 5. Quantitative results--------------
sample_props <- full_samples %>%
  group_by(samplesize) %>%
  summarise(`Mean`=mean(mean),
            `Variation`=sd(mean)) %>%
  rename(` `=samplesize)
sample_props

# Optional: visualization of the population----------------
population_plot_sep <- ggplot(data = student_pop, aes(x = Height)) +
  geom_histogram(
    data = filter(student_pop, Gender=="female"), aes(y = after_stat(density)),
    color="black", fill=icaeDesign::get_euf_colors("red") , alpha=0.5) +
  geom_histogram(
    data = filter(student_pop, Gender=="male"), aes(y = after_stat(density)),
    color="black", fill=icaeDesign::get_euf_colors("lightblue"), alpha=0.5) +
  scale_y_continuous(expand = expansion(), limits = c(0, 0.1)) +
  labs(title = "Gender differentiated", y = "Density") +
  stat_function(fun = dnorm, color=icaeDesign::get_euf_colors("red"),
                args = list(
                  mean = true_vals[["Mean"]][1],
                  sd = true_vals[["SD"]][1])
  ) +
  stat_function(fun = dnorm, color=icaeDesign::get_euf_colors("lightblue"),
                args = list(
                  mean = true_vals[["Mean"]][2],
                  sd = true_vals[["SD"]][2])
  ) +
  theme_icae() +
  theme(axis.title.x = element_blank())
population_plot_sep

population_plot_full <- ggplot(data = student_pop, aes(x = Height)) +
  geom_histogram(aes(y = after_stat(density)),
                 color="black", fill=icaeDesign::get_euf_colors("blue"), alpha=0.5) +
  scale_y_continuous(expand = expansion(), limits = c(0, 0.06)) +
  labs(title = "Complete", y = "Density") +
  theme_icae() +
  theme(axis.title.x = element_blank())
population_plot_full

population_full <- ggarrange(
  population_plot_full, population_plot_sep, nrow = 2)

population_full <- annotate_figure(
  population_full, top = "The population of EUF students")

## lecture_script.R
# This is the script we used during the lecture. For a version that
#  is better documented but essentially covers the same issues, see the
#  tutorial on Monte Carlo Simulations:
#  https://euf-datascience-spring23.netlify.app/tutorial/mcs/pubdir/onlinecontent.html
here::i_am("R/lecture_script.R")
library(here)
library(tidyr)
library(data.table)
library(dplyr)
library(ggplot2)
library(scales)
library(purrr) #
library(icaeDesign)

# 1. Example for a for-loop---------------
base_list <- list(
  "element_1" = c(1, 4, 5, 6),
  "element_2" = c(9, 2, 2, 8),
  "element_3" = c(4, 3, 0, 7)
)

result <- rep(NA, length(base_list)) # The output container
print(result)

for (i in seq_along(base_list)) { # The looping sequence
  result[[i]] <- mean(base_list[[i]]) # The action body
}
print(result)

# Note: I strongly recommend to use the functions seq_len() and seq_along()
#  when specifying the looping sequence; while you could also iterate over
#  names(base_list) directly, this is less flexible and more prone to errors.

# 2. The MCS workflow---------------------
# 2a) If necessary: create the artificial population
# 2b) Define the random process to be analysed
# 2c) Repeat the random process many times
# 2d) Summarize the simulation results

## 2.a) Create the population---------------
# Ball pid of 5000 balls with 65% white balls
size_population <- 5000
white_balls <- as.integer(size_population*0.65)
grey_balls <- size_population - white_balls
population_tib <- tibble(
  "ball_id" = seq(1, 5000),
  "ball_color" = c(rep("white", white_balls), rep("grey", grey_balls) )
)

## 2.b) Define the random process-----------
# Draw a sample and compute the share of white balls

# First, make sure you know how to specify the random process:
sample_taken <- sample(
  x = population_tib[["ball_color"]],
  size = 50, replace = TRUE)
share_white <- sum(sample_taken == "white") / length(sample_taken)

# Then define a function based on the previous code:

#' Draw a sample and compute share of white balls
#' @param sample_size The size of the sample to be drawn
#' @returns A double with the share of white balls in the sample
draw_sample <- function(sample_size){
  sample_taken <- sample(
    x = population_tib[["ball_color"]],
    size = sample_size, replace = TRUE)
  share_white <- sum(sample_taken == "white") / length(sample_taken)
  return(share_white)
}
draw_sample(50)

## 2.c) Conduct the MCS---------------------
# 1000 samples
# Sample sizes 20, 50 and 1000
n_iterations <- 10000 # When developing the loop, start with 3; once it works,
#  you can increase the number

sample_size_used <- 100
simulation_container <- rep(NA, n_iterations)

simulation_container
set.seed(123) # To make the simulation reproducible
for (i in seq_len(n_iterations)){
  simulation_container[i] <- draw_sample(sample_size = sample_size_used)
}
simulation_container

# With all cases:
n_samples <- 1000
results_n20 <- rep(NA, n_samples)
results_n50 <- rep(NA, n_samples)
results_n100 <- rep(NA, n_samples)

set.seed(123)
for (i in seq_len(n_samples)){
  results_n20[i] <- draw_sample(sample_size = 20)
  results_n50[i] <- draw_sample(sample_size = 50)
  results_n100[i] <- draw_sample(sample_size = 100)
}

result_table <- tibble(
  sample_size20 = results_n20,
  sample_size50 = results_n50,
  sample_size100 = results_n100
)

## 2d) Analyze results -----------------------
### Quantitative:-------
# The quantitative results (of interest definitely the mean and standard
#. deviation of the sampling distribution)
result_table %>%
  pivot_longer(cols = everything(),
               names_to = "Sample size",
               values_to = "Share of white balls") %>%
  mutate(
    `Sample size` = as.integer(
      stringr::str_remove_all(`Sample size`, "[^0-9.-]"))
  ) %>%
  group_by(`Sample size`) %>%
  summarise(
    `Mean share` = mean(`Share of white balls`),
    `Standard deviation` = sd(`Share of white balls`)
  )

### Visual:-------
# A quick-and-dirty-way:
hist(result_table$sample_size20)
# A bit nicer:
ggplot(data = result_table, aes(x=sample_size20)) +
  geom_histogram(binwidth = 0.05) + theme_bw()  +
  scale_x_continuous(limits = c(0, 1)) +
  geom_vline(xintercept = 0.65)

# A good visualization:
hist_visualization <- result_table %>%
  pivot_longer(cols = everything(),
               names_to = "Sample size",
               values_to = "Share of white balls") %>%
  mutate(
    `Sample size` = as.integer(
      stringr::str_remove_all(`Sample size`, "[^0-9.-]"))
  ) %>%
  mutate(
    `Sample size` = paste0("Sample size: ", as.character(`Sample size`)),
    `Sample size` = factor(
      `Sample size`,
      levels=c("Sample size: 20", "Sample size: 50", "Sample size: 100"),
      ordered = TRUE),
  ) %>%
  ggplot(data = ., aes(x=`Share of white balls`)) +
  geom_histogram(binwidth = 0.02, fill="#00395B") +
  scale_y_continuous(expand = expansion(add = c(0, 1))) +
  scale_x_continuous(labels = percent_format()) +
  labs(
    x = "Share of white balls",
    y = "Number of samples",
    title = "True share: 65%") +
  geom_vline(xintercept = 0.65) +
  facet_wrap(~`Sample size`, scales = "fixed") +
  theme_icae() +
  theme(
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank())
	This is the script we used during the lecture. For a version that is better documented but essentially covers the same issues, see the tutorial on Monte Carlo Simulations:
	https://euf-datascience-spring23.netlify.app/tutorial/mcs/pubdir/onlinecontent.html
	here::i_am("CentralLimit.R")
	library(here)
	library(purrr)
	library(tibble)
	library(data.table)
	library(ggplot2)
	library(ggpubr)
	library(icaeDesign)

	set.seed(123)
	# Population with 5000 exponentially distributed values
	N <- 5000
	exp_par <- 2

	population <- rexp(n = N, rate = exp_par)
	population_tab <- tibble("pop_n" = population)
	true_mean <- mean(population)

	population_plot <- ggplot(data = population_tab, aes(x = pop_n)) +
	geom_histogram(aes(y = after_stat(density))) +
	scale_y_continuous(expand = expansion(add = c(0, 0.05))) +
	labs(title = "The population", y = "Density") +
	theme_icae() +
	theme(axis.title.x = element_blank())
	population_plot


	# Visualization of the 'true population':

	# MCS
	iterations <- 500
	sample_sizes <- c(5, 10, 20, 50)
	mean_dist_list <- list()

	for (i in seq_along(sample_sizes)){
	sample_size <- sample_sizes[i]

	sample_means <- rep(NA, iterations)

	for (j in seq_len(iterations)){
	sample_drawn <- sample(population, size = sample_size, replace = FALSE)
	sample_means[[j]] <- mean(sample_drawn)
	}

	sample_means_tab <- tibble(
	"sample_size"=sample_size,
	"sample_means"=sample_means
	)
	mean_dist_list[[sample_size]] <- sample_means_tab
	}

	full_results <- data.table::rbindlist(mean_dist_list)

	hist_plot <- ggplot(data = full_results, aes(x=sample_means)) +
	geom_histogram(aes(y=after_stat(density)), alpha=0.5, color="#00395B", fill="#00395B") +
	scale_y_continuous(expand = expansion(add = c(0, 0.05))) +
	scale_x_continuous(limits = c(-0.2, 1.2), expand = expansion()) +
	facet_wrap(~sample_size, ncol = 4) +
	stat_function(
	data = dplyr::filter(full_results, sample_size==5),
	fun = dnorm,
	args = list(
	mean = mean(dplyr::filter(full_results, sample_size==5)$sample_means),
	sd = sd(dplyr::filter(full_results, sample_size==5)$sample_means))
	) +
	stat_function(
	data = dplyr::filter(full_results, sample_size==10),
	fun = dnorm,
	args = list(
	mean = mean(dplyr::filter(full_results, sample_size==10)$sample_means),
	sd = sd(dplyr::filter(full_results, sample_size==10)$sample_means))
	) +
	stat_function(
	data = dplyr::filter(full_results, sample_size==20),
	fun = dnorm,
	args = list(
	mean = mean(dplyr::filter(full_results, sample_size==20)$sample_means),
	sd = sd(dplyr::filter(full_results, sample_size==20)$sample_means))
	) +
	stat_function(
	data = dplyr::filter(full_results, sample_size==50),
	fun = dnorm,
	args = list(
	mean = mean(dplyr::filter(full_results, sample_size==50)$sample_means),
	sd = sd(dplyr::filter(full_results, sample_size==50)$sample_means))
	) +
	labs(x = "Sample means", y = "Density", title = "The sampling distributions") +
	theme_icae()
	hist_plot
	---
	title: "Exercise 2"
	author: "Claudius Gräbner-Radkowitsch"
	date: "June 2, 2023"
	format: html
	execute:
	echo: false
	warning: false
	messsage: false

	---

	```{r}
	library(purrr)
	library(dplyr)
	library(tidyr)
	library(ggplot2)
	library(kableExtra)
	library(DataScienceExercises)
	library(icaeDesign)
	student_pop <- DataScienceExercises::EUFstudents
	true_vals <- student_pop %>%
	group_by(Gender) %>%
	summarise(
	Mean=mean(Height), SD=sd(Height)
	)
	```


	# Task

	Consider the previous example where you studied the height of selected EUF students to make a statement about average height of all EUF students. Describe the various elements using the terminology we have introduced above. Make sure you make use of the following concepts:

	- Population Sample and sample size

	- Point estimate

	- Sampling distribution

	- Standard error

	- Properties of the sample and inference

	# Possible solution

	The population is the set of all students enrolled at the EUF
	($N=$ `r nrow(student_pop)`). We are interested in the
	population parameter 'average height'.

	The sample is set of randomly drawn students from the population.
	We consider two cases, one with $n=10$ and one with $n=20$. We can assume
	the sample is a viable random sample, i.e. that it is unbiased and
	representative of the population.

	The point estimate is the sample mean, which is used to estimate the
	population parameter of interest. This is viable since, given our
	assumptions, statements about the sample can be generalised to the population.

	The sampling distribution visualizes the sampling variation, i.e. the
	variation of the point estimate obtained from different random samples.
	While in reality the sampling distribution remains unknown due to the fact that
	only one sample can be drawn, in the present case we used an artificial
	population and a Monte Carlo Simulation to characterize the sampling
	distribution as follows:

	```{r}
	#\| echo: false
	#\| include: true
	set.seed(123)
	reps <- 1000
	samp_size10 <- map_dbl(
	.x = seq_len(reps),
	.f = ~mean(sample(x = student_pop[["Height"]],
	size = 10,
	replace = FALSE))
	)
	samp_size50 <- map_dbl(
	.x = seq_len(reps),
	.f = ~mean(sample(x = student_pop[["Height"]],
	size = 50,
	replace = FALSE)))

	full_samples <- rbind(
	tibble("samplesize"="Sample size: 10", "mean"=samp_size10),
	tibble("samplesize"="Sample size: 50", "mean"=samp_size50)
	)

	sampling_plot <- ggplot(data = full_samples, aes(x=mean)) +
	geom_histogram(alpha=0.5, color="#00395B", fill="#00395B", binwidth = 0.5) +
	scale_y_continuous(expand = expansion(add = c(0, 10))) +
	scale_x_continuous(limits = c(158, 178), expand = expansion()) +
	geom_vline(xintercept = filter(true_vals, Gender=="total") %>% pull(Mean)) +
	facet_wrap(~samplesize, nrow = 2) +
	labs(
	x = "Sample means",
	y = "Count",
	title = "The sampling distributions") +
	theme_icae()
	sampling_plot
	```

	The standard error corresponds to the standard deviation of the sampling
	distributions and, due to the MCS framework, can be computed directly:

	```{r}
	sample_props <- full_samples %>%
	group_by(samplesize) %>%
	summarise(`Mean`=mean(mean),
	`Variation`=sd(mean)) %>%
	rename(` `=samplesize) %>%
	kable(digits = 3)
	sample_props
	```
	# Possible solution for exercise 1: Monte Carlo Simulation
	here::here("R/exercise_EUFstudents.R")
	library(here)
	library(dplyr)
	library(tidyr)
	library(ggplot2)
	library(ggpubr)
	library(purrr)
	# remotes::install_github("graebnerc/DataScienceExercises")
	library(DataScienceExercises)
	# remotes::install_github("graebnerc/icaeDesign")
	library(icaeDesign)

	student_pop <- DataScienceExercises::EUFstudents

	# 1. Look up true values------------------
	# While not asked explicitly, we also look at different values for the two
	# genders considered in the data:
	true_vals <- student_pop %>%
	group_by(Gender) %>%
	summarise(
	Mean=mean(Height), SD=sd(Height)
	)

	true_vals2 <- student_pop %>%
	summarise(
	Mean=mean(Height), SD=sd(Height)
	) %>%
	dplyr::mutate(Gender="total") %>%
	dplyr::select(Gender, Mean, SD)

	true_vals <- rbind(true_vals, true_vals2)
	true_vals

	# 2. Study sampling distribution
	reps <- 1000

	## 2a) Alternative 1: draw samples using purrr-------------
	set.seed(123)
	samp_size10 <- map_dbl(
	.x = seq_len(reps),
	.f = ~mean(sample(x = student_pop[["Height"]],
	size = 10,
	replace = FALSE))
	)
	samp_size50 <- map_dbl(
	.x = seq_len(reps),
	.f = ~mean(sample(x = student_pop[["Height"]],
	size = 50,
	replace = FALSE)))

	full_samples <- rbind(
	tibble("samplesize"="Sample size: 10", "mean"=samp_size10),
	tibble("samplesize"="Sample size: 50", "mean"=samp_size50)
	)

	## 2b) Alternative 2: draw samples using for loop----------
	set.seed(123)
	samp_size10_loop <- rep(NA, reps)
	samp_size50_loop <- rep(NA, reps)

	for (i in seq_len(reps)){
	sample_small <- sample(
	x = student_pop[["Height"]],
	size = 10, replace = FALSE)
	sample_small_mean <- mean(sample_small)
	samp_size10_loop[i] <- sample_small_mean

	sample_large <- sample(
	x = student_pop[["Height"]],
	size = 10, replace = FALSE)
	sample_large_mean <- mean(sample_large)
	samp_size50_loop[i] <- sample_large_mean
	}

	# 3. Visualization------------------------
	sampling_plot <- ggplot(data = full_samples, aes(x=mean)) +
	geom_histogram(alpha=0.5, color="#00395B", fill="#00395B", binwidth = 0.5) +
	scale_y_continuous(expand = expansion(add = c(0, 10))) +
	scale_x_continuous(limits = c(158, 178), expand = expansion()) +
	geom_vline(xintercept = filter(true_vals, Gender=="total") %>% pull(Mean)) +
	facet_wrap(~samplesize, nrow = 2) +
	labs(
	x = "Sample means",
	y = "Count",
	title = "The sampling distributions") +
	theme_icae()
	sampling_plot

	# 5. Quantitative results--------------
	sample_props <- full_samples %>%
	group_by(samplesize) %>%
	summarise(`Mean`=mean(mean),
	`Variation`=sd(mean)) %>%
	rename(` `=samplesize)
	sample_props

	# Optional: visualization of the population----------------
	population_plot_sep <- ggplot(data = student_pop, aes(x = Height)) +
	geom_histogram(
	data = filter(student_pop, Gender=="female"), aes(y = after_stat(density)),
	color="black", fill=icaeDesign::get_euf_colors("red") , alpha=0.5) +
	geom_histogram(
	data = filter(student_pop, Gender=="male"), aes(y = after_stat(density)),
	color="black", fill=icaeDesign::get_euf_colors("lightblue"), alpha=0.5) +
	scale_y_continuous(expand = expansion(), limits = c(0, 0.1)) +
	labs(title = "Gender differentiated", y = "Density") +
	stat_function(fun = dnorm, color=icaeDesign::get_euf_colors("red"),
	args = list(
	mean = true_vals[["Mean"]][1],
	sd = true_vals[["SD"]][1])
	) +
	stat_function(fun = dnorm, color=icaeDesign::get_euf_colors("lightblue"),
	args = list(
	mean = true_vals[["Mean"]][2],
	sd = true_vals[["SD"]][2])
	) +
	theme_icae() +
	theme(axis.title.x = element_blank())
	population_plot_sep

	population_plot_full <- ggplot(data = student_pop, aes(x = Height)) +
	geom_histogram(aes(y = after_stat(density)),
	color="black", fill=icaeDesign::get_euf_colors("blue"), alpha=0.5) +
	scale_y_continuous(expand = expansion(), limits = c(0, 0.06)) +
	labs(title = "Complete", y = "Density") +
	theme_icae() +
	theme(axis.title.x = element_blank())
	population_plot_full

	population_full <- ggarrange(
	population_plot_full, population_plot_sep, nrow = 2)

	population_full <- annotate_figure(
	population_full, top = "The population of EUF students")
	# This is the script we used during the lecture. For a version that
	# is better documented but essentially covers the same issues, see the
	# tutorial on Monte Carlo Simulations:
	# https://euf-datascience-spring23.netlify.app/tutorial/mcs/pubdir/onlinecontent.html
	here::i_am("R/lecture_script.R")
	library(here)
	library(tidyr)
	library(data.table)
	library(dplyr)
	library(ggplot2)
	library(scales)
	library(purrr) #
	library(icaeDesign)

	# 1. Example for a for-loop---------------
	base_list <- list(
	"element_1" = c(1, 4, 5, 6),
	"element_2" = c(9, 2, 2, 8),
	"element_3" = c(4, 3, 0, 7)
	)

	result <- rep(NA, length(base_list)) # The output container
	print(result)

	for (i in seq_along(base_list)) { # The looping sequence
	result[[i]] <- mean(base_list[[i]]) # The action body
	}
	print(result)

	# Note: I strongly recommend to use the functions seq_len() and seq_along()
	# when specifying the looping sequence; while you could also iterate over
	# names(base_list) directly, this is less flexible and more prone to errors.

	# 2. The MCS workflow---------------------
	# 2a) If necessary: create the artificial population
	# 2b) Define the random process to be analysed
	# 2c) Repeat the random process many times
	# 2d) Summarize the simulation results

	## 2.a) Create the population---------------
	# Ball pid of 5000 balls with 65% white balls
	size_population <- 5000
	white_balls <- as.integer(size_population*0.65)
	grey_balls <- size_population - white_balls
	population_tib <- tibble(
	"ball_id" = seq(1, 5000),
	"ball_color" = c(rep("white", white_balls), rep("grey", grey_balls) )
	)

	## 2.b) Define the random process-----------
	# Draw a sample and compute the share of white balls

	# First, make sure you know how to specify the random process:
	sample_taken <- sample(
	x = population_tib[["ball_color"]],
	size = 50, replace = TRUE)
	share_white <- sum(sample_taken == "white") / length(sample_taken)

	# Then define a function based on the previous code:

	#' Draw a sample and compute share of white balls
	#' @param sample_size The size of the sample to be drawn
	#' @returns A double with the share of white balls in the sample
	draw_sample <- function(sample_size){
	sample_taken <- sample(
	x = population_tib[["ball_color"]],
	size = sample_size, replace = TRUE)
	share_white <- sum(sample_taken == "white") / length(sample_taken)
	return(share_white)
	}
	draw_sample(50)

	## 2.c) Conduct the MCS---------------------
	# 1000 samples
	# Sample sizes 20, 50 and 1000
	n_iterations <- 10000 # When developing the loop, start with 3; once it works,
	# you can increase the number

	sample_size_used <- 100
	simulation_container <- rep(NA, n_iterations)

	simulation_container
	set.seed(123) # To make the simulation reproducible
	for (i in seq_len(n_iterations)){
	simulation_container[i] <- draw_sample(sample_size = sample_size_used)
	}
	simulation_container

	# With all cases:
	n_samples <- 1000
	results_n20 <- rep(NA, n_samples)
	results_n50 <- rep(NA, n_samples)
	results_n100 <- rep(NA, n_samples)

	set.seed(123)
	for (i in seq_len(n_samples)){
	results_n20[i] <- draw_sample(sample_size = 20)
	results_n50[i] <- draw_sample(sample_size = 50)
	results_n100[i] <- draw_sample(sample_size = 100)
	}

	result_table <- tibble(
	sample_size20 = results_n20,
	sample_size50 = results_n50,
	sample_size100 = results_n100
	)

	## 2d) Analyze results -----------------------
	### Quantitative:-------
	# The quantitative results (of interest definitely the mean and standard
	#. deviation of the sampling distribution)
	result_table %>%
	pivot_longer(cols = everything(),
	names_to = "Sample size",
	values_to = "Share of white balls") %>%
	mutate(
	`Sample size` = as.integer(
	stringr::str_remove_all(`Sample size`, "[^0-9.-]"))
	) %>%
	group_by(`Sample size`) %>%
	summarise(
	`Mean share` = mean(`Share of white balls`),
	`Standard deviation` = sd(`Share of white balls`)
	)

	### Visual:-------
	# A quick-and-dirty-way:
	hist(result_table$sample_size20)
	# A bit nicer:
	ggplot(data = result_table, aes(x=sample_size20)) +
	geom_histogram(binwidth = 0.05) + theme_bw() +
	scale_x_continuous(limits = c(0, 1)) +
	geom_vline(xintercept = 0.65)

	# A good visualization:
	hist_visualization <- result_table %>%
	pivot_longer(cols = everything(),
	names_to = "Sample size",
	values_to = "Share of white balls") %>%
	mutate(
	`Sample size` = as.integer(
	stringr::str_remove_all(`Sample size`, "[^0-9.-]"))
	) %>%
	mutate(
	`Sample size` = paste0("Sample size: ", as.character(`Sample size`)),
	`Sample size` = factor(
	`Sample size`,
	levels=c("Sample size: 20", "Sample size: 50", "Sample size: 100"),
	ordered = TRUE),
	) %>%
	ggplot(data = ., aes(x=`Share of white balls`)) +
	geom_histogram(binwidth = 0.02, fill="#00395B") +
	scale_y_continuous(expand = expansion(add = c(0, 1))) +
	scale_x_continuous(labels = percent_format()) +
	labs(
	x = "Share of white balls",
	y = "Number of samples",
	title = "True share: 65%") +
	geom_vline(xintercept = 0.65) +
	facet_wrap(~`Sample size`, scales = "fixed") +
	theme_icae() +
	theme(
	panel.grid.minor.x = element_blank(),
	panel.grid.minor.y = element_blank())