graebnerc/#T14: lecture notes

## #T14: lecture notes
Lecture script and solutions for T14 on sampling. Note that we did not do the exercises in class, treat them as additional info.

## T14-Exercise-1-Solution.R
here::i_am("R/T14-Exercise-1-Solution.R")
library(here)
library(tibble)
library(dplyr)
library(purrr)
library(ggplot2)
library(icaeDesign)
# remotes::install_github("graebnerc/DataScienceExercises")
library(DataScienceExercises)

reps <- 1000
student_pop <- DataScienceExercises::EUFstudents

# Alternative 1: draw samples using purrr-------------
set.seed(123)
samp_size10 <- map_dbl(
  .x = seq_len(reps),
  .f = ~mean(sample(x = student_pop[["Height"]],
                    size = 10,
                    replace = FALSE))
  )
samp_size50 <- map_dbl(
  .x = seq_len(reps),
  .f = ~mean(sample(x = student_pop[["Height"]],
                    size = 50,
                    replace = FALSE)))

full_samples <- rbind(
  tibble("samplesize"="Sample size: 10", "mean"=samp_size10),
  tibble("samplesize"="Sample size: 50", "mean"=samp_size50)
  )

# Alternative 2: draw samples using for loop----------
set.seed(123)
samp_size10_loop <- rep(NA, reps)
samp_size50_loop <- rep(NA, reps)

for (i in seq_len(reps)){
  sample_small <- sample(
    x = student_pop[["Height"]],
    size = 10, replace = FALSE)
  sample_small_mean <- mean(sample_small)
  samp_size10_loop[i] <- sample_small_mean

  sample_large <- sample(
    x = student_pop[["Height"]],
    size = 10, replace = FALSE)
  sample_large_mean <- mean(sample_large)
  samp_size50_loop[i] <- sample_large_mean
}

# Visualization------------------------
sampling_plot <- ggplot(data = full_samples, aes_string(x="mean")) +
  geom_histogram(alpha=0.5, color="#00395B", fill="#00395B") +
  scale_y_continuous(expand = expansion(add = c(0, 10))) +
  scale_x_continuous(limits = c(158, 178), expand = expansion()) +
  facet_wrap(~samplesize, nrow = 2) +
  labs(
    x = "Sample means",
    y = "Count",
    title = "The sampling distributions") +
  theme_icae()
sampling_plot

ggsave(plot = sampling_plot,
       filename = here("output/T14/EufPopPlotSamples.pdf"),
       width = 4, height = 3)

# Information about the population-----
true_vals <- student_pop %>%
  group_by(Gender) %>%
  summarise(
    Mean=mean(Height), SD=sd(Height)
  )

true_vals2 <- student_pop %>%
  summarise(
    Mean=mean(Height), SD=sd(Height)
  ) %>%
  dplyr::mutate(Gender="total") %>%
  dplyr::select(Gender, Mean, SD)

true_vals <- rbind(true_vals, true_vals2)
true_vals

sample_props <- full_samples %>%
  group_by(samplesize) %>%
  summarise(`Mean`=mean(mean),
            `Variation`=sd(mean)) %>%
  rename(` `=samplesize)
sample_props

## T14-Exercise-2-Solution-CLT.R
here::i_am("R/T14-Exercise-2-Solution.R")
library(here)
library(purrr)
library(tibble)
library(data.table)
library(ggplot2)
library(ggpubr)
library(icaeDesign)

set.seed(123)
# Population with 5000 exponentially distributed values
N <- 5000
exp_par <- 2

# Visualization of the 'true population':

# MCS
iterations <- 500
sample_sizes <- c(5, 10, 20, 50)
population <- rexp(n = N, rate = exp_par)
population_tab <- tibble("pop_n" = population)
true_mean <- mean(population)
mean_dist_list <- list()

for (i in seq_along(sample_sizes)){
  sample_size <- sample_sizes[i]
  sample_means <- purrr::map_dbl(
    .x = 1:iterations,
    .f = ~mean(sample(population, size = sample_size, replace = FALSE)))
  sample_means_tab <- tibble(
    "sample_size"=sample_size,
    "sample_means"=sample_means
  )
  mean_dist_list[[sample_size]] <- sample_means_tab
}

full_results <- rbindlist(mean_dist_list)

population_plot <- ggplot(data = population_tab, aes(x = pop_n)) +
  geom_histogram(aes(y = ..density..)) +
  scale_y_continuous(expand = expansion(add = c(0, 0.05))) +
  labs(title = "The population", y = "Density") +
  stat_function(fun = dexp, args = list(rate = exp_par)) +
  theme_icae() +
  theme(axis.title.x = element_blank())
population_plot

hist_plot <- ggplot(data = full_results, aes(x=sample_means)) +
  geom_histogram(aes(y=..density..), alpha=0.5, color="#00395B", fill="#00395B") +
  scale_y_continuous(expand = expansion(add = c(0, 0.05))) +
  scale_x_continuous(limits = c(-0.2, 1.2), expand = expansion()) +
  facet_wrap(~sample_size, ncol = 4) +
  stat_function(
    data = dplyr::filter(full_results, sample_size==5),
    fun = dnorm,
    args = list(
      mean = mean(dplyr::filter(full_results, sample_size==5)$sample_means),
      sd = sd(dplyr::filter(full_results, sample_size==5)$sample_means))
    ) +
  stat_function(
    data = dplyr::filter(full_results, sample_size==10),
    fun = dnorm,
    args = list(
      mean = mean(dplyr::filter(full_results, sample_size==10)$sample_means),
      sd = sd(dplyr::filter(full_results, sample_size==10)$sample_means))
  ) +
  stat_function(
    data = dplyr::filter(full_results, sample_size==20),
    fun = dnorm,
    args = list(
      mean = mean(dplyr::filter(full_results, sample_size==20)$sample_means),
      sd = sd(dplyr::filter(full_results, sample_size==20)$sample_means))
  ) +
  stat_function(
    data = dplyr::filter(full_results, sample_size==50),
    fun = dnorm,
    args = list(
      mean = mean(dplyr::filter(full_results, sample_size==50)$sample_means),
      sd = sd(dplyr::filter(full_results, sample_size==50)$sample_means))
  ) +
  labs(x = "Sample means", y = "Density", title = "The sampling distributions") +
  theme_icae()

full_plot <- ggarrange(population_plot, hist_plot, nrow = 2)
ggsave(plot = full_plot, filename = here("output/T14/clt-example.pdf"),
       width = 4, height = 3)

## T14-LectureScript-1-DiceExample.R
# A for loop
for (i in seq_len(10)){
  print(i)
}

# Below we illustrate two techniques for iteration using a simple example:
#  simulate throwing a dice

# 1st step: define the random process and define general parameters
nb_iterations <- 10
sample(1:6, size = 1)

# 2nd step: iterate the process (variant: for-loop)
result_container <- rep(NA, nb_iterations)
set.seed(123)
for (i in seq_len(nb_iterations)){
  result_container[i] <- sample(1:6, size = 1)
}
result_container
result_tab <- tibble("draw"=factor(result_container, levels = 1:6))

# 2nd step: iterate the process (variant: purrr package)
library(purrr)
set.seed(123)
result_container_purrr <- map_dbl(
  .x = seq_len(nb_iterations),
  .f = ~sample(1:6, size = 1)
)

result_container_purrr

# 3rd step: visualize and analyze the result
ggplot(data = result_tab, mapping = aes(x=draw)) +
  geom_bar() +
  scale_y_continuous(expand = expansion()) +
  scale_x_discrete(drop=FALSE) + # To show also numbers that were not thrown
  theme_bw()

## T14-LectureScript-2-BallPidExample.R
library(purrr)
library(tibble)
library(dplyr)
library(ggplot2)

# Create the ball pid population
ball_pid_size <- 5000
ball_pid_share_white <- 0.65
white_balls <- as.integer(ball_pid_share_white*ball_pid_size)
grey_balls <- ball_pid_size - white_balls
ball_pid_colors <- c(rep("white", white_balls), rep("grey", grey_balls))
ball_pid <- tibble::tibble(
  id = seq(1, ball_pid_size),
  color = sample(ball_pid_colors)
)

# Sampling of the balls

# 1st step: define the random process and set general parameters
sample_size <- 20
nb_iterations <- 400

#' Function that takes a sample of balls and computes share of white balls
sample_balls <- function(bowl, sample_size, replace_balls=FALSE){
  sample_obtained <- sample(
    x = bowl, size = sample_size, replace = replace_balls)
  sum(sample_obtained == "white") / sample_size
}

# 2nd step: iterate the process
set.seed(123)
sampling_df <- tibble(
  "sample_size" = sample_size,
  "sample_id" = seq_len(nb_iterations),
  "share_white" = map_dbl(
    .x = seq_len(nb_iterations),
    .f = ~sample_balls(ball_pid$color, sample_size = sample_size))
)

# Alternative formulation using a for-loop:
result_container <- rep(NA, nb_iterations)

set.seed(123)
for (i in seq_len(nb_iterations)){
  # print(i)
  sample_drawm <- sample(
    x = ball_pid[["color"]],
    size = sample_size,
    replace = FALSE)

  share_white <- sum(sample_drawm=="white") / length(sample_drawm)
  share_white

  result_container[i] <- share_white
}

# 3rd step: visualize and analyze the result
sample_plot <- ggplot(data = sampling_df, aes(x=share_white)) +
  geom_histogram(binwidth = 0.02, fill="#00395B") +
  theme_bw()
sample_plot

# Compare the effect of different sample sizes-------------

# 1. Simulate data for different sample sizes:
set.seed(123)
sample_size <- 10
sampling_df_10 <- tibble(
  "sample_size" = sample_size,
  "share_white" = map_dbl(
    .x = seq_len(nb_iterations),
    .f = ~sample_balls(ball_pid$color, sample_size = sample_size))
)

set.seed(123)
sample_size <- 50
sampling_df_50 <- tibble(
  "sample_size" = sample_size,
  "share_white" = map_dbl(
    .x = seq_len(nb_iterations),
    .f = ~sample_balls(ball_pid$color, sample_size = sample_size))
)

set.seed(123)
sample_size <- 500
sampling_df_100 <- tibble(
  "sample_size" = sample_size,
  "share_white" = map_dbl(
    .x = seq_len(nb_iterations),
    .f = ~sample_balls(ball_pid$color, sample_size = sample_size))
)

# 2. Analyse the results quantitatively:

# Merge the tibbles:
sample_df_full <- rbind(sampling_df_10, sampling_df_50, sampling_df_100)

# Compute mean and standard deviation:
true_white_share <- sum(ball_pid[["color"]]=="white") / nrow(ball_pid)

sample_df_full %>%
  group_by(sample_size) %>%
  summarise(
    mean_share = mean(share_white),
    diff_true_val = abs(mean_share - true_white_share),
    sd_share = sd(share_white)
  )

# Visualize the distributions:

ggplot(data = sample_df_full, mapping = aes(x=share_white)) +
  geom_histogram(binwidth = 0.025) +
  geom_vline(xintercept = true_white_share) + # to add a line with true value
  scale_x_continuous(limits = c(0, 1.2), expand = expansion()) +
  scale_y_continuous(expand = expansion(add = c(0, 10))) +
  facet_wrap(~sample_size, ncol = 3) +
  theme_bw()
	here::i_am("R/T14-Exercise-1-Solution.R")
	library(here)
	library(tibble)
	library(dplyr)
	library(purrr)
	library(ggplot2)
	library(icaeDesign)
	# remotes::install_github("graebnerc/DataScienceExercises")
	library(DataScienceExercises)

	reps <- 1000
	student_pop <- DataScienceExercises::EUFstudents

	# Alternative 1: draw samples using purrr-------------
	set.seed(123)
	samp_size10 <- map_dbl(
	.x = seq_len(reps),
	.f = ~mean(sample(x = student_pop[["Height"]],
	size = 10,
	replace = FALSE))
	)
	samp_size50 <- map_dbl(
	.x = seq_len(reps),
	.f = ~mean(sample(x = student_pop[["Height"]],
	size = 50,
	replace = FALSE)))

	full_samples <- rbind(
	tibble("samplesize"="Sample size: 10", "mean"=samp_size10),
	tibble("samplesize"="Sample size: 50", "mean"=samp_size50)
	)

	# Alternative 2: draw samples using for loop----------
	set.seed(123)
	samp_size10_loop <- rep(NA, reps)
	samp_size50_loop <- rep(NA, reps)

	for (i in seq_len(reps)){
	sample_small <- sample(
	x = student_pop[["Height"]],
	size = 10, replace = FALSE)
	sample_small_mean <- mean(sample_small)
	samp_size10_loop[i] <- sample_small_mean

	sample_large <- sample(
	x = student_pop[["Height"]],
	size = 10, replace = FALSE)
	sample_large_mean <- mean(sample_large)
	samp_size50_loop[i] <- sample_large_mean
	}

	# Visualization------------------------
	sampling_plot <- ggplot(data = full_samples, aes_string(x="mean")) +
	geom_histogram(alpha=0.5, color="#00395B", fill="#00395B") +
	scale_y_continuous(expand = expansion(add = c(0, 10))) +
	scale_x_continuous(limits = c(158, 178), expand = expansion()) +
	facet_wrap(~samplesize, nrow = 2) +
	labs(
	x = "Sample means",
	y = "Count",
	title = "The sampling distributions") +
	theme_icae()
	sampling_plot

	ggsave(plot = sampling_plot,
	filename = here("output/T14/EufPopPlotSamples.pdf"),
	width = 4, height = 3)

	# Information about the population-----
	true_vals <- student_pop %>%
	group_by(Gender) %>%
	summarise(
	Mean=mean(Height), SD=sd(Height)
	)

	true_vals2 <- student_pop %>%
	summarise(
	Mean=mean(Height), SD=sd(Height)
	) %>%
	dplyr::mutate(Gender="total") %>%
	dplyr::select(Gender, Mean, SD)

	true_vals <- rbind(true_vals, true_vals2)
	true_vals

	sample_props <- full_samples %>%
	group_by(samplesize) %>%
	summarise(`Mean`=mean(mean),
	`Variation`=sd(mean)) %>%
	rename(` `=samplesize)
	sample_props
	here::i_am("R/T14-Exercise-2-Solution.R")
	library(here)
	library(purrr)
	library(tibble)
	library(data.table)
	library(ggplot2)
	library(ggpubr)
	library(icaeDesign)

	set.seed(123)
	# Population with 5000 exponentially distributed values
	N <- 5000
	exp_par <- 2

	# Visualization of the 'true population':

	# MCS
	iterations <- 500
	sample_sizes <- c(5, 10, 20, 50)
	population <- rexp(n = N, rate = exp_par)
	population_tab <- tibble("pop_n" = population)
	true_mean <- mean(population)
	mean_dist_list <- list()

	for (i in seq_along(sample_sizes)){
	sample_size <- sample_sizes[i]
	sample_means <- purrr::map_dbl(
	.x = 1:iterations,
	.f = ~mean(sample(population, size = sample_size, replace = FALSE)))
	sample_means_tab <- tibble(
	"sample_size"=sample_size,
	"sample_means"=sample_means
	)
	mean_dist_list[[sample_size]] <- sample_means_tab
	}

	full_results <- rbindlist(mean_dist_list)

	population_plot <- ggplot(data = population_tab, aes(x = pop_n)) +
	geom_histogram(aes(y = ..density..)) +
	scale_y_continuous(expand = expansion(add = c(0, 0.05))) +
	labs(title = "The population", y = "Density") +
	stat_function(fun = dexp, args = list(rate = exp_par)) +
	theme_icae() +
	theme(axis.title.x = element_blank())
	population_plot

	hist_plot <- ggplot(data = full_results, aes(x=sample_means)) +
	geom_histogram(aes(y=..density..), alpha=0.5, color="#00395B", fill="#00395B") +
	scale_y_continuous(expand = expansion(add = c(0, 0.05))) +
	scale_x_continuous(limits = c(-0.2, 1.2), expand = expansion()) +
	facet_wrap(~sample_size, ncol = 4) +
	stat_function(
	data = dplyr::filter(full_results, sample_size==5),
	fun = dnorm,
	args = list(
	mean = mean(dplyr::filter(full_results, sample_size==5)$sample_means),
	sd = sd(dplyr::filter(full_results, sample_size==5)$sample_means))
	) +
	stat_function(
	data = dplyr::filter(full_results, sample_size==10),
	fun = dnorm,
	args = list(
	mean = mean(dplyr::filter(full_results, sample_size==10)$sample_means),
	sd = sd(dplyr::filter(full_results, sample_size==10)$sample_means))
	) +
	stat_function(
	data = dplyr::filter(full_results, sample_size==20),
	fun = dnorm,
	args = list(
	mean = mean(dplyr::filter(full_results, sample_size==20)$sample_means),
	sd = sd(dplyr::filter(full_results, sample_size==20)$sample_means))
	) +
	stat_function(
	data = dplyr::filter(full_results, sample_size==50),
	fun = dnorm,
	args = list(
	mean = mean(dplyr::filter(full_results, sample_size==50)$sample_means),
	sd = sd(dplyr::filter(full_results, sample_size==50)$sample_means))
	) +
	labs(x = "Sample means", y = "Density", title = "The sampling distributions") +
	theme_icae()

	full_plot <- ggarrange(population_plot, hist_plot, nrow = 2)
	ggsave(plot = full_plot, filename = here("output/T14/clt-example.pdf"),
	width = 4, height = 3)
	# A for loop
	for (i in seq_len(10)){
	print(i)
	}

	# Below we illustrate two techniques for iteration using a simple example:
	# simulate throwing a dice

	# 1st step: define the random process and define general parameters
	nb_iterations <- 10
	sample(1:6, size = 1)

	# 2nd step: iterate the process (variant: for-loop)
	result_container <- rep(NA, nb_iterations)
	set.seed(123)
	for (i in seq_len(nb_iterations)){
	result_container[i] <- sample(1:6, size = 1)
	}
	result_container
	result_tab <- tibble("draw"=factor(result_container, levels = 1:6))

	# 2nd step: iterate the process (variant: purrr package)
	library(purrr)
	set.seed(123)
	result_container_purrr <- map_dbl(
	.x = seq_len(nb_iterations),
	.f = ~sample(1:6, size = 1)
	)

	result_container_purrr

	# 3rd step: visualize and analyze the result
	ggplot(data = result_tab, mapping = aes(x=draw)) +
	geom_bar() +
	scale_y_continuous(expand = expansion()) +
	scale_x_discrete(drop=FALSE) + # To show also numbers that were not thrown
	theme_bw()