graebnerc/# Sampling

## # Sampling
Code used during the lecture on sampling.

## 0_for-loop-example.R
base_object <- c("a", "b", "c", "d")
result_container <- rep(NA, length(base_object))

for (i in base_object){
  print(i)
}

# Always better to use indices:
# (note: function "paste" used to add explanation)
for (i in seq_along(base_object)) {
  print(paste("Iteration:", i))
  print(paste("Element of base_object:", base_object[i]))
}

# Note: you do not need to use "i" in the action sequence, e.g. if you are just
#  interested in repeating a certain action several times:
for (i in seq(1, 10)){
  print(i)
  print(sample(base_object, size = 1)) # Draws a random element from base_object
}

## 1_exercise-1_n20.R
# Write a for-loop that loops over the vector c(1,2,3,4,5,99)
#. and computes the square root for each element.

base_object <- c(1,2,3,4,5,99)
# Note: yes, you can do this via vectorization
sqrt(base_object)
# but we use this as a simple example to illustrate the idea

# 1. Output container
output_container <- rep(NA, length(base_object))

# 2. Looping sequence
for (i in seq_along(base_object)){# 3. Action body
  output_container[i] <- sqrt(base_object[i])
}
output_container

## 2_ball-pid-example_n20.R
library(tibble)
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)

# More elaborate example: the ball pid-----------
# Note: for a slightly more elegant solution using a function in the loop,
#. please read the tutorial on sampling

# First step: create the artificial ball pid------

ball_pid_size <- 5000
ball_pid_share_white <- 0.65
white_balls <- as.integer(ball_pid_share_white*ball_pid_size)
grey_balls <- ball_pid_size - white_balls
ball_pid_colors <- c(rep("white", white_balls), rep("grey", grey_balls))

ball_pid <- tibble::tibble(
  id = seq(1, ball_pid_size),
  color = sample(ball_pid_colors)
)
head(ball_pid)

## Conduct the simulation for only the case with sample size 20-----------
## Conduct the simulation--------------
n_samples <- 1000 # The number of iterations we want to get
results_n20 <- rep(NA, n_samples) # The output container

# Since n_samples is a single number, we use seq_len() instead of seq_along()
for (i in seq_len(n_samples)){
  # Draw a sample of size 20:
  sample_drawn <- sample(x = ball_pid$color, size = 20)
  # Compute the share of white balls within this sample:
  share_white <- sum(sample_drawn=="white")/length(sample_drawn)
  # Write into output container:
  results_n20[i] <- share_white
}

# Compute mean and standard deviation of the sample distribution-----
mean(results_n20)
sd(results_n20)

## Visualize the result-----------
hist_visualization <- ggplot(
    data = tibble(results_n20),
    mapping = aes(x=results_n20)
    ) +
  geom_histogram(binwidth = 0.02, fill="#00395B", alpha=0.75) +
  scale_y_continuous(expand = expansion(add = c(0, 10))) +
  scale_x_continuous(labels = percent_format()) +
  labs(
    x = "Share of white balls",
    y = "Number of samples",
    title = "True share: 65%") +
  geom_vline(xintercept = 0.65) +
  theme_linedraw() +
  theme(
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank())
hist_visualization

## 3_ball-pid-example_full.R
library(tibble)
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)

# Create the population--------
ball_pid_size <- 5000
ball_pid_share_white <- 0.65
white_balls <- as.integer(ball_pid_share_white*ball_pid_size)
grey_balls <- ball_pid_size - white_balls
ball_pid_colors <- c(rep("white", white_balls), rep("grey", grey_balls))

ball_pid <- tibble::tibble(
  id = seq(1, ball_pid_size),
  color = sample(ball_pid_colors)
)

# Conduct the simulation--------------
n_samples <- 1000 # The number of iterations we want to get
results_n20 <- rep(NA, n_samples) # The output container for n=20
results_n50 <- rep(NA, n_samples) # The output container for n=50
results_n100 <- rep(NA, n_samples) # The output container for n=100

# Since n_samples is a single number, we use seq_len() instead of seq_along()
for (i in seq_len(n_samples)){
  # Draw samples of the respective sizes:
  sample_drawn_20 <- sample(x = ball_pid$color, size = 20)
  sample_drawn_50 <- sample(x = ball_pid$color, size = 50)
  sample_drawn_100 <- sample(x = ball_pid$color, size = 100)
  # Compute the share of white balls within this sample:
  share_white_20 <- sum(sample_drawn_20=="white")/length(sample_drawn_20)
  share_white_50 <- sum(sample_drawn_50=="white")/length(sample_drawn_50)
  share_white_100 <- sum(sample_drawn_100=="white")/length(sample_drawn_100)
  # Write into output container:
  results_n20[i] <- share_white_20
  results_n50[i] <- share_white_50
  results_n100[i] <- share_white_100
}

# Combine all three cases in one tibble:
result_table <- tibble(
  sample_size20 = results_n20,
  sample_size50 = results_n50,
  sample_size100 = results_n100
)

# Compute mean and standard deviation of the sample distribution-----
result_table %>%
  pivot_longer(
    cols = everything(),
    names_to = "Sample size",
    values_to = "Values") %>%
  summarise(
    `Mean share of whites`=mean(Values), `Variation`=sd(Values),
    .by = "Sample size")

# Visualize the result-----------
hist_visualization <- result_table %>%
  pivot_longer(
    cols = everything(),
    names_to = "Sample size",
    values_to = "Values") %>%
  mutate(`Sample size` = as.integer(gsub(
    x = `Sample size`, pattern = "sample_size", replacement = "")) # To remove string part
  ) %>%
  ggplot(
    data = .,
    mapping = aes(x=Values)
  ) +
  geom_histogram(binwidth = 0.02, fill="#00395B", alpha=0.75) +
  scale_y_continuous(expand = expansion(add = c(0, 10))) +
  scale_x_continuous(labels = percent_format()) +
  labs(
    x = "Share of white balls",
    y = "Number of samples",
    title = "True share: 65%") +
  geom_vline(xintercept = 0.65) +
  facet_wrap(~`Sample size`, scales = "fixed") +
  theme_linedraw() +
  theme(
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank())
hist_visualization
	base_object <- c("a", "b", "c", "d")
	result_container <- rep(NA, length(base_object))

	for (i in base_object){
	print(i)
	}

	# Always better to use indices:
	# (note: function "paste" used to add explanation)
	for (i in seq_along(base_object)) {
	print(paste("Iteration:", i))
	print(paste("Element of base_object:", base_object[i]))
	}

	# Note: you do not need to use "i" in the action sequence, e.g. if you are just
	# interested in repeating a certain action several times:
	for (i in seq(1, 10)){
	print(i)
	print(sample(base_object, size = 1)) # Draws a random element from base_object
	}
	# Write a for-loop that loops over the vector c(1,2,3,4,5,99)
	#. and computes the square root for each element.

	base_object <- c(1,2,3,4,5,99)
	# Note: yes, you can do this via vectorization
	sqrt(base_object)
	# but we use this as a simple example to illustrate the idea

	# 1. Output container
	output_container <- rep(NA, length(base_object))

	# 2. Looping sequence
	for (i in seq_along(base_object)){# 3. Action body
	output_container[i] <- sqrt(base_object[i])
	}
	output_container
	library(tibble)
	library(dplyr)
	library(tidyr)
	library(ggplot2)
	library(scales)

	# More elaborate example: the ball pid-----------
	# Note: for a slightly more elegant solution using a function in the loop,
	#. please read the tutorial on sampling

	# First step: create the artificial ball pid------

	ball_pid_size <- 5000
	ball_pid_share_white <- 0.65
	white_balls <- as.integer(ball_pid_share_white*ball_pid_size)
	grey_balls <- ball_pid_size - white_balls
	ball_pid_colors <- c(rep("white", white_balls), rep("grey", grey_balls))

	ball_pid <- tibble::tibble(
	id = seq(1, ball_pid_size),
	color = sample(ball_pid_colors)
	)
	head(ball_pid)

	## Conduct the simulation for only the case with sample size 20-----------
	## Conduct the simulation--------------
	n_samples <- 1000 # The number of iterations we want to get
	results_n20 <- rep(NA, n_samples) # The output container

	# Since n_samples is a single number, we use seq_len() instead of seq_along()
	for (i in seq_len(n_samples)){
	# Draw a sample of size 20:
	sample_drawn <- sample(x = ball_pid$color, size = 20)
	# Compute the share of white balls within this sample:
	share_white <- sum(sample_drawn=="white")/length(sample_drawn)
	# Write into output container:
	results_n20[i] <- share_white
	}

	# Compute mean and standard deviation of the sample distribution-----
	mean(results_n20)
	sd(results_n20)

	## Visualize the result-----------
	hist_visualization <- ggplot(
	data = tibble(results_n20),
	mapping = aes(x=results_n20)
	) +
	geom_histogram(binwidth = 0.02, fill="#00395B", alpha=0.75) +
	scale_y_continuous(expand = expansion(add = c(0, 10))) +
	scale_x_continuous(labels = percent_format()) +
	labs(
	x = "Share of white balls",
	y = "Number of samples",
	title = "True share: 65%") +
	geom_vline(xintercept = 0.65) +
	theme_linedraw() +
	theme(
	panel.grid.minor.x = element_blank(),
	panel.grid.minor.y = element_blank())
	hist_visualization