Skip to content

Instantly share code, notes, and snippets.

@graebnerc
Last active June 7, 2024 14:59
Show Gist options
  • Save graebnerc/5567989fe275f6eefbe6ea2dafe1a750 to your computer and use it in GitHub Desktop.
Save graebnerc/5567989fe275f6eefbe6ea2dafe1a750 to your computer and use it in GitHub Desktop.
Code used during the lecture on sampling.
Code used during the lecture on sampling.
base_object <- c("a", "b", "c", "d")
result_container <- rep(NA, length(base_object))
for (i in base_object){
print(i)
}
# Always better to use indices:
# (note: function "paste" used to add explanation)
for (i in seq_along(base_object)) {
print(paste("Iteration:", i))
print(paste("Element of base_object:", base_object[i]))
}
# Note: you do not need to use "i" in the action sequence, e.g. if you are just
# interested in repeating a certain action several times:
for (i in seq(1, 10)){
print(i)
print(sample(base_object, size = 1)) # Draws a random element from base_object
}
# Write a for-loop that loops over the vector c(1,2,3,4,5,99)
#. and computes the square root for each element.
base_object <- c(1,2,3,4,5,99)
# Note: yes, you can do this via vectorization
sqrt(base_object)
# but we use this as a simple example to illustrate the idea
# 1. Output container
output_container <- rep(NA, length(base_object))
# 2. Looping sequence
for (i in seq_along(base_object)){# 3. Action body
output_container[i] <- sqrt(base_object[i])
}
output_container
library(tibble)
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
# More elaborate example: the ball pid-----------
# Note: for a slightly more elegant solution using a function in the loop,
#. please read the tutorial on sampling
# First step: create the artificial ball pid------
ball_pid_size <- 5000
ball_pid_share_white <- 0.65
white_balls <- as.integer(ball_pid_share_white*ball_pid_size)
grey_balls <- ball_pid_size - white_balls
ball_pid_colors <- c(rep("white", white_balls), rep("grey", grey_balls))
ball_pid <- tibble::tibble(
id = seq(1, ball_pid_size),
color = sample(ball_pid_colors)
)
head(ball_pid)
## Conduct the simulation for only the case with sample size 20-----------
## Conduct the simulation--------------
n_samples <- 1000 # The number of iterations we want to get
results_n20 <- rep(NA, n_samples) # The output container
# Since n_samples is a single number, we use seq_len() instead of seq_along()
for (i in seq_len(n_samples)){
# Draw a sample of size 20:
sample_drawn <- sample(x = ball_pid$color, size = 20)
# Compute the share of white balls within this sample:
share_white <- sum(sample_drawn=="white")/length(sample_drawn)
# Write into output container:
results_n20[i] <- share_white
}
# Compute mean and standard deviation of the sample distribution-----
mean(results_n20)
sd(results_n20)
## Visualize the result-----------
hist_visualization <- ggplot(
data = tibble(results_n20),
mapping = aes(x=results_n20)
) +
geom_histogram(binwidth = 0.02, fill="#00395B", alpha=0.75) +
scale_y_continuous(expand = expansion(add = c(0, 10))) +
scale_x_continuous(labels = percent_format()) +
labs(
x = "Share of white balls",
y = "Number of samples",
title = "True share: 65%") +
geom_vline(xintercept = 0.65) +
theme_linedraw() +
theme(
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank())
hist_visualization
library(tibble)
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
# Create the population--------
ball_pid_size <- 5000
ball_pid_share_white <- 0.65
white_balls <- as.integer(ball_pid_share_white*ball_pid_size)
grey_balls <- ball_pid_size - white_balls
ball_pid_colors <- c(rep("white", white_balls), rep("grey", grey_balls))
ball_pid <- tibble::tibble(
id = seq(1, ball_pid_size),
color = sample(ball_pid_colors)
)
# Conduct the simulation--------------
n_samples <- 1000 # The number of iterations we want to get
results_n20 <- rep(NA, n_samples) # The output container for n=20
results_n50 <- rep(NA, n_samples) # The output container for n=50
results_n100 <- rep(NA, n_samples) # The output container for n=100
# Since n_samples is a single number, we use seq_len() instead of seq_along()
for (i in seq_len(n_samples)){
# Draw samples of the respective sizes:
sample_drawn_20 <- sample(x = ball_pid$color, size = 20)
sample_drawn_50 <- sample(x = ball_pid$color, size = 50)
sample_drawn_100 <- sample(x = ball_pid$color, size = 100)
# Compute the share of white balls within this sample:
share_white_20 <- sum(sample_drawn_20=="white")/length(sample_drawn_20)
share_white_50 <- sum(sample_drawn_50=="white")/length(sample_drawn_50)
share_white_100 <- sum(sample_drawn_100=="white")/length(sample_drawn_100)
# Write into output container:
results_n20[i] <- share_white_20
results_n50[i] <- share_white_50
results_n100[i] <- share_white_100
}
# Combine all three cases in one tibble:
result_table <- tibble(
sample_size20 = results_n20,
sample_size50 = results_n50,
sample_size100 = results_n100
)
# Compute mean and standard deviation of the sample distribution-----
result_table %>%
pivot_longer(
cols = everything(),
names_to = "Sample size",
values_to = "Values") %>%
summarise(
`Mean share of whites`=mean(Values), `Variation`=sd(Values),
.by = "Sample size")
# Visualize the result-----------
hist_visualization <- result_table %>%
pivot_longer(
cols = everything(),
names_to = "Sample size",
values_to = "Values") %>%
mutate(`Sample size` = as.integer(gsub(
x = `Sample size`, pattern = "sample_size", replacement = "")) # To remove string part
) %>%
ggplot(
data = .,
mapping = aes(x=Values)
) +
geom_histogram(binwidth = 0.02, fill="#00395B", alpha=0.75) +
scale_y_continuous(expand = expansion(add = c(0, 10))) +
scale_x_continuous(labels = percent_format()) +
labs(
x = "Share of white balls",
y = "Number of samples",
title = "True share: 65%") +
geom_vline(xintercept = 0.65) +
facet_wrap(~`Sample size`, scales = "fixed") +
theme_linedraw() +
theme(
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank())
hist_visualization
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment