Last active
June 7, 2024 14:59
-
-
Save graebnerc/5567989fe275f6eefbe6ea2dafe1a750 to your computer and use it in GitHub Desktop.
Code used during the lecture on sampling.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Code used during the lecture on sampling. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
base_object <- c("a", "b", "c", "d") | |
result_container <- rep(NA, length(base_object)) | |
for (i in base_object){ | |
print(i) | |
} | |
# Always better to use indices: | |
# (note: function "paste" used to add explanation) | |
for (i in seq_along(base_object)) { | |
print(paste("Iteration:", i)) | |
print(paste("Element of base_object:", base_object[i])) | |
} | |
# Note: you do not need to use "i" in the action sequence, e.g. if you are just | |
# interested in repeating a certain action several times: | |
for (i in seq(1, 10)){ | |
print(i) | |
print(sample(base_object, size = 1)) # Draws a random element from base_object | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Write a for-loop that loops over the vector c(1,2,3,4,5,99) | |
#. and computes the square root for each element. | |
base_object <- c(1,2,3,4,5,99) | |
# Note: yes, you can do this via vectorization | |
sqrt(base_object) | |
# but we use this as a simple example to illustrate the idea | |
# 1. Output container | |
output_container <- rep(NA, length(base_object)) | |
# 2. Looping sequence | |
for (i in seq_along(base_object)){# 3. Action body | |
output_container[i] <- sqrt(base_object[i]) | |
} | |
output_container |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tibble) | |
library(dplyr) | |
library(tidyr) | |
library(ggplot2) | |
library(scales) | |
# More elaborate example: the ball pid----------- | |
# Note: for a slightly more elegant solution using a function in the loop, | |
#. please read the tutorial on sampling | |
# First step: create the artificial ball pid------ | |
ball_pid_size <- 5000 | |
ball_pid_share_white <- 0.65 | |
white_balls <- as.integer(ball_pid_share_white*ball_pid_size) | |
grey_balls <- ball_pid_size - white_balls | |
ball_pid_colors <- c(rep("white", white_balls), rep("grey", grey_balls)) | |
ball_pid <- tibble::tibble( | |
id = seq(1, ball_pid_size), | |
color = sample(ball_pid_colors) | |
) | |
head(ball_pid) | |
## Conduct the simulation for only the case with sample size 20----------- | |
## Conduct the simulation-------------- | |
n_samples <- 1000 # The number of iterations we want to get | |
results_n20 <- rep(NA, n_samples) # The output container | |
# Since n_samples is a single number, we use seq_len() instead of seq_along() | |
for (i in seq_len(n_samples)){ | |
# Draw a sample of size 20: | |
sample_drawn <- sample(x = ball_pid$color, size = 20) | |
# Compute the share of white balls within this sample: | |
share_white <- sum(sample_drawn=="white")/length(sample_drawn) | |
# Write into output container: | |
results_n20[i] <- share_white | |
} | |
# Compute mean and standard deviation of the sample distribution----- | |
mean(results_n20) | |
sd(results_n20) | |
## Visualize the result----------- | |
hist_visualization <- ggplot( | |
data = tibble(results_n20), | |
mapping = aes(x=results_n20) | |
) + | |
geom_histogram(binwidth = 0.02, fill="#00395B", alpha=0.75) + | |
scale_y_continuous(expand = expansion(add = c(0, 10))) + | |
scale_x_continuous(labels = percent_format()) + | |
labs( | |
x = "Share of white balls", | |
y = "Number of samples", | |
title = "True share: 65%") + | |
geom_vline(xintercept = 0.65) + | |
theme_linedraw() + | |
theme( | |
panel.grid.minor.x = element_blank(), | |
panel.grid.minor.y = element_blank()) | |
hist_visualization |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tibble) | |
library(dplyr) | |
library(tidyr) | |
library(ggplot2) | |
library(scales) | |
# Create the population-------- | |
ball_pid_size <- 5000 | |
ball_pid_share_white <- 0.65 | |
white_balls <- as.integer(ball_pid_share_white*ball_pid_size) | |
grey_balls <- ball_pid_size - white_balls | |
ball_pid_colors <- c(rep("white", white_balls), rep("grey", grey_balls)) | |
ball_pid <- tibble::tibble( | |
id = seq(1, ball_pid_size), | |
color = sample(ball_pid_colors) | |
) | |
# Conduct the simulation-------------- | |
n_samples <- 1000 # The number of iterations we want to get | |
results_n20 <- rep(NA, n_samples) # The output container for n=20 | |
results_n50 <- rep(NA, n_samples) # The output container for n=50 | |
results_n100 <- rep(NA, n_samples) # The output container for n=100 | |
# Since n_samples is a single number, we use seq_len() instead of seq_along() | |
for (i in seq_len(n_samples)){ | |
# Draw samples of the respective sizes: | |
sample_drawn_20 <- sample(x = ball_pid$color, size = 20) | |
sample_drawn_50 <- sample(x = ball_pid$color, size = 50) | |
sample_drawn_100 <- sample(x = ball_pid$color, size = 100) | |
# Compute the share of white balls within this sample: | |
share_white_20 <- sum(sample_drawn_20=="white")/length(sample_drawn_20) | |
share_white_50 <- sum(sample_drawn_50=="white")/length(sample_drawn_50) | |
share_white_100 <- sum(sample_drawn_100=="white")/length(sample_drawn_100) | |
# Write into output container: | |
results_n20[i] <- share_white_20 | |
results_n50[i] <- share_white_50 | |
results_n100[i] <- share_white_100 | |
} | |
# Combine all three cases in one tibble: | |
result_table <- tibble( | |
sample_size20 = results_n20, | |
sample_size50 = results_n50, | |
sample_size100 = results_n100 | |
) | |
# Compute mean and standard deviation of the sample distribution----- | |
result_table %>% | |
pivot_longer( | |
cols = everything(), | |
names_to = "Sample size", | |
values_to = "Values") %>% | |
summarise( | |
`Mean share of whites`=mean(Values), `Variation`=sd(Values), | |
.by = "Sample size") | |
# Visualize the result----------- | |
hist_visualization <- result_table %>% | |
pivot_longer( | |
cols = everything(), | |
names_to = "Sample size", | |
values_to = "Values") %>% | |
mutate(`Sample size` = as.integer(gsub( | |
x = `Sample size`, pattern = "sample_size", replacement = "")) # To remove string part | |
) %>% | |
ggplot( | |
data = ., | |
mapping = aes(x=Values) | |
) + | |
geom_histogram(binwidth = 0.02, fill="#00395B", alpha=0.75) + | |
scale_y_continuous(expand = expansion(add = c(0, 10))) + | |
scale_x_continuous(labels = percent_format()) + | |
labs( | |
x = "Share of white balls", | |
y = "Number of samples", | |
title = "True share: 65%") + | |
geom_vline(xintercept = 0.65) + | |
facet_wrap(~`Sample size`, scales = "fixed") + | |
theme_linedraw() + | |
theme( | |
panel.grid.minor.x = element_blank(), | |
panel.grid.minor.y = element_blank()) | |
hist_visualization |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment