Skip to content

Instantly share code, notes, and snippets.

@Tadge-Analytics
Created October 29, 2019 00:08
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save Tadge-Analytics/ba2c045f2a8f327aab9672e361cda746 to your computer and use it in GitHub Desktop.
Save Tadge-Analytics/ba2c045f2a8f327aab9672e361cda746 to your computer and use it in GitHub Desktop.
library(tidyverse)
# Let's generate our different trials data
# and save to rds
test_if_sameness <- function(size_of_group) {
tibble(person_id = 1:size_of_group) %>%
mutate(birthday = sample(1:365, n(), replace = T)) %>%
mutate(other_dates = list(rename(., person_id1 = person_id,
birthday1 = birthday))) %>%
unnest(other_dates) %>%
filter(person_id != person_id1) %>%
filter(birthday == birthday1) %>%
nrow() > 0
}
bday_sample <- function(num_repeated_trials, sizes_of_groups, number_of_trials) {
tibble(group_size = sizes_of_groups) %>%
crossing(trial_id = 1:number_of_trials,
num_repeated_trials = 1:num_repeated_trials) %>%
mutate(outcome = furrr::future_map_lgl(group_size, ~test_if_sameness(.x))) %>%
group_by(group_size, num_repeated_trials) %>%
summarise(count_of_success = sum(outcome),
percent_success = count_of_success/number_of_trials)
}
library(furrr)
future::plan(multiprocess)
set.seed(123)
first <- bday_sample(1, 1:60, 1000)
set.seed(123)
second <- bday_sample(20, 22:24, 1000)
set.seed(123)
big_sample <- bday_sample(20, 23, 10000)
write_rds(first, "first.rds")
write_rds(second, "second.rds")
write_rds(big_sample, "big_sample.rds")
#############################################
library(tidyverse)
# let's start our analysis
first <- read_rds("first.rds")
second <- read_rds("second.rds")
big_sample <- read_rds("big_sample.rds")
first %>%
ggplot() +
aes(group_size, percent_success) +
geom_line(stat = "identity")
data_for_plot <- second %>%
group_by(group_size) %>%
summarise(mean_success = mean(percent_success),
sd_success = sd(percent_success),
num_repeated_trials = n(),
over_50 = sum(percent_success > 0.5),
probability_chance_is_over_50 = over_50/num_repeated_trials)
second %>%
ggplot() +
aes(percent_success) +
geom_histogram(aes(fill = percent_success > 0.5)) +
facet_grid(. ~ group_size) +
labs(fill = "Over 50%") +
guides(fill = guide_legend(reverse=T)) +
geom_vline(data = data_for_plot, aes(xintercept = mean_success), col = 'black', linetype = "dashed", alpha = 0.3, size = 1) +
geom_label(data = data_for_plot, aes(mean_success, y = 3.5, label = paste0("mean = \n", round(mean_success, 3))))
# Assuming normality, here are the distributions
tibble(x = c(0.40, 0.60)) %>%
ggplot() +
aes(x) +
stat_function(fun = dnorm, args = list(mean = data_for_plot[[1, 2]], sd = data_for_plot[[1, 3]]), col = "red") +
stat_function(fun = dnorm, args = list(mean = data_for_plot[[2, 2]], sd = data_for_plot[[2, 3]]), col = "blue") +
stat_function(fun = dnorm, args = list(mean = data_for_plot[[3, 2]], sd = data_for_plot[[3, 3]]), col = "black")
# for shading area under the curve.
stat_function(fun = dnorm, args = list(mean = data_for_plot[[2, 2]], sd = data_for_plot[[2, 3]]),
xlim = c(0.474, 0.545), geom = "area", fill = "blue", alpha = .2) +
# for comparison of 23-group with 1,000 trials to the 10,000 trials
new_data_for_plot <- bind_rows(data_for_plot %>% filter(group_size == 23),
big_sample %>%
group_by(group_size) %>%
summarise(mean_success = mean(percent_success),
sd_success = sd(percent_success),
num_repeated_trials = n(),
over_50 = sum(percent_success > 0.5),
probability_chance_is_over_50 = over_50/num_repeated_trials))
tibble(x = c(0.40, 0.60)) %>%
ggplot() +
aes(x) +
stat_function(fun = dnorm, args = list(mean = new_data_for_plot[[1, 2]], sd = new_data_for_plot[[1, 3]]), col = "blue") +
stat_function(fun = dnorm, args = list(mean = new_data_for_plot[[2, 2]], sd = new_data_for_plot[[2, 3]]), col = "red")
# function for working out area under the curve
pnorm()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment