Tadge-Analytics/birthday_problem

## birthday_problem
library(tidyverse)

# Let's generate our different trials data
# and save to rds

test_if_sameness <- function(size_of_group) {
  tibble(person_id = 1:size_of_group) %>%
    mutate(birthday = sample(1:365, n(), replace = T)) %>%
    mutate(other_dates = list(rename(., person_id1 = person_id,
                                     birthday1 = birthday))) %>%
    unnest(other_dates) %>%
    filter(person_id != person_id1) %>%
    filter(birthday == birthday1) %>%
    nrow() > 0
}


bday_sample <- function(num_repeated_trials, sizes_of_groups, number_of_trials) {

  tibble(group_size = sizes_of_groups) %>%
    crossing(trial_id = 1:number_of_trials,
             num_repeated_trials = 1:num_repeated_trials) %>%
    mutate(outcome = furrr::future_map_lgl(group_size, ~test_if_sameness(.x))) %>%
    group_by(group_size, num_repeated_trials) %>%
    summarise(count_of_success = sum(outcome),
              percent_success = count_of_success/number_of_trials)

}


library(furrr)
future::plan(multiprocess)

set.seed(123)
first <- bday_sample(1, 1:60, 1000)

set.seed(123)
second <- bday_sample(20, 22:24, 1000)

set.seed(123)
big_sample <- bday_sample(20, 23, 10000)


write_rds(first, "first.rds")
write_rds(second, "second.rds")
write_rds(big_sample, "big_sample.rds")


#############################################

library(tidyverse)

# let's start our analysis

first <- read_rds("first.rds")
second <- read_rds("second.rds")
big_sample <- read_rds("big_sample.rds")


first %>%
  ggplot() +
  aes(group_size, percent_success) +
  geom_line(stat = "identity")


data_for_plot <- second %>%
  group_by(group_size) %>%
  summarise(mean_success = mean(percent_success),
            sd_success = sd(percent_success),
            num_repeated_trials = n(),
            over_50 = sum(percent_success > 0.5),
            probability_chance_is_over_50 = over_50/num_repeated_trials)


second %>%
  ggplot() +
  aes(percent_success) +
  geom_histogram(aes(fill = percent_success > 0.5)) +
  facet_grid(. ~ group_size) +
  labs(fill = "Over 50%") +
  guides(fill = guide_legend(reverse=T)) +
  geom_vline(data = data_for_plot, aes(xintercept = mean_success), col = 'black', linetype = "dashed", alpha = 0.3, size = 1) +
  geom_label(data = data_for_plot, aes(mean_success, y = 3.5, label = paste0("mean = \n", round(mean_success, 3))))


# Assuming normality, here are the distributions
tibble(x = c(0.40, 0.60)) %>%
  ggplot() +
  aes(x) +
  stat_function(fun = dnorm, args = list(mean = data_for_plot[[1, 2]], sd = data_for_plot[[1, 3]]), col = "red") +
  stat_function(fun = dnorm, args = list(mean = data_for_plot[[2, 2]], sd = data_for_plot[[2, 3]]), col = "blue") +
  stat_function(fun = dnorm, args = list(mean = data_for_plot[[3, 2]], sd = data_for_plot[[3, 3]]), col = "black")


# for shading area under the curve.
stat_function(fun = dnorm, args = list(mean = data_for_plot[[2, 2]], sd = data_for_plot[[2, 3]]),
                xlim = c(0.474, 0.545), geom = "area", fill = "blue", alpha = .2) +


# for comparison of 23-group with 1,000 trials to the 10,000 trials
new_data_for_plot <- bind_rows(data_for_plot %>% filter(group_size == 23),
                                   big_sample %>%
                                     group_by(group_size) %>%
                                     summarise(mean_success = mean(percent_success),
                                               sd_success = sd(percent_success),
                                               num_repeated_trials = n(),
                                               over_50 = sum(percent_success > 0.5),
                                               probability_chance_is_over_50 = over_50/num_repeated_trials))


tibble(x = c(0.40, 0.60)) %>%
  ggplot() +
  aes(x) +
  stat_function(fun = dnorm, args = list(mean = new_data_for_plot[[1, 2]], sd = new_data_for_plot[[1, 3]]), col = "blue") +
  stat_function(fun = dnorm, args = list(mean = new_data_for_plot[[2, 2]], sd = new_data_for_plot[[2, 3]]), col = "red")


# function for working out area under the curve
pnorm()
	library(tidyverse)

	# Let's generate our different trials data
	# and save to rds

	test_if_sameness <- function(size_of_group) {
	tibble(person_id = 1:size_of_group) %>%
	mutate(birthday = sample(1:365, n(), replace = T)) %>%
	mutate(other_dates = list(rename(., person_id1 = person_id,
	birthday1 = birthday))) %>%
	unnest(other_dates) %>%
	filter(person_id != person_id1) %>%
	filter(birthday == birthday1) %>%
	nrow() > 0
	}



	bday_sample <- function(num_repeated_trials, sizes_of_groups, number_of_trials) {

	tibble(group_size = sizes_of_groups) %>%
	crossing(trial_id = 1:number_of_trials,
	num_repeated_trials = 1:num_repeated_trials) %>%
	mutate(outcome = furrr::future_map_lgl(group_size, ~test_if_sameness(.x))) %>%
	group_by(group_size, num_repeated_trials) %>%
	summarise(count_of_success = sum(outcome),
	percent_success = count_of_success/number_of_trials)

	}



	library(furrr)
	future::plan(multiprocess)

	set.seed(123)
	first <- bday_sample(1, 1:60, 1000)

	set.seed(123)
	second <- bday_sample(20, 22:24, 1000)

	set.seed(123)
	big_sample <- bday_sample(20, 23, 10000)



	write_rds(first, "first.rds")
	write_rds(second, "second.rds")
	write_rds(big_sample, "big_sample.rds")



	#############################################

	library(tidyverse)

	# let's start our analysis

	first <- read_rds("first.rds")
	second <- read_rds("second.rds")
	big_sample <- read_rds("big_sample.rds")



	first %>%
	ggplot() +
	aes(group_size, percent_success) +
	geom_line(stat = "identity")



	data_for_plot <- second %>%
	group_by(group_size) %>%
	summarise(mean_success = mean(percent_success),
	sd_success = sd(percent_success),
	num_repeated_trials = n(),
	over_50 = sum(percent_success > 0.5),
	probability_chance_is_over_50 = over_50/num_repeated_trials)



	second %>%
	ggplot() +
	aes(percent_success) +
	geom_histogram(aes(fill = percent_success > 0.5)) +
	facet_grid(. ~ group_size) +
	labs(fill = "Over 50%") +
	guides(fill = guide_legend(reverse=T)) +
	geom_vline(data = data_for_plot, aes(xintercept = mean_success), col = 'black', linetype = "dashed", alpha = 0.3, size = 1) +
	geom_label(data = data_for_plot, aes(mean_success, y = 3.5, label = paste0("mean = \n", round(mean_success, 3))))



	# Assuming normality, here are the distributions
	tibble(x = c(0.40, 0.60)) %>%
	ggplot() +
	aes(x) +
	stat_function(fun = dnorm, args = list(mean = data_for_plot[[1, 2]], sd = data_for_plot[[1, 3]]), col = "red") +
	stat_function(fun = dnorm, args = list(mean = data_for_plot[[2, 2]], sd = data_for_plot[[2, 3]]), col = "blue") +
	stat_function(fun = dnorm, args = list(mean = data_for_plot[[3, 2]], sd = data_for_plot[[3, 3]]), col = "black")




	# for shading area under the curve.
	stat_function(fun = dnorm, args = list(mean = data_for_plot[[2, 2]], sd = data_for_plot[[2, 3]]),
	xlim = c(0.474, 0.545), geom = "area", fill = "blue", alpha = .2) +





	# for comparison of 23-group with 1,000 trials to the 10,000 trials
	new_data_for_plot <- bind_rows(data_for_plot %>% filter(group_size == 23),
	big_sample %>%
	group_by(group_size) %>%
	summarise(mean_success = mean(percent_success),
	sd_success = sd(percent_success),
	num_repeated_trials = n(),
	over_50 = sum(percent_success > 0.5),
	probability_chance_is_over_50 = over_50/num_repeated_trials))


	tibble(x = c(0.40, 0.60)) %>%
	ggplot() +
	aes(x) +
	stat_function(fun = dnorm, args = list(mean = new_data_for_plot[[1, 2]], sd = new_data_for_plot[[1, 3]]), col = "blue") +
	stat_function(fun = dnorm, args = list(mean = new_data_for_plot[[2, 2]], sd = new_data_for_plot[[2, 3]]), col = "red")



	# function for working out area under the curve
	pnorm()