Skip to content

Instantly share code, notes, and snippets.

@ismayc
Last active July 20, 2017 15:00
Show Gist options
  • Save ismayc/74eee31d4b9f63f9b62e9d7e411b5628 to your computer and use it in GitHub Desktop.
Save ismayc/74eee31d4b9f63f9b62e9d7e411b5628 to your computer and use it in GitHub Desktop.
library(okcupiddata)
library(stringr)
# devtools::install_github("andrewpbray/infer")
library(infer)
library(dplyr)
library(ggplot2)
set.seed(2017)
prof_small <- profiles %>%
sample_n(size = 500) %>%
mutate(frisco = case_when(
str_detect(location, "san fran") ~ "san fran",
!str_detect(location, "san fran") ~ "not san fran"
)) %>%
select(age, sex, frisco, drugs, height, status)
## H_0: \mu_f - \mu_m = 0
## H_A: \mu_f - \mu_m < 0
# Calculate observed test statistic
obs_stat <- prof_small %>%
group_by(sex) %>%
summarize(mean_age = mean(age)) %>%
summarize(diff_mean = diff(mean_age)) %>%
pull(diff_mean)
# Not sold on using `pull()` here but without it, you'll
# need to use $ to get to obs_stat$diff_mean.
# To visualize currently without geom_vline() in visualize()
prof_small %>%
specify(age ~ sex) %>% # alt: response = age, explanatory = sex
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in means") %>%
visualize() +
geom_vline(xintercept = obs_stat, color = "blue")
# Eventually this could look something like
prof_small %>%
specify(age ~ sex) %>% # alt: response = age, explanatory = sex
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in means") %>%
# Totally spit-balling here
visualize(tail = "left", extreme_as = obs_stat)
# Or maybe it is better to save the null distribution
# And then plot and calculate the p-value?
null_distn <- prof_small %>%
specify(age ~ sex) %>% # alt: response = age, explanatory = sex
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in means")
ggplot(data = null_distn, mapping = aes(x = stat)) +
geom_histogram(bins = 30, color = "white") +
geom_vline(xintercept = obs_stat, color = "blue")
# Calculate p-value
null_distn %>%
summarize(p_value = mean(stat <= obs_stat))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment