Created
January 16, 2022 22:22
-
-
Save FrankRuns/34c6427094d99584e0d267dcf35e9430 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# toy simulation to determine how likely or | |
# unlikely the poll results from blog post | |
# are (Do you plan to relocate in 2022?) | |
# load packages | |
library(dplyr) | |
library(ggplot2) | |
# Single Instance ---- | |
# I always like to start with a single instance | |
# so I know if / how things are working... you'll | |
# notice that some of this is unnecessary | |
# parameters | |
relo_rate <- 17.5 # historical base rate | |
sample_size <- 10000 # num of instances | |
n <- 39 # where to look at estimated relo rate | |
# if you keep running this block of code, you'll | |
# see that at 39 poll participants, anywhere from | |
# ~10% to ~25% of the are relocators... | |
count = 0 # count of relocators | |
for (i in 1:sample_size) { | |
random_num <- runif(1) * 100 # gen random num from 1-100 | |
outcome <- ifelse(random_num <= relo_rate, 1, 0) # a relocator? | |
count <- count + outcome # increment count to go to next person | |
if (i == n) {print(paste("At", n, ":",count/n))} | |
} | |
# we also see that if 10,000 people take the survey, the | |
# percent of relocators converges to historical base rate (~17.5%) | |
# with some variation | |
print(paste("Materialized Relo Rate", count/sample_size)) | |
# Now, Monte Carlo ---- | |
# Do the above process 10,000x. However, don't go all the way | |
# to 10,000 poll participants (that's a waste and was just for | |
# understanding above). We'll only go to 39. | |
relo_rate <- 17.5 # historical base rate | |
sample_size <- 1000 # num of instances (you could do more for more precision) | |
n <- 39 # where to look at estimated relo rate | |
relo_rate_vector <- c() # vector to store 1,000 outcomes | |
for (i in 1:sample_size) { | |
count = 0 | |
for (j in 1:n) { | |
random_num <- runif(1) * 100 | |
outcome <- ifelse(random_num <= relo_rate, 1, 0) | |
count <- count + outcome | |
} | |
relo_rate_vector <- c(relo_rate_vector, count/n) | |
} | |
# What do we got? | |
# with the original parameters above, we get a histogram sort | |
# of normally distributed with it's peak between 15%-20%. Makes | |
# sense... | |
hist(relo_rate_vector) | |
# Now, what is the probability that we see an instance where | |
# the percent of relocaters is 50% or higher? | |
threshold <- 0.5 | |
1 - ecdf(relo_rate_vector)(threshold) # with default params, it's 0. | |
# Heatmap Vis ---- | |
# start by putting your vector in a dataframe and sorting it | |
# from highest rate to lowest (the order could be up or down, | |
# I just chose down) | |
df <- data.frame(relo_rate = relo_rate_vector) | |
df <- df %>% dplyr::arrange(desc(relo_rate)) | |
# in order to make a rectangular heatmap of the 1,000 | |
# instances, create x axis groups and y axis groups. Also, | |
# to simplify make the outcome binary (2 colors is good for | |
# the contrast) | |
df$index <- 1:nrow(df) | |
df$x_group <- rep(1:100,each=10) | |
df$y_group <- rep(1:10, 100) | |
df$relo_rate <- ifelse(df$relo_rate >= threshold, 1, 0) | |
# make your heatmap! | |
ggplot(data = df, aes(x=x_group, y=y_group, fill=relo_rate)) + | |
geom_tile(colour="white", size=0.25) + | |
scale_fill_gradient(low="red", high="blue") + | |
theme_grey(base_size=8)+ | |
# theme options | |
theme( | |
# remove legend | |
legend.position = "none", | |
# remove axis titles | |
axis.title=element_blank(), | |
# remove axis labels | |
axis.text=element_blank(), | |
# remove background grid | |
panel.background = element_blank() | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment