Skip to content

Instantly share code, notes, and snippets.

@FrankRuns
Created January 16, 2022 22:22
Show Gist options
  • Save FrankRuns/34c6427094d99584e0d267dcf35e9430 to your computer and use it in GitHub Desktop.
Save FrankRuns/34c6427094d99584e0d267dcf35e9430 to your computer and use it in GitHub Desktop.
# toy simulation to determine how likely or
# unlikely the poll results from blog post
# are (Do you plan to relocate in 2022?)
# load packages
library(dplyr)
library(ggplot2)
# Single Instance ----
# I always like to start with a single instance
# so I know if / how things are working... you'll
# notice that some of this is unnecessary
# parameters
relo_rate <- 17.5 # historical base rate
sample_size <- 10000 # num of instances
n <- 39 # where to look at estimated relo rate
# if you keep running this block of code, you'll
# see that at 39 poll participants, anywhere from
# ~10% to ~25% of the are relocators...
count = 0 # count of relocators
for (i in 1:sample_size) {
random_num <- runif(1) * 100 # gen random num from 1-100
outcome <- ifelse(random_num <= relo_rate, 1, 0) # a relocator?
count <- count + outcome # increment count to go to next person
if (i == n) {print(paste("At", n, ":",count/n))}
}
# we also see that if 10,000 people take the survey, the
# percent of relocators converges to historical base rate (~17.5%)
# with some variation
print(paste("Materialized Relo Rate", count/sample_size))
# Now, Monte Carlo ----
# Do the above process 10,000x. However, don't go all the way
# to 10,000 poll participants (that's a waste and was just for
# understanding above). We'll only go to 39.
relo_rate <- 17.5 # historical base rate
sample_size <- 1000 # num of instances (you could do more for more precision)
n <- 39 # where to look at estimated relo rate
relo_rate_vector <- c() # vector to store 1,000 outcomes
for (i in 1:sample_size) {
count = 0
for (j in 1:n) {
random_num <- runif(1) * 100
outcome <- ifelse(random_num <= relo_rate, 1, 0)
count <- count + outcome
}
relo_rate_vector <- c(relo_rate_vector, count/n)
}
# What do we got?
# with the original parameters above, we get a histogram sort
# of normally distributed with it's peak between 15%-20%. Makes
# sense...
hist(relo_rate_vector)
# Now, what is the probability that we see an instance where
# the percent of relocaters is 50% or higher?
threshold <- 0.5
1 - ecdf(relo_rate_vector)(threshold) # with default params, it's 0.
# Heatmap Vis ----
# start by putting your vector in a dataframe and sorting it
# from highest rate to lowest (the order could be up or down,
# I just chose down)
df <- data.frame(relo_rate = relo_rate_vector)
df <- df %>% dplyr::arrange(desc(relo_rate))
# in order to make a rectangular heatmap of the 1,000
# instances, create x axis groups and y axis groups. Also,
# to simplify make the outcome binary (2 colors is good for
# the contrast)
df$index <- 1:nrow(df)
df$x_group <- rep(1:100,each=10)
df$y_group <- rep(1:10, 100)
df$relo_rate <- ifelse(df$relo_rate >= threshold, 1, 0)
# make your heatmap!
ggplot(data = df, aes(x=x_group, y=y_group, fill=relo_rate)) +
geom_tile(colour="white", size=0.25) +
scale_fill_gradient(low="red", high="blue") +
theme_grey(base_size=8)+
# theme options
theme(
# remove legend
legend.position = "none",
# remove axis titles
axis.title=element_blank(),
# remove axis labels
axis.text=element_blank(),
# remove background grid
panel.background = element_blank()
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment