FrankRuns/relocator-poll-results-toy-sim.R

## relocator-poll-results-toy-sim.R
# toy simulation to determine how likely or
# unlikely the poll results from blog post
# are (Do you plan to relocate in 2022?)

# load packages
library(dplyr)
library(ggplot2)


# Single Instance ----
# I always like to start with a single instance
# so I know if / how things are working... you'll
# notice that some of this is unnecessary

# parameters
relo_rate <- 17.5 # historical base rate
sample_size <- 10000 # num of instances
n <- 39 # where to look at estimated relo rate

# if you keep running this block of code, you'll
# see that at 39 poll participants, anywhere from
# ~10% to ~25% of the are relocators...
count = 0 # count of relocators
for (i in 1:sample_size) {
  random_num <- runif(1) * 100 # gen random num from 1-100
  outcome <- ifelse(random_num <= relo_rate, 1, 0) # a relocator?
  count <- count + outcome # increment count to go to next person
  if (i == n) {print(paste("At", n, ":",count/n))}
}

# we also see that if 10,000 people take the survey, the
# percent of relocators converges to historical base rate (~17.5%)
# with some variation
print(paste("Materialized Relo Rate", count/sample_size))


# Now, Monte Carlo ----
# Do the above process 10,000x. However, don't go all the way
# to 10,000 poll participants (that's a waste and was just for
# understanding above). We'll only go to 39.

relo_rate <- 17.5 # historical base rate
sample_size <- 1000 # num of instances (you could do more for more precision)
n <- 39 # where to look at estimated relo rate

relo_rate_vector <- c() # vector to store 1,000 outcomes
for (i in 1:sample_size) {
  count = 0
  for (j in 1:n) {
    random_num <- runif(1) * 100
    outcome <- ifelse(random_num <= relo_rate, 1, 0)
    count <- count + outcome
  }
  relo_rate_vector <- c(relo_rate_vector, count/n)
}

# What do we got?
# with the original parameters above, we get a histogram sort
# of normally distributed with it's peak between 15%-20%. Makes
# sense...
hist(relo_rate_vector)
# Now, what is the probability that we see an instance where
# the percent of relocaters is 50% or higher?
threshold <- 0.5
1 - ecdf(relo_rate_vector)(threshold) # with default params, it's 0.


# Heatmap Vis ----

# start by putting your vector in a dataframe and sorting it
# from highest rate to lowest (the order could be up or down,
# I just chose down)
df <- data.frame(relo_rate = relo_rate_vector)
df <- df %>% dplyr::arrange(desc(relo_rate))

#  in order to make a rectangular heatmap of the 1,000
# instances, create x axis groups and y axis groups. Also,
# to simplify make the outcome binary (2 colors is good for
# the contrast)
df$index <- 1:nrow(df)
df$x_group <- rep(1:100,each=10)
df$y_group <- rep(1:10, 100)
df$relo_rate <- ifelse(df$relo_rate >= threshold, 1, 0)

# make your heatmap!
ggplot(data = df, aes(x=x_group, y=y_group, fill=relo_rate)) +
  geom_tile(colour="white", size=0.25) +
  scale_fill_gradient(low="red", high="blue") +
  theme_grey(base_size=8)+
  # theme options
  theme(
    # remove legend
    legend.position = "none",
    # remove axis titles
    axis.title=element_blank(),
    # remove axis labels
    axis.text=element_blank(),
    # remove background grid
    panel.background = element_blank()
  )
	# toy simulation to determine how likely or
	# unlikely the poll results from blog post
	# are (Do you plan to relocate in 2022?)

	# load packages
	library(dplyr)
	library(ggplot2)


	# Single Instance ----
	# I always like to start with a single instance
	# so I know if / how things are working... you'll
	# notice that some of this is unnecessary

	# parameters
	relo_rate <- 17.5 # historical base rate
	sample_size <- 10000 # num of instances
	n <- 39 # where to look at estimated relo rate

	# if you keep running this block of code, you'll
	# see that at 39 poll participants, anywhere from
	# ~10% to ~25% of the are relocators...
	count = 0 # count of relocators
	for (i in 1:sample_size) {
	random_num <- runif(1) * 100 # gen random num from 1-100
	outcome <- ifelse(random_num <= relo_rate, 1, 0) # a relocator?
	count <- count + outcome # increment count to go to next person
	if (i == n) {print(paste("At", n, ":",count/n))}
	}

	# we also see that if 10,000 people take the survey, the
	# percent of relocators converges to historical base rate (~17.5%)
	# with some variation
	print(paste("Materialized Relo Rate", count/sample_size))


	# Now, Monte Carlo ----
	# Do the above process 10,000x. However, don't go all the way
	# to 10,000 poll participants (that's a waste and was just for
	# understanding above). We'll only go to 39.

	relo_rate <- 17.5 # historical base rate
	sample_size <- 1000 # num of instances (you could do more for more precision)
	n <- 39 # where to look at estimated relo rate

	relo_rate_vector <- c() # vector to store 1,000 outcomes
	for (i in 1:sample_size) {
	count = 0
	for (j in 1:n) {
	random_num <- runif(1) * 100
	outcome <- ifelse(random_num <= relo_rate, 1, 0)
	count <- count + outcome
	}
	relo_rate_vector <- c(relo_rate_vector, count/n)
	}

	# What do we got?
	# with the original parameters above, we get a histogram sort
	# of normally distributed with it's peak between 15%-20%. Makes
	# sense...
	hist(relo_rate_vector)
	# Now, what is the probability that we see an instance where
	# the percent of relocaters is 50% or higher?
	threshold <- 0.5
	1 - ecdf(relo_rate_vector)(threshold) # with default params, it's 0.


	# Heatmap Vis ----

	# start by putting your vector in a dataframe and sorting it
	# from highest rate to lowest (the order could be up or down,
	# I just chose down)
	df <- data.frame(relo_rate = relo_rate_vector)
	df <- df %>% dplyr::arrange(desc(relo_rate))

	# in order to make a rectangular heatmap of the 1,000
	# instances, create x axis groups and y axis groups. Also,
	# to simplify make the outcome binary (2 colors is good for
	# the contrast)
	df$index <- 1:nrow(df)
	df$x_group <- rep(1:100,each=10)
	df$y_group <- rep(1:10, 100)
	df$relo_rate <- ifelse(df$relo_rate >= threshold, 1, 0)

	# make your heatmap!
	ggplot(data = df, aes(x=x_group, y=y_group, fill=relo_rate)) +
	geom_tile(colour="white", size=0.25) +
	scale_fill_gradient(low="red", high="blue") +
	theme_grey(base_size=8)+
	# theme options
	theme(
	# remove legend
	legend.position = "none",
	# remove axis titles
	axis.title=element_blank(),
	# remove axis labels
	axis.text=element_blank(),
	# remove background grid
	panel.background = element_blank()
	)