Last active
October 26, 2021 20:43
-
-
Save baogorek/daeecc4a272561c949cd466cde7ad31d to your computer and use it in GitHub Desktop.
Companion code to Medium article
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
# Structural Equations --------------------------------------------------- | |
get_subject_affinity <- function(N) { | |
# u ~ Uniform(1, 20) | |
runif(N, min=1, max=20) | |
} | |
get_time_resources <- function(N) { | |
# w ~ Gamma(3, 1) | |
rgamma(N, shape=3, scale=4) | |
} | |
get_enrolled <- function(time_resources) { | |
# Pr(s = 1) = .1 if w <=5, .2 if w > 5, w <= 20, .3 if w > 20 | |
pr_enrolled <- rep(.1, length(time_resources)) # base probability | |
pr_enrolled <- ifelse(time_resources > 5, .2, pr_enrolled) | |
pr_enrolled <- ifelse(time_resources > 20, .3, pr_enrolled) | |
runif(length(time_resources)) < pr_enrolled | |
} | |
get_enjoyment <- function(time_resources, subject_affinity) { | |
# z = 1 if w <= 1.3, z = u ^ 2 log(1 + w) if w > 1.3 | |
ifelse(time_resources <= 1.3, 1, | |
subject_affinity ^ 2 * log(1 + time_resources)) | |
} | |
get_hours <- function(enjoyment, time_resources) { | |
# hours is N(-1.5 + 1.5 * z, 3 ^ 2) left truncated at 0 | |
E_hours_unconstrained <- -1.5 + 1.5 * enjoyment ^ (.35) | |
e_star <- rnorm(length(enjoyment), 0, 3) | |
hours_unconstrained <- E_hours_unconstrained + e_star | |
hours_unconstrained <- ifelse(hours_unconstrained < 0, 0, hours_unconstrained) | |
ifelse(hours_unconstrained > time_resources, time_resources, | |
hours_unconstrained) | |
} | |
get_grade <- function(subject_affinity, hours) { | |
# grade is 40 + u * log(1 + x) + e, where e ~ N(0, 4 ^ 2) | |
epsilon <- rnorm(length(subject_affinity), 0, 4) | |
E_grade <- 40 + subject_affinity * log(1 + hours) | |
grade <- E_grade + epsilon | |
list(grade=grade, E_grade=E_grade, epsilon=epsilon) | |
} | |
simulate_observational <- function(N) { | |
time_resources <- get_time_resources(N) | |
subject_affinity <- get_subject_affinity(N) | |
enrolled <- get_enrolled(time_resources) | |
enjoyment <- get_enjoyment(time_resources, subject_affinity) | |
hours <- get_hours(enjoyment, time_resources) | |
grade_obj <- get_grade(subject_affinity, hours) | |
df <- data.frame(enrolled, time_resources, enjoyment, hours, | |
grade=grade_obj$grade, epsilon=grade_obj$epsilon, E_grade=grade_obj$E_grade, | |
subject_affinity=subject_affinity) | |
df # still need to filter out unenrolled | |
} | |
simulate_interventional <- function(N, hours) { | |
subject_affinity <- get_subject_affinity(N) | |
grade_obj <- get_grade(subject_affinity, hours) | |
data.frame(hours, grade=grade_obj$grade, epsilon=grade_obj$epsilon, | |
E_grade=grade_obj$E_grade, subject_affinity=subject_affinity) | |
} | |
# Tunable knobs | |
do_hours <- 6 | |
enjoyment_buckets <- 18 | |
fixed_time_resources <- do_hours * 1.1 | |
slide_tol <- .07 | |
min_n_for_density <- 2 | |
grade_grid <- seq(-5, 110, .1) | |
seed <- 124 | |
N <- 1E6 # gets filtered to roughly 20% by sampling | |
# Generating the data --------------------------- | |
set.seed(seed) | |
df_pop <- simulate_observational(N) | |
nrow(df_pop) | |
df <- df_pop %>% filter(enrolled) | |
# What we have to match: | |
interventional_df <- simulate_interventional(1E6, do_hours) | |
do_density <- density(interventional_df$grade) | |
f_y_do_x <- function(grade) { | |
f <- approxfun(do_density$x, do_density$y) | |
ifelse(is.na(f(grade)), 0, f(grade)) | |
} | |
# Quick plot of what we have to match | |
plot(f_y_do_x(grade_grid) ~ grade_grid, type='l') | |
integrate(f_y_do_x, 0, 105) | |
df_w <- df %>% filter(time_resources > (1 - slide_tol) * fixed_time_resources, | |
time_resources < (1 + slide_tol) * fixed_time_resources) | |
df_wx <- df_w %>% filter(hours > do_hours * (1 - slide_tol), | |
hours < do_hours * (1 + slide_tol)) | |
# overlap of enjoyment distribution: observed data vs conditional slice | |
cat(nrow(df_w), "records in w-window,", nrow(df_wx), | |
"records in w and x windows\n") | |
cat("Enjoyment (z) in the population\n") | |
summary(df$enjoyment) | |
cat("Enjoyment (z) in the time resources (w) and hours (x) cut\n") | |
summary(df_wx$enjoyment) | |
z_w_density <- density(df_w$enjoyment) | |
f_z <- approxfun(z_w_density$x, z_w_density$y) | |
z_cuts <- cut(df_w$enjoyment, enjoyment_buckets) | |
labs <- levels(z_cuts) | |
df_prz <- data.frame( | |
lower = as.numeric( sub("\\((.+),.*", "\\1", labs) ), | |
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs) ), | |
n_z = table(z_cuts), | |
pr_between = NA | |
) | |
for (i in 1:nrow(df_prz)) { | |
integral <- integrate(f_z, df_prz[i, "lower"], df_prz[i, "upper"]) | |
df_prz[i, "pr_between"] <- integral$value | |
} | |
# Ensure probs sum to 1 | |
int_fz <- sum(df_prz$pr_between) | |
df_prz$pr_between <- df_prz$pr_between / int_fz | |
# Redefine cuts to cover range of z (enjoyment) | |
df_prz[1, "lower"] <- 0 | |
df_prz[nrow(df_prz), "upper"] <- ceiling( | |
20 ^ 2 * log(1 + fixed_time_resources * (1 + slide_tol)) | |
) | |
df_prz[["n_z.z_cuts"]] <- NULL | |
par(mfrow=c(3, 6)) | |
results <- data.frame() | |
df_prz$prz_xwz <- NA | |
df_prz$n_xwz <- NA | |
fy_x <- rep(0, length(grade_grid)) # TODO: why is it called fy_x | |
for (i in 1:nrow(df_prz)) { | |
df_wxz <- df_wx %>% filter(enjoyment > df_prz[i, "lower"], | |
enjoyment < df_prz[i, "upper"]) | |
title <- paste0("Z in [", df_prz[i, "lower"], | |
", ", z_upper=df_prz[i, "upper"], "], n=", | |
nrow(df_wxz)) | |
df_prz[i, "prz_xwz"] <- nrow(df_wxz) / nrow(df_wx) | |
df_prz[i, "n_xwz"] <- nrow(df_wxz) | |
if (nrow(df_wxz) >= min_n_for_density) { | |
d_y <- density(df_wxz$grade) | |
f_y <- approxfun(x=d_y$x, y=d_y$y) | |
y <- f_y(grade_grid) | |
y[is.na(y)] <- 0 | |
fy_weighted <- df_prz[i, "pr_between"] * y | |
fy_x <- fy_x + fy_weighted | |
plot(fy_weighted ~ grade_grid, main=title) | |
res <- data.frame(z_lower=df_prz[i, "lower"], z_upper=df_prz[i, "upper"], | |
grade=grade_grid, y=fy_weighted) | |
results <- rbind(results, res) | |
} else { | |
plot(1 ~ 1, type="n", main=title) | |
text(1, 1, "not enough\n points for\n density") | |
} | |
} | |
res_agg <- ( | |
results %>% group_by(grade) %>% summarise(fy_x=sum(y)) %>% as.data.frame() | |
) | |
res_agg$fy_x <- res_agg$fy_x / sum(res_agg$fy_x * diff(res_agg$grade)[1]) # integrate to 1 | |
par(mfrow=c(1, 1)) | |
plot(res_agg$fy_x ~ as.numeric(res_agg$grade), type="l", lwd=3, | |
xlab="grade", ylab="f(y|do(x)", main="Estimated interventional distribution") | |
lines(f_y_do_x(grade_grid) ~ grade_grid, type='l', col='blue', lwd=3) | |
legend(5, .9 * max(res_agg$fy_x), legend=c("estimated", "true"), | |
col=c('black', 'blue'), lty=c(1, 1), cex=1, lwd=3) | |
# Extra visuals ----- | |
# Showing how the distribution of observed hours vary greatly with time resources | |
par(mfrow=c(1, 2)) | |
df %>% filter(time_resources > .8, time_resources < 1.2) %>% pull(hours) %>% | |
hist(., main = "Study hours given time resources near 1 hour", xlab="hours") | |
df %>% filter(time_resources > 19.5, time_resources < 20.5) %>% pull(hours) %>% | |
hist(., main = "Study hours given time resources near 20 hours", xlab="hours") | |
par(mfrow=c(1, 1)) | |
# Visualize the Z density approximation ----- | |
scale_r <- max(z_w_density$y) / max(df_prz$pr_between) | |
plot(z_w_density) | |
points(df_prz$lower, df_prz$pr_between * scale_r, type="h") | |
points(df_prz$upper, df_prz$pr_between * scale_r, type="h") | |
segments(x=df_prz$lower, x1=df_prz$upper, | |
y0=df_prz$pr_between * scale_r, y1=df_prz$pr_between * scale_r) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment