Skip to content

Instantly share code, notes, and snippets.

@baogorek
Last active October 26, 2021 20:43
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save baogorek/daeecc4a272561c949cd466cde7ad31d to your computer and use it in GitHub Desktop.
Save baogorek/daeecc4a272561c949cd466cde7ad31d to your computer and use it in GitHub Desktop.
Companion code to Medium article
library(dplyr)
# Structural Equations ---------------------------------------------------
get_subject_affinity <- function(N) {
# u ~ Uniform(1, 20)
runif(N, min=1, max=20)
}
get_time_resources <- function(N) {
# w ~ Gamma(3, 1)
rgamma(N, shape=3, scale=4)
}
get_enrolled <- function(time_resources) {
# Pr(s = 1) = .1 if w <=5, .2 if w > 5, w <= 20, .3 if w > 20
pr_enrolled <- rep(.1, length(time_resources)) # base probability
pr_enrolled <- ifelse(time_resources > 5, .2, pr_enrolled)
pr_enrolled <- ifelse(time_resources > 20, .3, pr_enrolled)
runif(length(time_resources)) < pr_enrolled
}
get_enjoyment <- function(time_resources, subject_affinity) {
# z = 1 if w <= 1.3, z = u ^ 2 log(1 + w) if w > 1.3
ifelse(time_resources <= 1.3, 1,
subject_affinity ^ 2 * log(1 + time_resources))
}
get_hours <- function(enjoyment, time_resources) {
# hours is N(-1.5 + 1.5 * z, 3 ^ 2) left truncated at 0
E_hours_unconstrained <- -1.5 + 1.5 * enjoyment ^ (.35)
e_star <- rnorm(length(enjoyment), 0, 3)
hours_unconstrained <- E_hours_unconstrained + e_star
hours_unconstrained <- ifelse(hours_unconstrained < 0, 0, hours_unconstrained)
ifelse(hours_unconstrained > time_resources, time_resources,
hours_unconstrained)
}
get_grade <- function(subject_affinity, hours) {
# grade is 40 + u * log(1 + x) + e, where e ~ N(0, 4 ^ 2)
epsilon <- rnorm(length(subject_affinity), 0, 4)
E_grade <- 40 + subject_affinity * log(1 + hours)
grade <- E_grade + epsilon
list(grade=grade, E_grade=E_grade, epsilon=epsilon)
}
simulate_observational <- function(N) {
time_resources <- get_time_resources(N)
subject_affinity <- get_subject_affinity(N)
enrolled <- get_enrolled(time_resources)
enjoyment <- get_enjoyment(time_resources, subject_affinity)
hours <- get_hours(enjoyment, time_resources)
grade_obj <- get_grade(subject_affinity, hours)
df <- data.frame(enrolled, time_resources, enjoyment, hours,
grade=grade_obj$grade, epsilon=grade_obj$epsilon, E_grade=grade_obj$E_grade,
subject_affinity=subject_affinity)
df # still need to filter out unenrolled
}
simulate_interventional <- function(N, hours) {
subject_affinity <- get_subject_affinity(N)
grade_obj <- get_grade(subject_affinity, hours)
data.frame(hours, grade=grade_obj$grade, epsilon=grade_obj$epsilon,
E_grade=grade_obj$E_grade, subject_affinity=subject_affinity)
}
# Tunable knobs
do_hours <- 6
enjoyment_buckets <- 18
fixed_time_resources <- do_hours * 1.1
slide_tol <- .07
min_n_for_density <- 2
grade_grid <- seq(-5, 110, .1)
seed <- 124
N <- 1E6 # gets filtered to roughly 20% by sampling
# Generating the data ---------------------------
set.seed(seed)
df_pop <- simulate_observational(N)
nrow(df_pop)
df <- df_pop %>% filter(enrolled)
# What we have to match:
interventional_df <- simulate_interventional(1E6, do_hours)
do_density <- density(interventional_df$grade)
f_y_do_x <- function(grade) {
f <- approxfun(do_density$x, do_density$y)
ifelse(is.na(f(grade)), 0, f(grade))
}
# Quick plot of what we have to match
plot(f_y_do_x(grade_grid) ~ grade_grid, type='l')
integrate(f_y_do_x, 0, 105)
df_w <- df %>% filter(time_resources > (1 - slide_tol) * fixed_time_resources,
time_resources < (1 + slide_tol) * fixed_time_resources)
df_wx <- df_w %>% filter(hours > do_hours * (1 - slide_tol),
hours < do_hours * (1 + slide_tol))
# overlap of enjoyment distribution: observed data vs conditional slice
cat(nrow(df_w), "records in w-window,", nrow(df_wx),
"records in w and x windows\n")
cat("Enjoyment (z) in the population\n")
summary(df$enjoyment)
cat("Enjoyment (z) in the time resources (w) and hours (x) cut\n")
summary(df_wx$enjoyment)
z_w_density <- density(df_w$enjoyment)
f_z <- approxfun(z_w_density$x, z_w_density$y)
z_cuts <- cut(df_w$enjoyment, enjoyment_buckets)
labs <- levels(z_cuts)
df_prz <- data.frame(
lower = as.numeric( sub("\\((.+),.*", "\\1", labs) ),
upper = as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", labs) ),
n_z = table(z_cuts),
pr_between = NA
)
for (i in 1:nrow(df_prz)) {
integral <- integrate(f_z, df_prz[i, "lower"], df_prz[i, "upper"])
df_prz[i, "pr_between"] <- integral$value
}
# Ensure probs sum to 1
int_fz <- sum(df_prz$pr_between)
df_prz$pr_between <- df_prz$pr_between / int_fz
# Redefine cuts to cover range of z (enjoyment)
df_prz[1, "lower"] <- 0
df_prz[nrow(df_prz), "upper"] <- ceiling(
20 ^ 2 * log(1 + fixed_time_resources * (1 + slide_tol))
)
df_prz[["n_z.z_cuts"]] <- NULL
par(mfrow=c(3, 6))
results <- data.frame()
df_prz$prz_xwz <- NA
df_prz$n_xwz <- NA
fy_x <- rep(0, length(grade_grid)) # TODO: why is it called fy_x
for (i in 1:nrow(df_prz)) {
df_wxz <- df_wx %>% filter(enjoyment > df_prz[i, "lower"],
enjoyment < df_prz[i, "upper"])
title <- paste0("Z in [", df_prz[i, "lower"],
", ", z_upper=df_prz[i, "upper"], "], n=",
nrow(df_wxz))
df_prz[i, "prz_xwz"] <- nrow(df_wxz) / nrow(df_wx)
df_prz[i, "n_xwz"] <- nrow(df_wxz)
if (nrow(df_wxz) >= min_n_for_density) {
d_y <- density(df_wxz$grade)
f_y <- approxfun(x=d_y$x, y=d_y$y)
y <- f_y(grade_grid)
y[is.na(y)] <- 0
fy_weighted <- df_prz[i, "pr_between"] * y
fy_x <- fy_x + fy_weighted
plot(fy_weighted ~ grade_grid, main=title)
res <- data.frame(z_lower=df_prz[i, "lower"], z_upper=df_prz[i, "upper"],
grade=grade_grid, y=fy_weighted)
results <- rbind(results, res)
} else {
plot(1 ~ 1, type="n", main=title)
text(1, 1, "not enough\n points for\n density")
}
}
res_agg <- (
results %>% group_by(grade) %>% summarise(fy_x=sum(y)) %>% as.data.frame()
)
res_agg$fy_x <- res_agg$fy_x / sum(res_agg$fy_x * diff(res_agg$grade)[1]) # integrate to 1
par(mfrow=c(1, 1))
plot(res_agg$fy_x ~ as.numeric(res_agg$grade), type="l", lwd=3,
xlab="grade", ylab="f(y|do(x)", main="Estimated interventional distribution")
lines(f_y_do_x(grade_grid) ~ grade_grid, type='l', col='blue', lwd=3)
legend(5, .9 * max(res_agg$fy_x), legend=c("estimated", "true"),
col=c('black', 'blue'), lty=c(1, 1), cex=1, lwd=3)
# Extra visuals -----
# Showing how the distribution of observed hours vary greatly with time resources
par(mfrow=c(1, 2))
df %>% filter(time_resources > .8, time_resources < 1.2) %>% pull(hours) %>%
hist(., main = "Study hours given time resources near 1 hour", xlab="hours")
df %>% filter(time_resources > 19.5, time_resources < 20.5) %>% pull(hours) %>%
hist(., main = "Study hours given time resources near 20 hours", xlab="hours")
par(mfrow=c(1, 1))
# Visualize the Z density approximation -----
scale_r <- max(z_w_density$y) / max(df_prz$pr_between)
plot(z_w_density)
points(df_prz$lower, df_prz$pr_between * scale_r, type="h")
points(df_prz$upper, df_prz$pr_between * scale_r, type="h")
segments(x=df_prz$lower, x1=df_prz$upper,
y0=df_prz$pr_between * scale_r, y1=df_prz$pr_between * scale_r)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment