Skip to content

Instantly share code, notes, and snippets.

@EoinTravers
Created August 12, 2021 14:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save EoinTravers/656ac7b77a5cfa966c706888185afcd5 to your computer and use it in GitHub Desktop.
Save EoinTravers/656ac7b77a5cfa966c706888185afcd5 to your computer and use it in GitHub Desktop.
---
title: "glm_prevelance"
author: "Eoin Travers"
output: html_document
---
```{r}
library(tidyverse)
binom_smooth = function (link = "probit", ...) {
geom_smooth(method = "glm",
method.args = list(family = binomial(link = link)), ...)
}
nsubjs = 1000
logit = plogis
invlogit = qlogis
TRAIN_X = rnorm(nsubjs, 0, 1)
TEST_X = rnorm(nsubjs, .5, 1)
do_sim = function(alpha = 0, beta = 1, X = TRAIN_X, nsubjs = 1000){
# alpha = invlogit(baseline_prob)
baseline_prob = logit(alpha)
mu = alpha + X * beta
p = logit(mu)
y = rbernoulli(nsubjs, p = p) * 1
data.frame(x=X, mu, p, y) %>%
mutate(baseline = baseline_prob,
alpha = alpha,
mean_x = mean_x,
beta = beta)
}
plot_data = function(data){
ggplot(data, aes(x, y)) +
binom_smooth() +
geom_hline(linetype = 'dashed', yintercept = .5) +
geom_vline(linetype = 'dashed', xintercept = 0)
}
```
```{r}
data = do_sim(0, 1)
plot_data(data)
```
```{r}
predict_prevalence = function(alpha, beta, X){
df = do_sim(alpha = alpha, X = X)
mean(df$p)
}
alphas = seq(-3, 3, length.out = 100)
prevalences_df = data.frame(
alpha = alphas,
train = map_dbl(alphas, predict_prevalence, beta = 1, X = TRAIN_X),
test = map_dbl(alphas, predict_prevalence, beta = 1, X = TEST_X)
)
prevalences_df %>%
pivot_longer(-alpha) %>%
ggplot(aes(alpha, value, color = name)) +
geom_path() +
labs(x = 'Intercept α', y = 'Predicted Prevalence',
color = 'Context')
```
Figure out best values to use for simulation
```{r}
train_alpha = with(prevalences_df, approx(train, alpha, .15))$y
test_alpha = with(prevalences_df, approx(test, alpha, .30))$y
data.frame(train_alpha, test_alpha)
```
Do simulations
```{r}
train = do_sim(alpha = train_alpha, X = TRAIN_X)
test = do_sim(alpha = test_alpha, X = TEST_X)
full_data = rbind(train, test) %>%
mutate(context = ifelse(alpha == train_alpha, 'Train', 'Test'))
ggplot(full_data, aes(x, p, color = context)) +
geom_point() +
labs(x = 'Predictor', y = 'True Probability')
```
```{r}
model = glm(y ~ x, data = train, family = binomial)
coefs = coef(model)
summary(model)
```
```{r}
make_predictions = function(alpha, beta, X){
mu = alpha + X * beta
logit(mu)
}
train$predictions = make_predictions(coefs[1], coefs[2], TRAIN_X)
test$unadjusted_predictions = make_predictions(coefs[1], coefs[2], TEST_X)
```
Figure out how much to adjust alpha
by finding out the value the predicts a prevalence of .3
```{r}
alphas = seq(-3, 3, length.out = 100)
predicted_prev = map_dbl(alphas, predict_prevalence,
beta = coefs[2], X = TEST_X)
# What value of alpha
adjusted_alpha = approx(predicted_prev, alphas, .3)$y
```
Make adjusted predictions
```{r}
test$adjusted_predictions = make_predictions(adjusted_alpha, coefs[2], TEST_X)
plot_df = test %>%
select(x, p, y, uadj=unadjusted_predictions, adj=adjusted_predictions) %>%
pivot_longer(c(p, uadj, adj)) %>%
mutate(
name = fct_recode(name,
'True prob.'='p',
'Unadjusted model'='uadj',
'Adjusted model'='adj'))
ggplot(plot_df, aes(x, value, color = name)) +
geom_point() +
labs(x = 'Predictor', y='Predicted Probability',
color = NULL)
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment