EoinTravers/glm_prevalence.Rmd

## glm_prevalence.Rmd
---
title: "glm_prevelance"
author: "Eoin Travers"
output: html_document
---

```{r}
library(tidyverse)
binom_smooth = function (link = "probit", ...) {
    geom_smooth(method = "glm",
                method.args = list(family = binomial(link = link)), ...)
}
nsubjs = 1000
logit = plogis
invlogit = qlogis

TRAIN_X = rnorm(nsubjs, 0, 1)
TEST_X  = rnorm(nsubjs, .5, 1)

do_sim = function(alpha = 0, beta = 1, X = TRAIN_X, nsubjs = 1000){
  # alpha = invlogit(baseline_prob)
  baseline_prob = logit(alpha)
  mu = alpha + X * beta
  p = logit(mu)
  y = rbernoulli(nsubjs, p = p) * 1
  data.frame(x=X, mu, p, y) %>%
    mutate(baseline = baseline_prob,
           alpha = alpha,
           mean_x = mean_x,
           beta = beta)

}

plot_data = function(data){
  ggplot(data, aes(x, y)) +
    binom_smooth() +
    geom_hline(linetype = 'dashed', yintercept = .5) +
    geom_vline(linetype = 'dashed', xintercept = 0)
}
```

```{r}
data = do_sim(0, 1)
plot_data(data)
```

```{r}
predict_prevalence = function(alpha, beta, X){
  df = do_sim(alpha = alpha, X = X)
  mean(df$p)
}

alphas = seq(-3, 3, length.out = 100)

prevalences_df = data.frame(
  alpha = alphas,
  train =  map_dbl(alphas, predict_prevalence, beta = 1, X = TRAIN_X),
  test = map_dbl(alphas, predict_prevalence, beta = 1, X = TEST_X)
)

prevalences_df %>%
  pivot_longer(-alpha) %>%
  ggplot(aes(alpha, value, color = name)) +
  geom_path() +
  labs(x = 'Intercept α', y = 'Predicted Prevalence',
       color = 'Context')

```
Figure out best values to use for simulation

```{r}
train_alpha = with(prevalences_df, approx(train, alpha, .15))$y
test_alpha = with(prevalences_df, approx(test, alpha, .30))$y
data.frame(train_alpha, test_alpha)
```
Do simulations

```{r}
train = do_sim(alpha = train_alpha, X = TRAIN_X)
test = do_sim(alpha = test_alpha, X = TEST_X)
full_data = rbind(train, test) %>%
  mutate(context = ifelse(alpha == train_alpha, 'Train', 'Test'))

ggplot(full_data, aes(x, p, color = context)) +
  geom_point() +
  labs(x = 'Predictor', y = 'True Probability')
```

```{r}
model = glm(y ~ x, data = train, family = binomial)
coefs = coef(model)
summary(model)
```

```{r}
make_predictions = function(alpha, beta, X){
  mu = alpha + X * beta
  logit(mu)
}
train$predictions = make_predictions(coefs[1], coefs[2], TRAIN_X)
test$unadjusted_predictions = make_predictions(coefs[1], coefs[2], TEST_X)
```

Figure out how much to adjust alpha
by finding out the value the predicts a prevalence of .3

```{r}
alphas = seq(-3, 3, length.out = 100)
predicted_prev = map_dbl(alphas, predict_prevalence,
                         beta = coefs[2], X = TEST_X)
# What value of alpha
adjusted_alpha = approx(predicted_prev, alphas, .3)$y
```
Make adjusted predictions

```{r}
test$adjusted_predictions = make_predictions(adjusted_alpha, coefs[2], TEST_X)
plot_df = test %>%
  select(x, p, y, uadj=unadjusted_predictions, adj=adjusted_predictions) %>%
  pivot_longer(c(p, uadj, adj)) %>%
  mutate(
    name = fct_recode(name,
                      'True prob.'='p',
                      'Unadjusted model'='uadj',
                      'Adjusted model'='adj'))

ggplot(plot_df, aes(x, value, color = name)) +
  geom_point() +
  labs(x = 'Predictor', y='Predicted Probability',
       color = NULL)
```
	---
	title: "glm_prevelance"
	author: "Eoin Travers"
	output: html_document
	---

	```{r}
	library(tidyverse)
	binom_smooth = function (link = "probit", ...) {
	geom_smooth(method = "glm",
	method.args = list(family = binomial(link = link)), ...)
	}
	nsubjs = 1000
	logit = plogis
	invlogit = qlogis

	TRAIN_X = rnorm(nsubjs, 0, 1)
	TEST_X = rnorm(nsubjs, .5, 1)

	do_sim = function(alpha = 0, beta = 1, X = TRAIN_X, nsubjs = 1000){
	# alpha = invlogit(baseline_prob)
	baseline_prob = logit(alpha)
	mu = alpha + X * beta
	p = logit(mu)
	y = rbernoulli(nsubjs, p = p) * 1
	data.frame(x=X, mu, p, y) %>%
	mutate(baseline = baseline_prob,
	alpha = alpha,
	mean_x = mean_x,
	beta = beta)

	}

	plot_data = function(data){
	ggplot(data, aes(x, y)) +
	binom_smooth() +
	geom_hline(linetype = 'dashed', yintercept = .5) +
	geom_vline(linetype = 'dashed', xintercept = 0)
	}
	```

	```{r}
	data = do_sim(0, 1)
	plot_data(data)
	```

	```{r}
	predict_prevalence = function(alpha, beta, X){
	df = do_sim(alpha = alpha, X = X)
	mean(df$p)
	}

	alphas = seq(-3, 3, length.out = 100)

	prevalences_df = data.frame(
	alpha = alphas,
	train = map_dbl(alphas, predict_prevalence, beta = 1, X = TRAIN_X),
	test = map_dbl(alphas, predict_prevalence, beta = 1, X = TEST_X)
	)

	prevalences_df %>%
	pivot_longer(-alpha) %>%
	ggplot(aes(alpha, value, color = name)) +
	geom_path() +
	labs(x = 'Intercept α', y = 'Predicted Prevalence',
	color = 'Context')

	```
	Figure out best values to use for simulation

	```{r}
	train_alpha = with(prevalences_df, approx(train, alpha, .15))$y
	test_alpha = with(prevalences_df, approx(test, alpha, .30))$y
	data.frame(train_alpha, test_alpha)
	```
	Do simulations

	```{r}
	train = do_sim(alpha = train_alpha, X = TRAIN_X)
	test = do_sim(alpha = test_alpha, X = TEST_X)
	full_data = rbind(train, test) %>%
	mutate(context = ifelse(alpha == train_alpha, 'Train', 'Test'))

	ggplot(full_data, aes(x, p, color = context)) +
	geom_point() +
	labs(x = 'Predictor', y = 'True Probability')
	```

	```{r}
	model = glm(y ~ x, data = train, family = binomial)
	coefs = coef(model)
	summary(model)
	```

	```{r}
	make_predictions = function(alpha, beta, X){
	mu = alpha + X * beta
	logit(mu)
	}
	train$predictions = make_predictions(coefs[1], coefs[2], TRAIN_X)
	test$unadjusted_predictions = make_predictions(coefs[1], coefs[2], TEST_X)
	```

	Figure out how much to adjust alpha
	by finding out the value the predicts a prevalence of .3

	```{r}
	alphas = seq(-3, 3, length.out = 100)
	predicted_prev = map_dbl(alphas, predict_prevalence,
	beta = coefs[2], X = TEST_X)
	# What value of alpha
	adjusted_alpha = approx(predicted_prev, alphas, .3)$y
	```
	Make adjusted predictions

	```{r}
	test$adjusted_predictions = make_predictions(adjusted_alpha, coefs[2], TEST_X)
	plot_df = test %>%
	select(x, p, y, uadj=unadjusted_predictions, adj=adjusted_predictions) %>%
	pivot_longer(c(p, uadj, adj)) %>%
	mutate(
	name = fct_recode(name,
	'True prob.'='p',
	'Unadjusted model'='uadj',
	'Adjusted model'='adj'))

	ggplot(plot_df, aes(x, value, color = name)) +
	geom_point() +
	labs(x = 'Predictor', y='Predicted Probability',
	color = NULL)
	```