timriffe/prevalence_shareable.R

## prevalence_shareable.R
library(tidyverse)
library(splines)

N <- 2000

x <- rnorm(N,mean = 2, sd = 5) |>
  round() |>
  table()

age_diff <- x |> names() |> as.integer()

data <-
  tibble(age_diff = age_diff,
         n = as.integer(x)) |>
  complete(age_diff = -20:20,
           fill = list(n=0)) |>
  mutate(prev = n / sum(n),
         prev = prev ^ 2,
         prev = prev / sum(prev),
         n = round(prev * N))

# Examine simulated age-diff prevalence:
data |>
  ggplot(aes(x = age_diff, y = prev)) +
  geom_line() +
  labs(title = "This distribution looks qualitatively\nlike age difference prevalence",
       subtitle = "Note, we also have 0s in tails") +
  theme_minimal()

# Problem: we'd like to share these data, but we can't share
# (1) counts below 5 (unless 0)
# (2) prevalence values that can be used to re-derive exact counts
# For this reason, we want to smooth the prevalence, but in such a way as to
# preserve the sharp peak and tail activity. The following is an ad hoc solution.

knots <- c(-15,-10,-5,seq(-3,5,by=2),10,15)

prev_pred <-
  data %>%
    glm(sqrt(prev) ~ ns(age_diff,
                        knots = knots),
        family = binomial(link = "logit"), data = .) |>
    predict(newdata = data, type=  "response") |>
    as.data.frame() |>
    rename(prev_hat = 1) |>
    mutate(age_diff = -20:20,
           prev_hat = prev_hat ^ 2,
           prev_hat = prev_hat / sum(prev_hat))

# Examine fit against simulated age-diff prevalence:
prev_pred |>
  ggplot(aes(x = age_diff, y = prev_hat)) +
  geom_line() +
    geom_point(data = data, aes(y=prev)) +
    labs(title = "This fit preserves the shape very well",
         subtitle = "importantly, it cannot be used to back out counts") +
  theme_minimal()

# and how to scale this to operate on multiple groups?
# make a function!

#' @param data tibble with columns `age_diff` and `prev`
#' @param knots a vector of knot locations in `age_diff` units
smooth_prev <- function(data, knots = c(-15,-10,-5,seq(-3,5,by=2),10,15)){
  .age_diffs <- data$age_diff
  data %>%
    glm(sqrt(prev) ~ ns(age_diff,
                        knots = knots),
        family = binomial(link = "logit"), data = .) |>
    predict(newdata = data, type=  "response") |>
    as.data.frame() |>
    rename(prev_hat = 1) |>
    mutate(age_diff = .age_diffs,
           prev_hat = prev_hat ^ 2,
           prev_hat = prev_hat / sum(prev_hat)) |>
    left_join(data, by = join_by(age_diff))
}

# how to use on all strata
data |>
  mutate(strata = 1, .before = 1) |>
  group_modify(~smooth_prev(data = .), .by = strata)
	library(tidyverse)
	library(splines)

	N <- 2000

	x <- rnorm(N,mean = 2, sd = 5) \|>
	round() \|>
	table()

	age_diff <- x \|> names() \|> as.integer()

	data <-
	tibble(age_diff = age_diff,
	n = as.integer(x)) \|>
	complete(age_diff = -20:20,
	fill = list(n=0)) \|>
	mutate(prev = n / sum(n),
	prev = prev ^ 2,
	prev = prev / sum(prev),
	n = round(prev * N))

	# Examine simulated age-diff prevalence:
	data \|>
	ggplot(aes(x = age_diff, y = prev)) +
	geom_line() +
	labs(title = "This distribution looks qualitatively\nlike age difference prevalence",
	subtitle = "Note, we also have 0s in tails") +
	theme_minimal()

	# Problem: we'd like to share these data, but we can't share
	# (1) counts below 5 (unless 0)
	# (2) prevalence values that can be used to re-derive exact counts
	# For this reason, we want to smooth the prevalence, but in such a way as to
	# preserve the sharp peak and tail activity. The following is an ad hoc solution.

	knots <- c(-15,-10,-5,seq(-3,5,by=2),10,15)

	prev_pred <-
	data %>%
	glm(sqrt(prev) ~ ns(age_diff,
	knots = knots),
	family = binomial(link = "logit"), data = .) \|>
	predict(newdata = data, type= "response") \|>
	as.data.frame() \|>
	rename(prev_hat = 1) \|>
	mutate(age_diff = -20:20,
	prev_hat = prev_hat ^ 2,
	prev_hat = prev_hat / sum(prev_hat))

	# Examine fit against simulated age-diff prevalence:
	prev_pred \|>
	ggplot(aes(x = age_diff, y = prev_hat)) +
	geom_line() +
	geom_point(data = data, aes(y=prev)) +
	labs(title = "This fit preserves the shape very well",
	subtitle = "importantly, it cannot be used to back out counts") +
	theme_minimal()

	# and how to scale this to operate on multiple groups?
	# make a function!

	#' @param data tibble with columns `age_diff` and `prev`
	#' @param knots a vector of knot locations in `age_diff` units
	smooth_prev <- function(data, knots = c(-15,-10,-5,seq(-3,5,by=2),10,15)){
	.age_diffs <- data$age_diff
	data %>%
	glm(sqrt(prev) ~ ns(age_diff,
	knots = knots),
	family = binomial(link = "logit"), data = .) \|>
	predict(newdata = data, type= "response") \|>
	as.data.frame() \|>
	rename(prev_hat = 1) \|>
	mutate(age_diff = .age_diffs,
	prev_hat = prev_hat ^ 2,
	prev_hat = prev_hat / sum(prev_hat)) \|>
	left_join(data, by = join_by(age_diff))
	}

	# how to use on all strata
	data \|>
	mutate(strata = 1, .before = 1) \|>
	group_modify(~smooth_prev(data = .), .by = strata)