mcfrank/gam_cdi.R

## gam_cdi.R
library(tidyverse)
library(gamlss)

d <- readxl::read_excel("sample_data.xlsx")

# model
max_vocab <- max(d$`Vocabulary production`)

# transformation to 0-1 for beta model
# note that beta data cannot be exactly 0 or 1, it may be necessary to add/subtract .001 for data including 0s and 1s
mod_data <- d |>
  mutate(vocab = `Vocabulary production`,
         age = `Age in months`) |>
  select(vocab, age) |>
  mutate(vocab = vocab / max_vocab) |>
  filter(vocab > 0, vocab < 1)

# gam will only work with complete data
mod_data <- mod_data[complete.cases(mod_data),]

# fit model
# note very smooth, monotonic splines on both mean and SD of age
gam_mod <- gamlss(vocab ~ pbm(age, lambda = 10000),
                 sigma.formula = ~ pbm(age, lambda = 10000),
                 family = BE,
                 control = gamlss.control(c.crit = .1),
                 data = mod_data)

# output appropriate percentiles and reshape for plotting
centiles <- centiles.pred(gam_mod, cent = c(10, 25, 50, 75, 90),
                              xname = "age", xvalues = 8:18,
                              data = mod_data) |>
    pivot_longer(-x, names_to = "percentile", values_to = "predicted") |>
    mutate(predicted = predicted * max_vocab)
	library(tidyverse)
	library(gamlss)

	d <- readxl::read_excel("sample_data.xlsx")

	# model
	max_vocab <- max(d$`Vocabulary production`)

	# transformation to 0-1 for beta model
	# note that beta data cannot be exactly 0 or 1, it may be necessary to add/subtract .001 for data including 0s and 1s
	mod_data <- d \|>
	mutate(vocab = `Vocabulary production`,
	age = `Age in months`) \|>
	select(vocab, age) \|>
	mutate(vocab = vocab / max_vocab) \|>
	filter(vocab > 0, vocab < 1)

	# gam will only work with complete data
	mod_data <- mod_data[complete.cases(mod_data),]

	# fit model
	# note very smooth, monotonic splines on both mean and SD of age
	gam_mod <- gamlss(vocab ~ pbm(age, lambda = 10000),
	sigma.formula = ~ pbm(age, lambda = 10000),
	family = BE,
	control = gamlss.control(c.crit = .1),
	data = mod_data)

	# output appropriate percentiles and reshape for plotting
	centiles <- centiles.pred(gam_mod, cent = c(10, 25, 50, 75, 90),
	xname = "age", xvalues = 8:18,
	data = mod_data) \|>
	pivot_longer(-x, names_to = "percentile", values_to = "predicted") \|>
	mutate(predicted = predicted * max_vocab)