timriffe/vglm_comparisons.R

## vglm_comparisons.R

# a little exercise for Angelo and me
library(VGAM)
library(markovchain)
library(tidyverse)
source("https://raw.githubusercontent.com/timriffe/PHDS_HLE_2021/main/Functions.R")

# read in original transition estimates
TR <- read_csv("https://raw.githubusercontent.com/timriffe/PHDS_HLE_2021/main/Data/TRv6.csv")

# example subset:
TRsub <- TR %>% filter(sex == "f",
                       edu == "terciary",
                       time == 1996)

# starting proportions in each state
init <- TRsub[1, c("s1_prop","s2_prop")] %>% unlist()
names(init) <- c("H", "U")

# now we simulate data from which to restimate the transitions,
# for sake of an example dataset

# Make the submatrices of U, the transient matrix
HH <- pi2u(pivec = TRsub$m11, from = "H", to = "H")
HU <- pi2u(pivec = TRsub$m12, from = "H", to = "U")
UH <- pi2u(pivec = TRsub$m21, from = "U", to = "H")
UU <- pi2u(pivec = TRsub$m22, from = "U", to = "U")

# terciary edu females
U <- u2U(HH = HH, # healthy to healthy
         HU = HU, # healthy to unhealthy
         UH = UH, # unhealthy to healthy
         UU = UU) # unhealthy to unhealthy


# complete the Markov matrix with Dead state,
# this turns it into a stochastic matrix (colSums = 1)
U <- cbind(U, 0)
U <- rbind(U, 1 - colSums(U)) # death probabilities are 1-sum(transient_probabilities) ...

# Need to name the dims, concatenating state and age
age_state   <- c(outer(seq(48, 110, by = 2), paste0("::", c("H","U")), paste0), "Dead")
(dimnames(U) <- list(to = age_state, from = age_state))

# ---------------
# switch from Leslie-Caswell to standard Markov orientation!!!!
# Gotta remember this!!!
U <- t(U)
# ---------------

# make s4 transition matrix from markovchain package
# RTFM
mcHLE <- new("markovchain",
             states = rownames(U),
             byrow = TRUE,
             transitionMatrix = U,
             name = "HLE")

# how many sequences should we generate?
# init, using prevalence
N  <- 2e4
Ns <- round(init * N)
set.seed(2022)

RHRS_H  <- replicate(Ns["H"],
                     rmarkovchain(n = 31,
                                  object = mcHLE,
                                  t0 = "48::H",
                                  parallel = TRUE)
)
RHRS_U  <- replicate(Ns["U"],
                     rmarkovchain(n = 31,
                                  object = mcHLE,
                                  t0 = "48::U",
                                  parallel = TRUE)
)

# stick together in one population,
# note, t0 from the random generator is not included
# in the trajectories, so we need to append it to be
# consistent with the other approaches.
RHRS                   <- cbind(rbind("H",RHRS_H), rbind("U",RHRS_U))

# only need the states, don't need the age part of the labels
RHRS_clean             <- gsub(".*:","",RHRS)
a                    <- seq(48, 110, by = 2)
rownames(RHRS_clean) <- a
colnames(RHRS_clean) <- 1:ncol(RHRS_clean)

# reshape to data from which we can estimate transitions
data_trans <-
  RHRS_clean %>%
  as.data.frame() %>%
  rownames_to_column("age") %>%
  pivot_longer(-age, names_to = "id",values_to = "i") %>%
  mutate(age = as.integer(age)) %>%
  arrange(id,age) %>%
  group_by(id) %>%
  mutate(j = lead(i),
         j = if_else(is.na(j),"Dead",j)) %>%
  ungroup() %>%
  mutate(j = case_when(j == "H" ~ 0,
                       j == "U" ~ 1,
                       j == "Dead" ~ 2)) %>%
  filter(i != "Dead")

# First show leverage of refLevel:
mod1 <-
  vgam(j ~ s(age) + i,
       data = data_trans,
       family=multinomial(refLevel = 1))
mod2 <-
   vgam(j ~ s(age) + i,
        data = data_trans,
        family = multinomial(refLevel = 2))
mod3 <-
  vgam(j ~ s(age) + i,
       data = data_trans,
       family = multinomial(refLevel = 3))

object.size(data_trans) %>% print(units = "Mb")

# these are also big because they store the original data!
# s3 <- summary(mod3)
# s2 <- summary(mod2)
# s1 <- summary(mod1)

# data for predicting
newdata <-data_trans %>%
  select(age,i) %>%
  distinct()

# tidy up predictions for easy stacking
pred1 <- bind_cols(
           newdata,
           predict(mod1, newdata, type="response")) %>%
  mutate(variant = "ref1") %>%
  pivot_longer(c(`0`,`1`,`2`), names_to = "j", values_to = "p")

pred2 <- bind_cols(
           newdata,
           predict(mod2, newdata, type="response"))%>%
  mutate(variant = "ref2")%>%
  pivot_longer(c(`0`,`1`,`2`), names_to = "j", values_to = "p")

pred3 <- bind_cols(
          newdata,
          predict(mod3, newdata, type="response"))%>%
  mutate(variant = "ref3")%>%
  pivot_longer(c(`0`,`1`,`2`), names_to = "j", values_to = "p")

# compare with original transitions:
original <-
TRsub %>%
  pivot_longer(m11:m24, names_to = "from_to", values_to = "p") %>%
  mutate(i = case_when(substr(from_to,2,2) == "1" ~ "H",
                       substr(from_to,2,2) == "2" ~ "U",
                       substr(from_to,3,3) == "4" ~ "2"),
         j = case_when(substr(from_to,3,3) == "1" ~ "0",
                       substr(from_to,3,3) == "2" ~ "1",
                       substr(from_to,3,3) == "4" ~ "2"),
         variant = "original") %>%
  select(age,i,variant,j,p)


bind_rows(pred1,pred2,pred3,original) %>%
  ggplot(aes(x=age,y=p,color = as.factor(j), lty = variant)) +
  geom_line() +
  facet_wrap(~i) +
  labs(title = "compare refstate",
       subtitle = "differences with original data mostly due to starting conditions?\n
and a bit due to offset start age\nBut still minor differences between refstate variants")


# now show individual vs count estimates:

# first tabulate data
data_trans_n <-
  data_trans %>%
  select(age,i,j) %>%
  group_by(age,i,j) %>%
  summarize(n = n(), .groups = "drop") %>%
  group_by(age, i) %>%
  # to examine simulated empirical transition probabilities
  mutate(p = n / sum(n))

# refit using weights
mod2n <-
  vgam(j~s(age) + i,
       data = data_trans_n,
       family=multinomial(refLevel=2),
       weights = n)

# compare apples w apples
pred2n <- bind_cols(
  newdata,
  predict(mod2n, newdata, type="response")) %>%
  mutate(variant = "ref2n")%>%
  pivot_longer(c(`0`,`1`,`2`), names_to = "j", values_to = "p")

# compare in plot
bind_rows(pred2n,pred2) %>%
  ggplot(aes(x=age,y=p,color = as.factor(j), lty = variant)) +
  geom_line() +
  facet_wrap(~i) +
  labs(title = "individual vs count data, point estimates, identical")

# So which reflevel should we use? 0? = healthy?

	# a little exercise for Angelo and me
	library(VGAM)
	library(markovchain)
	library(tidyverse)
	source("https://raw.githubusercontent.com/timriffe/PHDS_HLE_2021/main/Functions.R")

	# read in original transition estimates
	TR <- read_csv("https://raw.githubusercontent.com/timriffe/PHDS_HLE_2021/main/Data/TRv6.csv")

	# example subset:
	TRsub <- TR %>% filter(sex == "f",
	edu == "terciary",
	time == 1996)

	# starting proportions in each state
	init <- TRsub[1, c("s1_prop","s2_prop")] %>% unlist()
	names(init) <- c("H", "U")

	# now we simulate data from which to restimate the transitions,
	# for sake of an example dataset

	# Make the submatrices of U, the transient matrix
	HH <- pi2u(pivec = TRsub$m11, from = "H", to = "H")
	HU <- pi2u(pivec = TRsub$m12, from = "H", to = "U")
	UH <- pi2u(pivec = TRsub$m21, from = "U", to = "H")
	UU <- pi2u(pivec = TRsub$m22, from = "U", to = "U")

	# terciary edu females
	U <- u2U(HH = HH, # healthy to healthy
	HU = HU, # healthy to unhealthy
	UH = UH, # unhealthy to healthy
	UU = UU) # unhealthy to unhealthy


	# complete the Markov matrix with Dead state,
	# this turns it into a stochastic matrix (colSums = 1)
	U <- cbind(U, 0)
	U <- rbind(U, 1 - colSums(U)) # death probabilities are 1-sum(transient_probabilities) ...

	# Need to name the dims, concatenating state and age
	age_state <- c(outer(seq(48, 110, by = 2), paste0("::", c("H","U")), paste0), "Dead")
	(dimnames(U) <- list(to = age_state, from = age_state))

	# ---------------
	# switch from Leslie-Caswell to standard Markov orientation!!!!
	# Gotta remember this!!!
	U <- t(U)
	# ---------------

	# make s4 transition matrix from markovchain package
	# RTFM
	mcHLE <- new("markovchain",
	states = rownames(U),
	byrow = TRUE,
	transitionMatrix = U,
	name = "HLE")

	# how many sequences should we generate?
	# init, using prevalence
	N <- 2e4
	Ns <- round(init * N)
	set.seed(2022)

	RHRS_H <- replicate(Ns["H"],
	rmarkovchain(n = 31,
	object = mcHLE,
	t0 = "48::H",
	parallel = TRUE)
	)
	RHRS_U <- replicate(Ns["U"],
	rmarkovchain(n = 31,
	object = mcHLE,
	t0 = "48::U",
	parallel = TRUE)
	)

	# stick together in one population,
	# note, t0 from the random generator is not included
	# in the trajectories, so we need to append it to be
	# consistent with the other approaches.
	RHRS <- cbind(rbind("H",RHRS_H), rbind("U",RHRS_U))

	# only need the states, don't need the age part of the labels
	RHRS_clean <- gsub(".*:","",RHRS)
	a <- seq(48, 110, by = 2)
	rownames(RHRS_clean) <- a
	colnames(RHRS_clean) <- 1:ncol(RHRS_clean)

	# reshape to data from which we can estimate transitions
	data_trans <-
	RHRS_clean %>%
	as.data.frame() %>%
	rownames_to_column("age") %>%
	pivot_longer(-age, names_to = "id",values_to = "i") %>%
	mutate(age = as.integer(age)) %>%
	arrange(id,age) %>%
	group_by(id) %>%
	mutate(j = lead(i),
	j = if_else(is.na(j),"Dead",j)) %>%
	ungroup() %>%
	mutate(j = case_when(j == "H" ~ 0,
	j == "U" ~ 1,
	j == "Dead" ~ 2)) %>%
	filter(i != "Dead")

	# First show leverage of refLevel:
	mod1 <-
	vgam(j ~ s(age) + i,
	data = data_trans,
	family=multinomial(refLevel = 1))
	mod2 <-
	vgam(j ~ s(age) + i,
	data = data_trans,
	family = multinomial(refLevel = 2))
	mod3 <-
	vgam(j ~ s(age) + i,
	data = data_trans,
	family = multinomial(refLevel = 3))

	object.size(data_trans) %>% print(units = "Mb")

	# these are also big because they store the original data!
	# s3 <- summary(mod3)
	# s2 <- summary(mod2)
	# s1 <- summary(mod1)

	# data for predicting
	newdata <-data_trans %>%
	select(age,i) %>%
	distinct()

	# tidy up predictions for easy stacking
	pred1 <- bind_cols(
	newdata,
	predict(mod1, newdata, type="response")) %>%
	mutate(variant = "ref1") %>%
	pivot_longer(c(`0`,`1`,`2`), names_to = "j", values_to = "p")

	pred2 <- bind_cols(
	newdata,
	predict(mod2, newdata, type="response"))%>%
	mutate(variant = "ref2")%>%
	pivot_longer(c(`0`,`1`,`2`), names_to = "j", values_to = "p")

	pred3 <- bind_cols(
	newdata,
	predict(mod3, newdata, type="response"))%>%
	mutate(variant = "ref3")%>%
	pivot_longer(c(`0`,`1`,`2`), names_to = "j", values_to = "p")

	# compare with original transitions:
	original <-
	TRsub %>%
	pivot_longer(m11:m24, names_to = "from_to", values_to = "p") %>%
	mutate(i = case_when(substr(from_to,2,2) == "1" ~ "H",
	substr(from_to,2,2) == "2" ~ "U",
	substr(from_to,3,3) == "4" ~ "2"),
	j = case_when(substr(from_to,3,3) == "1" ~ "0",
	substr(from_to,3,3) == "2" ~ "1",
	substr(from_to,3,3) == "4" ~ "2"),
	variant = "original") %>%
	select(age,i,variant,j,p)


	bind_rows(pred1,pred2,pred3,original) %>%
	ggplot(aes(x=age,y=p,color = as.factor(j), lty = variant)) +
	geom_line() +
	facet_wrap(~i) +
	labs(title = "compare refstate",
	subtitle = "differences with original data mostly due to starting conditions?\n
	and a bit due to offset start age\nBut still minor differences between refstate variants")


	# now show individual vs count estimates:

	# first tabulate data
	data_trans_n <-
	data_trans %>%
	select(age,i,j) %>%
	group_by(age,i,j) %>%
	summarize(n = n(), .groups = "drop") %>%
	group_by(age, i) %>%
	# to examine simulated empirical transition probabilities
	mutate(p = n / sum(n))

	# refit using weights
	mod2n <-
	vgam(j~s(age) + i,
	data = data_trans_n,
	family=multinomial(refLevel=2),
	weights = n)

	# compare apples w apples
	pred2n <- bind_cols(
	newdata,
	predict(mod2n, newdata, type="response")) %>%
	mutate(variant = "ref2n")%>%
	pivot_longer(c(`0`,`1`,`2`), names_to = "j", values_to = "p")

	# compare in plot
	bind_rows(pred2n,pred2) %>%
	ggplot(aes(x=age,y=p,color = as.factor(j), lty = variant)) +
	geom_line() +
	facet_wrap(~i) +
	labs(title = "individual vs count data, point estimates, identical")

	# So which reflevel should we use? 0? = healthy?