Skip to content

Instantly share code, notes, and snippets.

@timriffe
Last active June 7, 2022 20:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save timriffe/13e47a21ba50e27a09af8a6e71cecebf to your computer and use it in GitHub Desktop.
Save timriffe/13e47a21ba50e27a09af8a6e71cecebf to your computer and use it in GitHub Desktop.
Comparisons of refstate leverage, as well as
# a little exercise for Angelo and me
library(VGAM)
library(markovchain)
library(tidyverse)
source("https://raw.githubusercontent.com/timriffe/PHDS_HLE_2021/main/Functions.R")
# read in original transition estimates
TR <- read_csv("https://raw.githubusercontent.com/timriffe/PHDS_HLE_2021/main/Data/TRv6.csv")
# example subset:
TRsub <- TR %>% filter(sex == "f",
edu == "terciary",
time == 1996)
# starting proportions in each state
init <- TRsub[1, c("s1_prop","s2_prop")] %>% unlist()
names(init) <- c("H", "U")
# now we simulate data from which to restimate the transitions,
# for sake of an example dataset
# Make the submatrices of U, the transient matrix
HH <- pi2u(pivec = TRsub$m11, from = "H", to = "H")
HU <- pi2u(pivec = TRsub$m12, from = "H", to = "U")
UH <- pi2u(pivec = TRsub$m21, from = "U", to = "H")
UU <- pi2u(pivec = TRsub$m22, from = "U", to = "U")
# terciary edu females
U <- u2U(HH = HH, # healthy to healthy
HU = HU, # healthy to unhealthy
UH = UH, # unhealthy to healthy
UU = UU) # unhealthy to unhealthy
# complete the Markov matrix with Dead state,
# this turns it into a stochastic matrix (colSums = 1)
U <- cbind(U, 0)
U <- rbind(U, 1 - colSums(U)) # death probabilities are 1-sum(transient_probabilities) ...
# Need to name the dims, concatenating state and age
age_state <- c(outer(seq(48, 110, by = 2), paste0("::", c("H","U")), paste0), "Dead")
(dimnames(U) <- list(to = age_state, from = age_state))
# ---------------
# switch from Leslie-Caswell to standard Markov orientation!!!!
# Gotta remember this!!!
U <- t(U)
# ---------------
# make s4 transition matrix from markovchain package
# RTFM
mcHLE <- new("markovchain",
states = rownames(U),
byrow = TRUE,
transitionMatrix = U,
name = "HLE")
# how many sequences should we generate?
# init, using prevalence
N <- 2e4
Ns <- round(init * N)
set.seed(2022)
RHRS_H <- replicate(Ns["H"],
rmarkovchain(n = 31,
object = mcHLE,
t0 = "48::H",
parallel = TRUE)
)
RHRS_U <- replicate(Ns["U"],
rmarkovchain(n = 31,
object = mcHLE,
t0 = "48::U",
parallel = TRUE)
)
# stick together in one population,
# note, t0 from the random generator is not included
# in the trajectories, so we need to append it to be
# consistent with the other approaches.
RHRS <- cbind(rbind("H",RHRS_H), rbind("U",RHRS_U))
# only need the states, don't need the age part of the labels
RHRS_clean <- gsub(".*:","",RHRS)
a <- seq(48, 110, by = 2)
rownames(RHRS_clean) <- a
colnames(RHRS_clean) <- 1:ncol(RHRS_clean)
# reshape to data from which we can estimate transitions
data_trans <-
RHRS_clean %>%
as.data.frame() %>%
rownames_to_column("age") %>%
pivot_longer(-age, names_to = "id",values_to = "i") %>%
mutate(age = as.integer(age)) %>%
arrange(id,age) %>%
group_by(id) %>%
mutate(j = lead(i),
j = if_else(is.na(j),"Dead",j)) %>%
ungroup() %>%
mutate(j = case_when(j == "H" ~ 0,
j == "U" ~ 1,
j == "Dead" ~ 2)) %>%
filter(i != "Dead")
# First show leverage of refLevel:
mod1 <-
vgam(j ~ s(age) + i,
data = data_trans,
family=multinomial(refLevel = 1))
mod2 <-
vgam(j ~ s(age) + i,
data = data_trans,
family = multinomial(refLevel = 2))
mod3 <-
vgam(j ~ s(age) + i,
data = data_trans,
family = multinomial(refLevel = 3))
object.size(data_trans) %>% print(units = "Mb")
# these are also big because they store the original data!
# s3 <- summary(mod3)
# s2 <- summary(mod2)
# s1 <- summary(mod1)
# data for predicting
newdata <-data_trans %>%
select(age,i) %>%
distinct()
# tidy up predictions for easy stacking
pred1 <- bind_cols(
newdata,
predict(mod1, newdata, type="response")) %>%
mutate(variant = "ref1") %>%
pivot_longer(c(`0`,`1`,`2`), names_to = "j", values_to = "p")
pred2 <- bind_cols(
newdata,
predict(mod2, newdata, type="response"))%>%
mutate(variant = "ref2")%>%
pivot_longer(c(`0`,`1`,`2`), names_to = "j", values_to = "p")
pred3 <- bind_cols(
newdata,
predict(mod3, newdata, type="response"))%>%
mutate(variant = "ref3")%>%
pivot_longer(c(`0`,`1`,`2`), names_to = "j", values_to = "p")
# compare with original transitions:
original <-
TRsub %>%
pivot_longer(m11:m24, names_to = "from_to", values_to = "p") %>%
mutate(i = case_when(substr(from_to,2,2) == "1" ~ "H",
substr(from_to,2,2) == "2" ~ "U",
substr(from_to,3,3) == "4" ~ "2"),
j = case_when(substr(from_to,3,3) == "1" ~ "0",
substr(from_to,3,3) == "2" ~ "1",
substr(from_to,3,3) == "4" ~ "2"),
variant = "original") %>%
select(age,i,variant,j,p)
bind_rows(pred1,pred2,pred3,original) %>%
ggplot(aes(x=age,y=p,color = as.factor(j), lty = variant)) +
geom_line() +
facet_wrap(~i) +
labs(title = "compare refstate",
subtitle = "differences with original data mostly due to starting conditions?\n
and a bit due to offset start age\nBut still minor differences between refstate variants")
# now show individual vs count estimates:
# first tabulate data
data_trans_n <-
data_trans %>%
select(age,i,j) %>%
group_by(age,i,j) %>%
summarize(n = n(), .groups = "drop") %>%
group_by(age, i) %>%
# to examine simulated empirical transition probabilities
mutate(p = n / sum(n))
# refit using weights
mod2n <-
vgam(j~s(age) + i,
data = data_trans_n,
family=multinomial(refLevel=2),
weights = n)
# compare apples w apples
pred2n <- bind_cols(
newdata,
predict(mod2n, newdata, type="response")) %>%
mutate(variant = "ref2n")%>%
pivot_longer(c(`0`,`1`,`2`), names_to = "j", values_to = "p")
# compare in plot
bind_rows(pred2n,pred2) %>%
ggplot(aes(x=age,y=p,color = as.factor(j), lty = variant)) +
geom_line() +
facet_wrap(~i) +
labs(title = "individual vs count data, point estimates, identical")
# So which reflevel should we use? 0? = healthy?
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment