Last active
June 7, 2022 20:13
-
-
Save timriffe/13e47a21ba50e27a09af8a6e71cecebf to your computer and use it in GitHub Desktop.
Comparisons of refstate leverage, as well as
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# a little exercise for Angelo and me | |
library(VGAM) | |
library(markovchain) | |
library(tidyverse) | |
source("https://raw.githubusercontent.com/timriffe/PHDS_HLE_2021/main/Functions.R") | |
# read in original transition estimates | |
TR <- read_csv("https://raw.githubusercontent.com/timriffe/PHDS_HLE_2021/main/Data/TRv6.csv") | |
# example subset: | |
TRsub <- TR %>% filter(sex == "f", | |
edu == "terciary", | |
time == 1996) | |
# starting proportions in each state | |
init <- TRsub[1, c("s1_prop","s2_prop")] %>% unlist() | |
names(init) <- c("H", "U") | |
# now we simulate data from which to restimate the transitions, | |
# for sake of an example dataset | |
# Make the submatrices of U, the transient matrix | |
HH <- pi2u(pivec = TRsub$m11, from = "H", to = "H") | |
HU <- pi2u(pivec = TRsub$m12, from = "H", to = "U") | |
UH <- pi2u(pivec = TRsub$m21, from = "U", to = "H") | |
UU <- pi2u(pivec = TRsub$m22, from = "U", to = "U") | |
# terciary edu females | |
U <- u2U(HH = HH, # healthy to healthy | |
HU = HU, # healthy to unhealthy | |
UH = UH, # unhealthy to healthy | |
UU = UU) # unhealthy to unhealthy | |
# complete the Markov matrix with Dead state, | |
# this turns it into a stochastic matrix (colSums = 1) | |
U <- cbind(U, 0) | |
U <- rbind(U, 1 - colSums(U)) # death probabilities are 1-sum(transient_probabilities) ... | |
# Need to name the dims, concatenating state and age | |
age_state <- c(outer(seq(48, 110, by = 2), paste0("::", c("H","U")), paste0), "Dead") | |
(dimnames(U) <- list(to = age_state, from = age_state)) | |
# --------------- | |
# switch from Leslie-Caswell to standard Markov orientation!!!! | |
# Gotta remember this!!! | |
U <- t(U) | |
# --------------- | |
# make s4 transition matrix from markovchain package | |
# RTFM | |
mcHLE <- new("markovchain", | |
states = rownames(U), | |
byrow = TRUE, | |
transitionMatrix = U, | |
name = "HLE") | |
# how many sequences should we generate? | |
# init, using prevalence | |
N <- 2e4 | |
Ns <- round(init * N) | |
set.seed(2022) | |
RHRS_H <- replicate(Ns["H"], | |
rmarkovchain(n = 31, | |
object = mcHLE, | |
t0 = "48::H", | |
parallel = TRUE) | |
) | |
RHRS_U <- replicate(Ns["U"], | |
rmarkovchain(n = 31, | |
object = mcHLE, | |
t0 = "48::U", | |
parallel = TRUE) | |
) | |
# stick together in one population, | |
# note, t0 from the random generator is not included | |
# in the trajectories, so we need to append it to be | |
# consistent with the other approaches. | |
RHRS <- cbind(rbind("H",RHRS_H), rbind("U",RHRS_U)) | |
# only need the states, don't need the age part of the labels | |
RHRS_clean <- gsub(".*:","",RHRS) | |
a <- seq(48, 110, by = 2) | |
rownames(RHRS_clean) <- a | |
colnames(RHRS_clean) <- 1:ncol(RHRS_clean) | |
# reshape to data from which we can estimate transitions | |
data_trans <- | |
RHRS_clean %>% | |
as.data.frame() %>% | |
rownames_to_column("age") %>% | |
pivot_longer(-age, names_to = "id",values_to = "i") %>% | |
mutate(age = as.integer(age)) %>% | |
arrange(id,age) %>% | |
group_by(id) %>% | |
mutate(j = lead(i), | |
j = if_else(is.na(j),"Dead",j)) %>% | |
ungroup() %>% | |
mutate(j = case_when(j == "H" ~ 0, | |
j == "U" ~ 1, | |
j == "Dead" ~ 2)) %>% | |
filter(i != "Dead") | |
# First show leverage of refLevel: | |
mod1 <- | |
vgam(j ~ s(age) + i, | |
data = data_trans, | |
family=multinomial(refLevel = 1)) | |
mod2 <- | |
vgam(j ~ s(age) + i, | |
data = data_trans, | |
family = multinomial(refLevel = 2)) | |
mod3 <- | |
vgam(j ~ s(age) + i, | |
data = data_trans, | |
family = multinomial(refLevel = 3)) | |
object.size(data_trans) %>% print(units = "Mb") | |
# these are also big because they store the original data! | |
# s3 <- summary(mod3) | |
# s2 <- summary(mod2) | |
# s1 <- summary(mod1) | |
# data for predicting | |
newdata <-data_trans %>% | |
select(age,i) %>% | |
distinct() | |
# tidy up predictions for easy stacking | |
pred1 <- bind_cols( | |
newdata, | |
predict(mod1, newdata, type="response")) %>% | |
mutate(variant = "ref1") %>% | |
pivot_longer(c(`0`,`1`,`2`), names_to = "j", values_to = "p") | |
pred2 <- bind_cols( | |
newdata, | |
predict(mod2, newdata, type="response"))%>% | |
mutate(variant = "ref2")%>% | |
pivot_longer(c(`0`,`1`,`2`), names_to = "j", values_to = "p") | |
pred3 <- bind_cols( | |
newdata, | |
predict(mod3, newdata, type="response"))%>% | |
mutate(variant = "ref3")%>% | |
pivot_longer(c(`0`,`1`,`2`), names_to = "j", values_to = "p") | |
# compare with original transitions: | |
original <- | |
TRsub %>% | |
pivot_longer(m11:m24, names_to = "from_to", values_to = "p") %>% | |
mutate(i = case_when(substr(from_to,2,2) == "1" ~ "H", | |
substr(from_to,2,2) == "2" ~ "U", | |
substr(from_to,3,3) == "4" ~ "2"), | |
j = case_when(substr(from_to,3,3) == "1" ~ "0", | |
substr(from_to,3,3) == "2" ~ "1", | |
substr(from_to,3,3) == "4" ~ "2"), | |
variant = "original") %>% | |
select(age,i,variant,j,p) | |
bind_rows(pred1,pred2,pred3,original) %>% | |
ggplot(aes(x=age,y=p,color = as.factor(j), lty = variant)) + | |
geom_line() + | |
facet_wrap(~i) + | |
labs(title = "compare refstate", | |
subtitle = "differences with original data mostly due to starting conditions?\n | |
and a bit due to offset start age\nBut still minor differences between refstate variants") | |
# now show individual vs count estimates: | |
# first tabulate data | |
data_trans_n <- | |
data_trans %>% | |
select(age,i,j) %>% | |
group_by(age,i,j) %>% | |
summarize(n = n(), .groups = "drop") %>% | |
group_by(age, i) %>% | |
# to examine simulated empirical transition probabilities | |
mutate(p = n / sum(n)) | |
# refit using weights | |
mod2n <- | |
vgam(j~s(age) + i, | |
data = data_trans_n, | |
family=multinomial(refLevel=2), | |
weights = n) | |
# compare apples w apples | |
pred2n <- bind_cols( | |
newdata, | |
predict(mod2n, newdata, type="response")) %>% | |
mutate(variant = "ref2n")%>% | |
pivot_longer(c(`0`,`1`,`2`), names_to = "j", values_to = "p") | |
# compare in plot | |
bind_rows(pred2n,pred2) %>% | |
ggplot(aes(x=age,y=p,color = as.factor(j), lty = variant)) + | |
geom_line() + | |
facet_wrap(~i) + | |
labs(title = "individual vs count data, point estimates, identical") | |
# So which reflevel should we use? 0? = healthy? | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment