samuelsaari/z1det_MVE_biofam.R

## z1det_MVE_biofam.R
#---------------------------------------------------------
# libraries
rm(list=ls())

library(flexmix) # GMM & LCGM
library(TraMineR) # example data
library(OpenRepGrid)# random words for headings
library(khroma) # color palletttes
library(tidyverse)
library(car)
library(jtools) # for a nice theme


#---------------------------------------------------------
# parameters & options
TEST_RUN=T
set.seed(102)
theme_set(theme_nice(base_family = 'Consolas' ))

#---------------------------------------------------------
# loading and wrangling data
data(biofam)
biofam
biofam <- biofam %>% rename(gender=sex)
d <- biofam %>% select(.,gender,starts_with('a'),p02r01)
d$id <- 1:nrow(d)
d <- d %>% pivot_longer(.,names_to = "age",cols = starts_with('a'),values_to="rel_stat")

d <- d %>% mutate(across(c(gender,rel_stat), as.factor))
d <-  d %>% transform(age=str_replace(age,"a",""))
d <- d %>% mutate(age=as.integer(age))
d <- tibble(d)
# 1 single
# 2 married
# 3 child
# 4 divorced
d$rel_stat <- car::recode(d$rel_stat,"0=1;1=1;2=2;3=2;4=3;5=3;6=3;7=4")

#---------------------------------------------------------
# making alternative ways to run the model
if (TEST_RUN) {
  d <- d %>% filter(p02r01 %in% c("no denomination or religion"))
  nr_of_classes <- 3
} else {
  nr_of_classes <- 4
}
vector_of_chosen_classes <- 1:nr_of_classes
levels_rel_stat <- 1:4 # note that this is defined manually


#----------------------------------------------------------------------------------
#----------------------------------------------------------------------------------
#----------------------------------------------------------------------------------
# running Latent class growth modelling
lcgm_formula <- as.formula(rel_stat~age + I(age^2) + gender + gender:age)
lcgm <- flexmix::stepFlexmix(.~ .| id,
                             data=d,
                             k=nr_of_classes, # would be 1:12 in real analysis
                             nrep=1, # would be 50 in real analysis to avoid local maxima
                             control = list(iter.max = 500, minprior = 0),
                             model = flexmix::FLXMRmultinom(lcgm_formula,varFix=T,fixed = ~0))
#---------------------------------------------------------------------------------
#----------------------------------------------------------------------------------
#----------------------------------------------------------------------------------


#----------------------------------------------------------------------------------
#fitting the values
fitted_lcgm <- fitted(lcgm)
fitted_tibbles <- lapply(fitted_lcgm, function(x) cbind(x,d$age,d$gender))
fitted_tibbles <- lapply(fitted_tibbles,function(x) setNames(as_tibble(x,.name_repair = "minimal"),c(levels_rel_stat,"age","gender") ))
fitted_tibbles_long <-purrr::map(fitted_tibbles, function(x) {
  pivot_longer(data=x,cols = -all_of(c("age","gender")), names_to = "rel_stat", values_to = "probability")
} )
fitted_tibbles_long <- purrr::map(fitted_tibbles_long,distinct) # remove duplicate rows

# have tried two altenative approaches to predict the probabilites - predicting values by age and gender as opposed to fitting to original data,and calculating the fitted values by hand (https://www.ibm.com/support/pages/compute-predicted-probabilities-multinomial-logistic-new-cases-or-outside-spss) with identical results

#----------------------------------------------------------------------------------
# helpers for plotting

text_size <- 12
title_size <- text_size*1.2
line_size=2
line_size_legend <- line_size*1.75
gender_labels <- c(`1`="\U2642",`2`="\U2640")

rw <- function() {
  randomWords(nr_of_classes)
}

(class_titles <- paste(vector_of_chosen_classes,"class:",rw(),rw(),rw()))


#--------------------------------------------------------------------------------
# plotting

plot_single_class_line <- function(CLASS_DATA,TITLE){
  ggplot(CLASS_DATA, aes(x = age,y = probability,color=rel_stat)) +
    geom_line(size=line_size) +
    scale_color_vibrant() +
    guides(color = guide_legend(reverse = TRUE))  +
    # common for both plots +
    ggtitle(TITLE) +
    labs (x = NULL, y=NULL) +
    facet_wrap(~ gender,strip.position="left",labeller=labeller(gender=gender_labels))+
    theme(text = element_text(size=text_size),
          #legend.key.size = unit(1,"pt"),
          legend.title=element_blank(),
          plot.title = element_text(size =title_size,margin=margin(b=5)),
          strip.text.y.left= element_text(angle=0, size=text_size*1.6))
}

plot_single_class_line(fitted_tibbles_long[[1]],TITLE=class_titles[1])

class_plot_list_line<- purrr::map(vector_of_chosen_classes,function(i) plot_single_class_line(fitted_tibbles_long[[i]],TITLE=class_titles[i]))

(plot_lcgm_line <- do.call(ggpubr::ggarrange,c(class_plot_list_line, list(common.legend = TRUE, legend = "bottom",ncol=1))))
	#---------------------------------------------------------
	# libraries
	rm(list=ls())

	library(flexmix) # GMM & LCGM
	library(TraMineR) # example data
	library(OpenRepGrid)# random words for headings
	library(khroma) # color palletttes
	library(tidyverse)
	library(car)
	library(jtools) # for a nice theme


	#---------------------------------------------------------
	# parameters & options
	TEST_RUN=T
	set.seed(102)
	theme_set(theme_nice(base_family = 'Consolas' ))

	#---------------------------------------------------------
	# loading and wrangling data
	data(biofam)
	biofam
	biofam <- biofam %>% rename(gender=sex)
	d <- biofam %>% select(.,gender,starts_with('a'),p02r01)
	d$id <- 1:nrow(d)
	d <- d %>% pivot_longer(.,names_to = "age",cols = starts_with('a'),values_to="rel_stat")

	d <- d %>% mutate(across(c(gender,rel_stat), as.factor))
	d <- d %>% transform(age=str_replace(age,"a",""))
	d <- d %>% mutate(age=as.integer(age))
	d <- tibble(d)
	# 1 single
	# 2 married
	# 3 child
	# 4 divorced
	d$rel_stat <- car::recode(d$rel_stat,"0=1;1=1;2=2;3=2;4=3;5=3;6=3;7=4")

	#---------------------------------------------------------
	# making alternative ways to run the model
	if (TEST_RUN) {
	d <- d %>% filter(p02r01 %in% c("no denomination or religion"))
	nr_of_classes <- 3
	} else {
	nr_of_classes <- 4
	}
	vector_of_chosen_classes <- 1:nr_of_classes
	levels_rel_stat <- 1:4 # note that this is defined manually


	#----------------------------------------------------------------------------------
	#----------------------------------------------------------------------------------
	#----------------------------------------------------------------------------------
	# running Latent class growth modelling
	lcgm_formula <- as.formula(rel_stat~age + I(age^2) + gender + gender:age)
	lcgm <- flexmix::stepFlexmix(.~ .\| id,
	data=d,
	k=nr_of_classes, # would be 1:12 in real analysis
	nrep=1, # would be 50 in real analysis to avoid local maxima
	control = list(iter.max = 500, minprior = 0),
	model = flexmix::FLXMRmultinom(lcgm_formula,varFix=T,fixed = ~0))
	#---------------------------------------------------------------------------------
	#----------------------------------------------------------------------------------
	#----------------------------------------------------------------------------------


	#----------------------------------------------------------------------------------
	#fitting the values
	fitted_lcgm <- fitted(lcgm)
	fitted_tibbles <- lapply(fitted_lcgm, function(x) cbind(x,d$age,d$gender))
	fitted_tibbles <- lapply(fitted_tibbles,function(x) setNames(as_tibble(x,.name_repair = "minimal"),c(levels_rel_stat,"age","gender") ))
	fitted_tibbles_long <-purrr::map(fitted_tibbles, function(x) {
	pivot_longer(data=x,cols = -all_of(c("age","gender")), names_to = "rel_stat", values_to = "probability")
	} )
	fitted_tibbles_long <- purrr::map(fitted_tibbles_long,distinct) # remove duplicate rows

	# have tried two altenative approaches to predict the probabilites - predicting values by age and gender as opposed to fitting to original data,and calculating the fitted values by hand (https://www.ibm.com/support/pages/compute-predicted-probabilities-multinomial-logistic-new-cases-or-outside-spss) with identical results

	#----------------------------------------------------------------------------------
	# helpers for plotting

	text_size <- 12
	title_size <- text_size*1.2
	line_size=2
	line_size_legend <- line_size*1.75
	gender_labels <- c(`1`="\U2642",`2`="\U2640")

	rw <- function() {
	randomWords(nr_of_classes)
	}

	(class_titles <- paste(vector_of_chosen_classes,"class:",rw(),rw(),rw()))


	#--------------------------------------------------------------------------------
	# plotting

	plot_single_class_line <- function(CLASS_DATA,TITLE){
	ggplot(CLASS_DATA, aes(x = age,y = probability,color=rel_stat)) +
	geom_line(size=line_size) +
	scale_color_vibrant() +
	guides(color = guide_legend(reverse = TRUE)) +
	# common for both plots +
	ggtitle(TITLE) +
	labs (x = NULL, y=NULL) +
	facet_wrap(~ gender,strip.position="left",labeller=labeller(gender=gender_labels))+
	theme(text = element_text(size=text_size),
	#legend.key.size = unit(1,"pt"),
	legend.title=element_blank(),
	plot.title = element_text(size =title_size,margin=margin(b=5)),
	strip.text.y.left= element_text(angle=0, size=text_size*1.6))
	}

	plot_single_class_line(fitted_tibbles_long[[1]],TITLE=class_titles[1])

	class_plot_list_line<- purrr::map(vector_of_chosen_classes,function(i) plot_single_class_line(fitted_tibbles_long[[i]],TITLE=class_titles[i]))

	(plot_lcgm_line <- do.call(ggpubr::ggarrange,c(class_plot_list_line, list(common.legend = TRUE, legend = "bottom",ncol=1))))