samuelsaari/z1det_MVE_biofam_POSSIBLE_SOLUTION.R

## z1det_MVE_biofam_POSSIBLE_SOLUTION.R
#-------------------------------------------------------------------------------------------------
# libraries
rm(list=ls())

# install.packages("TraMineR") # example data
# install.packages("OpenRepGrid")# random words for headings
# install.packages("khroma") # color palletttes
# install.packages("tidyverse")
# install.packages("modeldb")
# install.packages("car")
# install.packages("jtools") # for a nice theme

library(flexmix) # GMM & LCGM
library(TraMineR) # example data
library(OpenRepGrid)# random words for headings
library(khroma) # color palletttes
library(tidyverse)
#library(modeldb)
library(car)
library(jtools) # for a nice theme
library(foreach)
library(ggpubr)

theme_set(theme_nice(base_family = 'Consolas' ))

# parameters
TEST_RUN=T
set.seed(102)

# loading and wrangling data
data(biofam)
biofam
biofam <- biofam %>% rename(gender=sex)
d <- biofam %>% select(.,gender,starts_with('a'),p02r01)
d$id <- 1:nrow(d)
d <- d %>% pivot_longer(.,names_to = "age",cols = starts_with('a'),values_to="rel_stat")
d
d <- d %>% mutate(across(c(gender,rel_stat), as.factor))
d <-  d %>% transform(age=str_replace(age,"a",""))
d <- d %>% mutate(age=as.integer(age))
d <- tibble(d)
#d <- d %>% mutate(age=age-14)
d
# 1 single
# 2 married
# 3 child
# 4 divorced
d$rel_stat <- car::recode(d$rel_stat,"0=1;1=1;2=2;3=2;4=3;5=3;6=3;7=4")

#d2 <- d %>% modeldb::add_dummy_variables(rel_stat,auto_values = T)

# making alternative ways to run the model
if (TEST_RUN) {
  d <- d %>% filter(p02r01 %in% c("no denomination or religion"))
  nr_of_classes <- 3
} else {
  nr_of_classes <- 4
}
vector_of_chosen_classes <- 1:nr_of_classes
levels_rel_stat <- 1:4 # note that this is defined manually

AGELOW=15
AGEHIGH=30
NR_OF_YEARS_VECTOR <- AGELOW:AGEHIGH
NR_OF_YEARS=length(NR_OF_YEARS_VECTOR)


#----------------------------------------------------------------------------------
# running Latent class growth modelling
#lcgm_formula <- as.formula(rel_stat~age + I(age^2) + gender + gender:age)
lcgm_formula <- as.formula(rel_stat~ age + I(age^2) +gender + age:gender + I(age^2):gender)
lcgm <- flexmix::flexmix(.~ .| id,
                         data=d,
                         k=nr_of_classes, # would be 1:12 in real analysis
                         #nrep=1, # would be 50 in real analysis to avoid local maxima (and we would use the stepFlexmix function instead)
                         control = list(iter.max = 500, minprior = 0),
                         model = flexmix::FLXMRmultinom(lcgm_formula))


#---------------------------------------------------------------------------------


# finding parameters
(parameter_list <- purrr::map(lcgm@components, function(x) x[[1]]@parameters$coef))

# adding class to original data
d$class_orig <- lcgm@cluster

#-----------------------------------------------------------------------------
# Changing the order of the classes - for data

does_custom_order_matrix_exist <- ifelse(nr_of_classes==3,T,F)

if (does_custom_order_matrix_exist) {
  (order_matrix <- matrix(c(1,3, #the number indicates where the classes were before...
                            2,1,  #the order indicates were we want to have them a.k.a. new classes
                            3,2),
                          ncol=2,byrow=T))
} else {
  order_matrix <- matrix(cbind(rep(vector_of_chosen_classes,2)),ncol=2,byrow=F)
}
order_matrix


# helper function to swap the order of the classes
recode_class <- function(ORIG_CLASS,IS_MALE) {
  column <- ifelse(IS_MALE,1,2)
  vector_of_chosen_classes[ match(ORIG_CLASS,order_matrix[,column]) ] # logic: c(1,2,3)[c(T,F,F)]=1
}


# change order of classses
d <- d %>% mutate(.,class=ifelse(gender=="man",recode_class(class_orig,IS_MALE=TRUE),recode_class(class_orig,IS_MALE=FALSE)))

#-----------------------------------------------------------------------------
# Changing the order of the classes - for parameters and predictions


stratify_coefs_by_gender_helper <- function(COEF_MATRIX,WOMAN_INDICATOR) {
  intercept <- COEF_MATRIX[,"(Intercept)"]+WOMAN_INDICATOR*COEF_MATRIX[,"genderwoman"]
  age <- COEF_MATRIX[,"age"]+WOMAN_INDICATOR*COEF_MATRIX[,"age:genderwoman"]
  age2 <- COEF_MATRIX[,"I(age^2)"] +WOMAN_INDICATOR*COEF_MATRIX[,"I(age^2):genderwoman"]
  t(cbind(intercept,age,age2))
}

stratify_coefs_by_gender <- function(COEF_MATRIX) {
  list("male_coefs"=stratify_coefs_by_gender_helper(COEF_MATRIX,0), "female_coefs"=stratify_coefs_by_gender_helper(COEF_MATRIX,1))
}

# after this, order original, but seperate estimates for men/women
parameter_list_by_gender_orig_order <- purrr::map(parameter_list,stratify_coefs_by_gender)
parameter_list_by_gender_orig_order


# making a copy of the parameter list that is stratified by gender
parameter_list_by_gender <- parameter_list_by_gender_orig_order


# substituting new values based on the order_matrix (defined above)
for (j in vector_of_chosen_classes) {
  for (i in 1:2) {
    class_name=paste0('class',i)
    coef_gender <- ifelse(i==1,"male_coefs","female_coefs")
    column <- ifelse(i==1,1,2)
    new_index <- which(order_matrix[,column]==j)
    parameter_list_by_gender[[new_index]][[i]] <- parameter_list_by_gender_orig_order[[j]][[i]]
  }
}
parameter_list_by_gender_orig_order
parameter_list_by_gender # now the order is changed


# we will predict the values based on all combinations of age, age^2 and gender
prediction_dataframe <- expand.grid(c=1,age=NR_OF_YEARS_VECTOR) # gender is included implicitly (coef_matrices different for both genders)
prediction_dataframe$age2 <- prediction_dataframe$age^2
(prediction_matrix <- as.matrix(prediction_dataframe))

predict_one_gender <- function(COEF_MATRIX,PREDICTION_MATRIX=prediction_matrix) {
  #add vector for the number of rel_stats!!!!!!!!!!!!!!! Obs nb!
  # https://www.ibm.com/support/pages/compute-predicted-probabilities-multinomial-logistic-new-cases-or-outside-spss
  predictions <- PREDICTION_MATRIX %*% COEF_MATRIX
  predictions <- exp(predictions)
  predictions <- cbind(1,predictions) # adding ref category
  predictions <- cbind(predictions,rowSums(predictions))
  colnames(predictions) <- c(levels_rel_stat,"row_sums")
  probabilities <- apply(predictions[,levels_rel_stat],2,function(x) x/predictions[,"row_sums"] )
  return(probabilities)
}


predict_one_class <- function(COEF_MATRIX_LIST,PREDICTION_MATRIX=prediction_matrix) {
  male_probabilities <- predict_one_gender(COEF_MATRIX_LIST$male_coefs)
  female_probabilities <- predict_one_gender(COEF_MATRIX_LIST$female_coefs)
  return(list("male_probabilities"=male_probabilities,"female_probabilities"=female_probabilities))
}


prediction_by_class_and_gender <- purrr::map(parameter_list_by_gender,predict_one_class)

# now we have all the probabilities we need, and will add some info and shape the data


add_gender_to_prediction_matrix <- function(my_list) {
  male_probabilities <- cbind(my_list$male_probabilities,gender=1)
  female_probabilities  <- cbind(my_list$female_probabilities,gender=2)
  return(list("male_probabilities"=male_probabilities,"female_probabilities"=female_probabilities))
}


combine_male_and_female_prediction <- function(my_list){
  my_list <- purrr::map(my_list,function(x) cbind(x,age=NR_OF_YEARS_VECTOR)) # add age
  my_list <- add_gender_to_prediction_matrix(my_list)
  combined_matrix <- do.call(rbind,my_list)
  return(combined_matrix)
}


prediction_by_class <- purrr::map(prediction_by_class_and_gender,
                                                 combine_male_and_female_prediction)

pivot_predictions_to_long <- function(combined_matrix) {
  combined_tibble <- setNames(as_tibble(combined_matrix,.name_repair = "minimal"),c(levels_rel_stat,"age","gender"))
  long_matrix <- pivot_longer(data=combined_tibble,
                              cols = -all_of(c("age","gender")),
                              names_to = "rel_stat",
                              values_to = "probability")
  return(long_matrix)
}


prediction_tibbles_long <- purrr::map(prediction_by_class,
                                       pivot_predictions_to_long)
# now we have the predicted data in the right format.

#--------------------------------------------------------------------------------------------------
# let us confirm that the results would be identical if we fitted the data automatically and not manually as above.
# The order will be different

fitted_lcgm <- fitted(lcgm)
fitted_tibbles <- lapply(fitted_lcgm, function(x) cbind(x,d$age,d$gender))
fitted_tibbles <- lapply(fitted_tibbles,function(x) setNames(as_tibble(x,.name_repair = "minimal"),c(levels_rel_stat,"age","gender") ))
fitted_tibbles_long <-purrr::map(fitted_tibbles, function(x) {
  pivot_longer(data=x,cols = -all_of(c("age","gender")), names_to = "rel_stat", values_to = "probability")
} )
fitted_tibbles_long <- purrr::map(fitted_tibbles_long,distinct) # remove duplicate rows


# helpers for plotting
text_size <- 12
title_size <- text_size*1.2
line_size=1.6
line_size_legend <- line_size*1.75
gender_labels <- c(`1`="\U2642",`2`="\U2640")

rw <- function() {
  randomWords(nr_of_classes)
}

(class_titles <- paste(vector_of_chosen_classes,"class:",rw(),rw(),rw()))


#--------------------------------------------------------------------------------
# plotting

plot_single_class_line <- function(CLASS_DATA,TITLE){
  ggplot(CLASS_DATA, aes(x = age,y = probability,color=rel_stat)) +
    geom_line(linewidth=line_size) +
    scale_color_vibrant() +
    guides(color = guide_legend(reverse = TRUE))  +
    # common for both plots +
    ggtitle(TITLE) +
    labs (x = NULL, y=NULL) +
    facet_wrap(~ gender,strip.position="left",labeller=labeller(gender=gender_labels))+
    theme(text = element_text(size=text_size),
          #legend.key.size = unit(1,"pt"),
          legend.title=element_blank(),
          plot.title = element_text(size =title_size,margin=margin(b=5)),
          strip.text.y.left= element_text(angle=0, size=text_size*1.6))
}

#plot_single_class_line(fitted_tibbles_long[[1]],TITLE=class_titles[1])

# saving the plots
class_plot_list_line<- purrr::map(vector_of_chosen_classes,function(i) plot_single_class_line(fitted_tibbles_long[[i]],TITLE=class_titles[i]))
class_plot_list_line_prediction <- purrr::map(vector_of_chosen_classes,function(i) plot_single_class_line(prediction_tibbles_long[[i]],TITLE=class_titles[i]))


# arranging the plot lists with ggarrange
(plot_lcgm_line <- do.call(ggpubr::ggarrange,c(class_plot_list_line, list(common.legend = TRUE, legend = "bottom",ncol=1))))
(plot_lcgm_line <- do.call(ggpubr::ggarrange,c(class_plot_list_line, list(common.legend = TRUE, legend = "bottom",ncol=1))))
ggpubr::annotate_figure(plot_lcgm_line, top =text_grob("Original model", size = 20,face="bold"))
(plot_lcgm_line_prediction <- do.call(ggpubr::ggarrange,c(class_plot_list_line_prediction, list(common.legend = TRUE, legend = "bottom",ncol=1))))
ggpubr::annotate_figure(plot_lcgm_line_prediction, top =text_grob("Label switching corrected", size = 20,face="bold"))
	#-------------------------------------------------------------------------------------------------
	# libraries
	rm(list=ls())

	# install.packages("TraMineR") # example data
	# install.packages("OpenRepGrid")# random words for headings
	# install.packages("khroma") # color palletttes
	# install.packages("tidyverse")
	# install.packages("modeldb")
	# install.packages("car")
	# install.packages("jtools") # for a nice theme

	library(flexmix) # GMM & LCGM
	library(TraMineR) # example data
	library(OpenRepGrid)# random words for headings
	library(khroma) # color palletttes
	library(tidyverse)
	#library(modeldb)
	library(car)
	library(jtools) # for a nice theme
	library(foreach)
	library(ggpubr)

	theme_set(theme_nice(base_family = 'Consolas' ))

	# parameters
	TEST_RUN=T
	set.seed(102)

	# loading and wrangling data
	data(biofam)
	biofam
	biofam <- biofam %>% rename(gender=sex)
	d <- biofam %>% select(.,gender,starts_with('a'),p02r01)
	d$id <- 1:nrow(d)
	d <- d %>% pivot_longer(.,names_to = "age",cols = starts_with('a'),values_to="rel_stat")
	d
	d <- d %>% mutate(across(c(gender,rel_stat), as.factor))
	d <- d %>% transform(age=str_replace(age,"a",""))
	d <- d %>% mutate(age=as.integer(age))
	d <- tibble(d)
	#d <- d %>% mutate(age=age-14)
	d
	# 1 single
	# 2 married
	# 3 child
	# 4 divorced
	d$rel_stat <- car::recode(d$rel_stat,"0=1;1=1;2=2;3=2;4=3;5=3;6=3;7=4")

	#d2 <- d %>% modeldb::add_dummy_variables(rel_stat,auto_values = T)

	# making alternative ways to run the model
	if (TEST_RUN) {
	d <- d %>% filter(p02r01 %in% c("no denomination or religion"))
	nr_of_classes <- 3
	} else {
	nr_of_classes <- 4
	}
	vector_of_chosen_classes <- 1:nr_of_classes
	levels_rel_stat <- 1:4 # note that this is defined manually

	AGELOW=15
	AGEHIGH=30
	NR_OF_YEARS_VECTOR <- AGELOW:AGEHIGH
	NR_OF_YEARS=length(NR_OF_YEARS_VECTOR)




	#----------------------------------------------------------------------------------
	# running Latent class growth modelling
	#lcgm_formula <- as.formula(rel_stat~age + I(age^2) + gender + gender:age)
	lcgm_formula <- as.formula(rel_stat~ age + I(age^2) +gender + age:gender + I(age^2):gender)
	lcgm <- flexmix::flexmix(.~ .\| id,
	data=d,
	k=nr_of_classes, # would be 1:12 in real analysis
	#nrep=1, # would be 50 in real analysis to avoid local maxima (and we would use the stepFlexmix function instead)
	control = list(iter.max = 500, minprior = 0),
	model = flexmix::FLXMRmultinom(lcgm_formula))


	#---------------------------------------------------------------------------------


	# finding parameters
	(parameter_list <- purrr::map(lcgm@components, function(x) x[[1]]@parameters$coef))

	# adding class to original data
	d$class_orig <- lcgm@cluster

	#-----------------------------------------------------------------------------
	# Changing the order of the classes - for data

	does_custom_order_matrix_exist <- ifelse(nr_of_classes==3,T,F)

	if (does_custom_order_matrix_exist) {
	(order_matrix <- matrix(c(1,3, #the number indicates where the classes were before...
	2,1, #the order indicates were we want to have them a.k.a. new classes
	3,2),
	ncol=2,byrow=T))
	} else {
	order_matrix <- matrix(cbind(rep(vector_of_chosen_classes,2)),ncol=2,byrow=F)
	}
	order_matrix


	# helper function to swap the order of the classes
	recode_class <- function(ORIG_CLASS,IS_MALE) {
	column <- ifelse(IS_MALE,1,2)
	vector_of_chosen_classes[ match(ORIG_CLASS,order_matrix[,column]) ] # logic: c(1,2,3)[c(T,F,F)]=1
	}


	# change order of classses
	d <- d %>% mutate(.,class=ifelse(gender=="man",recode_class(class_orig,IS_MALE=TRUE),recode_class(class_orig,IS_MALE=FALSE)))

	#-----------------------------------------------------------------------------
	# Changing the order of the classes - for parameters and predictions


	stratify_coefs_by_gender_helper <- function(COEF_MATRIX,WOMAN_INDICATOR) {
	intercept <- COEF_MATRIX[,"(Intercept)"]+WOMAN_INDICATOR*COEF_MATRIX[,"genderwoman"]
	age <- COEF_MATRIX[,"age"]+WOMAN_INDICATOR*COEF_MATRIX[,"age:genderwoman"]
	age2 <- COEF_MATRIX[,"I(age^2)"] +WOMAN_INDICATOR*COEF_MATRIX[,"I(age^2):genderwoman"]
	t(cbind(intercept,age,age2))
	}

	stratify_coefs_by_gender <- function(COEF_MATRIX) {
	list("male_coefs"=stratify_coefs_by_gender_helper(COEF_MATRIX,0), "female_coefs"=stratify_coefs_by_gender_helper(COEF_MATRIX,1))
	}

	# after this, order original, but seperate estimates for men/women
	parameter_list_by_gender_orig_order <- purrr::map(parameter_list,stratify_coefs_by_gender)
	parameter_list_by_gender_orig_order


	# making a copy of the parameter list that is stratified by gender
	parameter_list_by_gender <- parameter_list_by_gender_orig_order


	# substituting new values based on the order_matrix (defined above)
	for (j in vector_of_chosen_classes) {
	for (i in 1:2) {
	class_name=paste0('class',i)
	coef_gender <- ifelse(i==1,"male_coefs","female_coefs")
	column <- ifelse(i==1,1,2)
	new_index <- which(order_matrix[,column]==j)
	parameter_list_by_gender[[new_index]][[i]] <- parameter_list_by_gender_orig_order[[j]][[i]]
	}
	}
	parameter_list_by_gender_orig_order
	parameter_list_by_gender # now the order is changed


	# we will predict the values based on all combinations of age, age^2 and gender
	prediction_dataframe <- expand.grid(c=1,age=NR_OF_YEARS_VECTOR) # gender is included implicitly (coef_matrices different for both genders)
	prediction_dataframe$age2 <- prediction_dataframe$age^2
	(prediction_matrix <- as.matrix(prediction_dataframe))

	predict_one_gender <- function(COEF_MATRIX,PREDICTION_MATRIX=prediction_matrix) {
	#add vector for the number of rel_stats!!!!!!!!!!!!!!! Obs nb!
	# https://www.ibm.com/support/pages/compute-predicted-probabilities-multinomial-logistic-new-cases-or-outside-spss
	predictions <- PREDICTION_MATRIX %*% COEF_MATRIX
	predictions <- exp(predictions)
	predictions <- cbind(1,predictions) # adding ref category
	predictions <- cbind(predictions,rowSums(predictions))
	colnames(predictions) <- c(levels_rel_stat,"row_sums")
	probabilities <- apply(predictions[,levels_rel_stat],2,function(x) x/predictions[,"row_sums"] )
	return(probabilities)
	}



	predict_one_class <- function(COEF_MATRIX_LIST,PREDICTION_MATRIX=prediction_matrix) {
	male_probabilities <- predict_one_gender(COEF_MATRIX_LIST$male_coefs)
	female_probabilities <- predict_one_gender(COEF_MATRIX_LIST$female_coefs)
	return(list("male_probabilities"=male_probabilities,"female_probabilities"=female_probabilities))
	}



	prediction_by_class_and_gender <- purrr::map(parameter_list_by_gender,predict_one_class)

	# now we have all the probabilities we need, and will add some info and shape the data


	add_gender_to_prediction_matrix <- function(my_list) {
	male_probabilities <- cbind(my_list$male_probabilities,gender=1)
	female_probabilities <- cbind(my_list$female_probabilities,gender=2)
	return(list("male_probabilities"=male_probabilities,"female_probabilities"=female_probabilities))
	}


	combine_male_and_female_prediction <- function(my_list){
	my_list <- purrr::map(my_list,function(x) cbind(x,age=NR_OF_YEARS_VECTOR)) # add age
	my_list <- add_gender_to_prediction_matrix(my_list)
	combined_matrix <- do.call(rbind,my_list)
	return(combined_matrix)
	}


	prediction_by_class <- purrr::map(prediction_by_class_and_gender,
	combine_male_and_female_prediction)

	pivot_predictions_to_long <- function(combined_matrix) {
	combined_tibble <- setNames(as_tibble(combined_matrix,.name_repair = "minimal"),c(levels_rel_stat,"age","gender"))
	long_matrix <- pivot_longer(data=combined_tibble,
	cols = -all_of(c("age","gender")),
	names_to = "rel_stat",
	values_to = "probability")
	return(long_matrix)
	}


	prediction_tibbles_long <- purrr::map(prediction_by_class,
	pivot_predictions_to_long)
	# now we have the predicted data in the right format.

	#--------------------------------------------------------------------------------------------------
	# let us confirm that the results would be identical if we fitted the data automatically and not manually as above.
	# The order will be different

	fitted_lcgm <- fitted(lcgm)
	fitted_tibbles <- lapply(fitted_lcgm, function(x) cbind(x,d$age,d$gender))
	fitted_tibbles <- lapply(fitted_tibbles,function(x) setNames(as_tibble(x,.name_repair = "minimal"),c(levels_rel_stat,"age","gender") ))
	fitted_tibbles_long <-purrr::map(fitted_tibbles, function(x) {
	pivot_longer(data=x,cols = -all_of(c("age","gender")), names_to = "rel_stat", values_to = "probability")
	} )
	fitted_tibbles_long <- purrr::map(fitted_tibbles_long,distinct) # remove duplicate rows


	# helpers for plotting
	text_size <- 12
	title_size <- text_size*1.2
	line_size=1.6
	line_size_legend <- line_size*1.75
	gender_labels <- c(`1`="\U2642",`2`="\U2640")

	rw <- function() {
	randomWords(nr_of_classes)
	}

	(class_titles <- paste(vector_of_chosen_classes,"class:",rw(),rw(),rw()))


	#--------------------------------------------------------------------------------
	# plotting

	plot_single_class_line <- function(CLASS_DATA,TITLE){
	ggplot(CLASS_DATA, aes(x = age,y = probability,color=rel_stat)) +
	geom_line(linewidth=line_size) +
	scale_color_vibrant() +
	guides(color = guide_legend(reverse = TRUE)) +
	# common for both plots +
	ggtitle(TITLE) +
	labs (x = NULL, y=NULL) +
	facet_wrap(~ gender,strip.position="left",labeller=labeller(gender=gender_labels))+
	theme(text = element_text(size=text_size),
	#legend.key.size = unit(1,"pt"),
	legend.title=element_blank(),
	plot.title = element_text(size =title_size,margin=margin(b=5)),
	strip.text.y.left= element_text(angle=0, size=text_size*1.6))
	}

	#plot_single_class_line(fitted_tibbles_long[[1]],TITLE=class_titles[1])

	# saving the plots
	class_plot_list_line<- purrr::map(vector_of_chosen_classes,function(i) plot_single_class_line(fitted_tibbles_long[[i]],TITLE=class_titles[i]))
	class_plot_list_line_prediction <- purrr::map(vector_of_chosen_classes,function(i) plot_single_class_line(prediction_tibbles_long[[i]],TITLE=class_titles[i]))


	# arranging the plot lists with ggarrange
	(plot_lcgm_line <- do.call(ggpubr::ggarrange,c(class_plot_list_line, list(common.legend = TRUE, legend = "bottom",ncol=1))))
	(plot_lcgm_line <- do.call(ggpubr::ggarrange,c(class_plot_list_line, list(common.legend = TRUE, legend = "bottom",ncol=1))))
	ggpubr::annotate_figure(plot_lcgm_line, top =text_grob("Original model", size = 20,face="bold"))
	(plot_lcgm_line_prediction <- do.call(ggpubr::ggarrange,c(class_plot_list_line_prediction, list(common.legend = TRUE, legend = "bottom",ncol=1))))
	ggpubr::annotate_figure(plot_lcgm_line_prediction, top =text_grob("Label switching corrected", size = 20,face="bold"))