# Simulation code with memory leak
# Gist by @rubenarslan, created October 15, 2024
# Load necessary libraries for data manipulation, visualization, and parallel processing
library(tidyverse) # Provides a collection of R packages for data manipulation
library(furrr) # Parallel processing using purrr
library(dtplyr) # Translate dplyr pipelines to data.table for speed
# Set up parallel processing plan with 20 workers using multisession
plan(multisession(workers = 20))
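# Optional sanity check (not required for the simulation): confirm the machine
# actually has enough cores for 20 multisession workers before committing to them.
future::availableCores()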
# Check current memory usage
pryr::mem_used()
# Set random seed for reproducibility
set.seed(05102019)
# Create a grid of parameters for simulations
parameters <- expand_grid(
  Iteration = 1,
  N = 500, # Number of individuals
  num_traits = 35, # Number of traits/preferences
  trait_pref_correlation = c(0, 0.4),
  trait_mean_weight = c(0, 0.3, 0.5),
  pref_mean_weight = c(0, 0.3, 0.5),
  great_partner_weight = c(0, 0.3, 0.5),
  high_standards_weight = c(0, 0.3, 0.5),
  trait_intercept = c(-1, 0, 1),
  pref_intercept = c(-1, 0, 1),
  normative_pref_strength_fun = c("rexp", "logrnorm"),
  normative_pref_strength_rate = 1,
  random_responding_sd = c(0, 1, 2),
  likert_noise_sd = c(0.1, 0.3),
  true_liking_filter = c(-Inf, 0, 2)
) %>%
  arrange(runif(n())) # Randomize the order of parameter combinations
# Display the number of parameter combinations
nrow(parameters)
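# Optional: draw a small random subset of the grid (the name parameters_dryrun is
# arbitrary) for a quick dry run before committing to the full sweep.
parameters_dryrun <- parameters %>% slice_sample(n = 10)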
# Parameters modeled on the original data
p <- list(
  N = 5000,
  num_traits = 35,
  trait_intercept = 0,
  pref_intercept = 0,
  trait_pref_correlation = 0.4,
  trait_mean_weight = 0.5,
  pref_mean_weight = 0.5,
  great_partner_weight = 0.3,
  high_standards_weight = 0.3,
  normative_pref_strength_fun = "rexp",
  normative_pref_strength_rate = 1.5,
  random_responding_sd = 2,
  likert_noise_sd = 0.5,
  true_liking_filter = -Inf
)
# Function to extract the upper triangle of a matrix
get_upper_tri <- function(x) {
  x[upper.tri(x)]
}
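# Quick illustration (toy data): for a 3-variable correlation matrix,
# get_upper_tri() returns its 3 unique pairwise correlations.
get_upper_tri(cor(matrix(rnorm(300), ncol = 3)))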
# Function to run a single iteration of the simulation
run_simulation <- function(...) {
  # Convert input parameters to a list
  p <- list(...)
  # Left-skewed alternative to rexp: log of the absolute value of a normal draw
  logrnorm <- function(n, rate) {
    log(abs(rnorm(n, mean = 0, sd = rate)))
  }
  # Convert continuous values to an 11-point Likert scale:
  # add noise with SD likert_noise_sd * exp(noise_sd), rescale, round, clamp to 1-11
  to_likert_11 <- function(x, noise_sd) {
    x <- round(6 + 4 * (x + p$likert_noise_sd * rnorm(length(x), sd = exp(noise_sd))))
    x[x < 1] <- 1
    x[x > 11] <- 11
    x
  }
  # Drop the matrix attributes that base::scale() returns, keep a plain vector
  scale <- function(...) { base::scale(...)[,1] }
  # Inflate N so that, in expectation, p$N individuals remain after the
  # true-liking filter (e.g. with true_liking_filter = 0, about half are dropped)
  initial_N <- round(p$N / (1 - pnorm(p$true_liking_filter)))
  # Generate data frame with individual traits and preferences
  df <- tibble(
    individual_id = rep(1:initial_N, each = p$num_traits),
    trait_id = rep(1:p$num_traits, times = initial_N),
    # Generate random responding noise for each individual
    random_responding = rep(rnorm(initial_N, sd = p$random_responding_sd), each = p$num_traits),
    # Generate trait means for each trait
    trait_mean = rep(rnorm(p$num_traits), times = initial_N),
    # Generate individual-level variables
    great_partner = rep(rnorm(initial_N, mean = 0), each = p$num_traits),
    high_standards = rep(rnorm(initial_N, mean = 0), each = p$num_traits),
    # Generate preference means influenced by trait means
    pref_mean = rep(rnorm(p$num_traits), times = initial_N) + trait_mean,
    # Generate normative preference strengths
    normative_pref_strength = rep(get(p$normative_pref_strength_fun)(p$num_traits, rate = p$normative_pref_strength_rate), times = initial_N)
  ) %>%
    lazy_dt() %>%
    # Generate trait and preference values
    mutate(
      trait_value = p$trait_intercept +
        p$trait_mean_weight * trait_mean +
        p$great_partner_weight * great_partner +
        rnorm(n()), # Add random noise
      pref_value = p$pref_intercept +
        p$high_standards_weight * high_standards +
        p$pref_mean_weight * pref_mean +
        p$trait_pref_correlation * trait_value +
        rnorm(n()) # Add random noise
    ) %>%
    # Convert trait and preference values to Likert scale
    group_by(trait_id) %>%
    mutate(
      trait_likert = to_likert_11(trait_value, random_responding),
      pref_likert = to_likert_11(pref_value, random_responding)
    ) %>%
    ungroup()
  # Compute true liking scores for each individual
  liking <- df %>%
    group_by(individual_id) %>%
    summarise(
      random_responding = first(random_responding),
      true_liking = sum(trait_value * normative_pref_strength) / sqrt(p$num_traits)
    ) %>%
    # Standardize true liking scores
    mutate(true_liking = scale(true_liking)) %>%
    # Filter individuals based on true liking threshold
    filter(true_liking > p$true_liking_filter) %>%
    # Convert true liking to Likert scale
    mutate(liking_likert = to_likert_11(true_liking/2, (random_responding/3)))
  # Subset df to only include individuals who passed the true liking filter
  df <- liking %>%
    select(individual_id) %>%
    left_join(df, by = "individual_id") %>%
    collect()
  # Compute summary statistics for traits and preferences
  traits <- df %>%
    group_by(trait_id) %>%
    summarise(
      trait_mean = mean(trait_likert, na.rm = TRUE),
      trait_sd = sd(trait_likert, na.rm = TRUE),
      pref_mean = mean(pref_likert, na.rm = TRUE),
      pref_sd = sd(pref_likert, na.rm = TRUE)
    )
  # Reshape data to wide format for correlation analysis
  traits_prefs_wide <- df %>%
    select(individual_id, trait_id, trait_likert, pref_likert) %>%
    pivot_wider(names_from = trait_id, values_from = c(trait_likert, pref_likert)) %>%
    collect()
  # Compute average correlations among preferences and traits
  avg_pref_cor <- traits_prefs_wide %>%
    select(starts_with("pref_likert_")) %>%
    cor() %>%
    get_upper_tri()
  avg_trait_cor <- traits_prefs_wide %>%
    select(starts_with("trait_likert_")) %>%
    cor() %>%
    get_upper_tri()
  # Compute pattern metrics
  pattern_metric <- df %>%
    group_by(trait_id) %>%
    mutate(
      trait_mean = mean(trait_likert, na.rm = TRUE),
      trait_norm = trait_likert - trait_mean,
      pref_mean = mean(pref_likert, na.rm = TRUE),
      pref_norm = pref_likert - pref_mean
    ) %>%
    group_by(individual_id) %>%
    summarise(
      corrected_pattern_metric = cor(trait_norm, pref_norm, use = 'pairwise.complete.obs')[1],
      norm_pattern_metric = cor(trait_likert, pref_mean, use = 'pairwise.complete.obs')[1],
      trait_mean = mean(trait_likert),
      pref_mean = mean(pref_likert),
      raw_pattern_metric = cor(trait_likert, pref_likert, use = 'pairwise.complete.obs')[1]
    ) %>%
    # Apply Fisher's z-transformation to correlation coefficients
    mutate(
      corrected_pattern_metric = psych::fisherz(corrected_pattern_metric),
      norm_pattern_metric = psych::fisherz(norm_pattern_metric),
      raw_pattern_metric = psych::fisherz(raw_pattern_metric)
    ) %>%
    # Handle infinite values resulting from Fisher transformation
    mutate(
      raw_pattern_metric = if_else(is.finite(raw_pattern_metric), raw_pattern_metric, NA_real_),
      norm_pattern_metric = if_else(is.finite(norm_pattern_metric), norm_pattern_metric, NA_real_),
      corrected_pattern_metric = if_else(is.finite(corrected_pattern_metric), corrected_pattern_metric, NA_real_)
    )
  # Combine pattern metrics with liking scores and wide data
  wide_df <- pattern_metric %>%
    left_join(liking %>% collect(), by = "individual_id") %>%
    left_join(traits_prefs_wide, by = "individual_id")
  # Linear models to compute incremental variance explained by corrected pattern metric
  m1 <- lm(liking_likert ~ ., wide_df %>% select(liking_likert, starts_with("trait_likert_")))
  m2 <- lm(liking_likert ~ ., wide_df %>% select(liking_likert, corrected_pattern_metric, starts_with("trait_likert_")))
  incremental_var_explained_by_cpm <- diff(sapply(list(m1, m2), function(x) summary(x)$adj.r.squared))
  # Compile results
  results <- cbind(
    as_tibble(p), # Return parameters
    final_N = nrow(wide_df),
    # Proportion of responses at ceiling for preferences and traits
    df %>% summarise(
      pref_at_ceiling = mean(pref_likert == 11),
      trait_at_ceiling = mean(trait_likert == 11)
    ),
    wide_df %>% summarise(
      liking_at_ceiling = mean(liking_likert == 11),
      liking_at_floor = mean(liking_likert == 1)
    ),
    # Correlation between random responding and pattern metrics
    df %>%
      select(individual_id, random_responding) %>%
      left_join(pattern_metric, by = "individual_id") %>%
      summarise(
        cor_random_responding_cpm = cor(random_responding, corrected_pattern_metric, use = 'pairwise.complete.obs'),
        cor_random_responding_rpm = cor(random_responding, raw_pattern_metric, use = 'pairwise.complete.obs')
      ),
    # Average correlation between preferences and traits across traits
    df %>%
      group_by(trait_id) %>%
      summarise(avg_pref_trait_cor = cor(pref_likert, trait_likert, use = 'pairwise.complete.obs')) %>%
      summarise(avg_pref_trait_cor = mean(avg_pref_trait_cor, na.rm = TRUE)),
    # Standard deviations of trait and preference means
    traits %>% summarise(
      sd_norm_prefs = sd(pref_mean),
      sd_traits = sd(trait_mean)
    ),
    # Mean and SD of correlations among traits and preferences
    tibble(
      trait_cor_mean = mean(avg_trait_cor, na.rm = TRUE),
      trait_cor_sd = sd(avg_trait_cor, na.rm = TRUE),
      pref_cor_mean = mean(avg_pref_cor, na.rm = TRUE),
      pref_cor_sd = sd(avg_pref_cor, na.rm = TRUE)
    ),
    # Incremental variance explained by corrected pattern metric
    incremental_var_explained_by_cpm = incremental_var_explained_by_cpm,
    pattern_metric %>%
      summarise(
        cor_corrected_trait_mean = cor(corrected_pattern_metric, trait_mean, use = 'pairwise.complete.obs')
      ),
    # Correlations between pattern metrics and liking
    wide_df %>%
      summarise(
        avg_cor_corrected = mean(corrected_pattern_metric, na.rm = TRUE),
        cor_corrected_latent = cor(corrected_pattern_metric, true_liking, use = 'pairwise.complete.obs'),
        cor_corrected = cor(corrected_pattern_metric, liking_likert, use = 'pairwise.complete.obs'),
        cor_test_p = broom::tidy(cor.test(corrected_pattern_metric, liking_likert))$p.value[1],
        cor_raw = cor(raw_pattern_metric, liking_likert, use = 'pairwise.complete.obs'),
        cor_norm = cor(norm_pattern_metric, liking_likert, use = 'pairwise.complete.obs')
      )
  )
  # Append this run's results to a CSV (write the header only if the file does not yet exist)
  write.table(results, file = "sim_results.csv", sep = ",",
              row.names = FALSE, col.names = !file.exists("sim_results.csv"), append = TRUE)
  # Return the results row
  as.data.frame(results)
}
# Test the run_simulation function with parameter set p
system.time({
  p %>% pmap(run_simulation)
})
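# Optional leak check (a sketch; p_small is just a downsized copy of p): repeat a
# small run several times and watch whether pryr::mem_used() keeps climbing.
# Note that each run appends one more row to sim_results.csv.
p_small <- modifyList(p, list(N = 500))
mem_trace <- purrr::map_dbl(1:5, function(i) {
  p_small %>% pmap(run_simulation)
  as.numeric(pryr::mem_used())
})
mem_trace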
# Function to safely run simulation and capture errors
safe_run_simulation <- function(...) {
  tryCatch({
    result <- run_simulation(...)
    cbind(success = TRUE, result)
  }, error = function(e) {
    cbind(success = FALSE, as_tibble(list(...)))
  })
}
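# Optional smoke test (a sketch): run the wrapper on the first parameter row to
# check that it returns a single bindable row (this also appends to sim_results.csv).
parameters %>% slice(1) %>% pmap(safe_run_simulation)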
# Check memory usage before simulations
pryr::mem_used()
# Run simulations over the parameter grid
# (note: purrr::pmap runs sequentially; to actually use the multisession workers
# set up above, swap in furrr::future_pmap and re-enable the .options seed line)
SimulationResults <- parameters %>%
  pmap(
    safe_run_simulation,
    .progress = TRUE
    # .options = furrr_options(seed = 20191005)
  ) %>%
  # Combine all results into a single data frame
  bind_rows()
# Check memory usage after simulations
pryr::mem_used()
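# Quick checks on the combined results (sketch): how many parameter combinations
# errored out inside safe_run_simulation(), and the row count of the on-disk copy
# (the name sim_results_disk is arbitrary).
SimulationResults %>% count(success)
sim_results_disk <- read.csv("sim_results.csv")
nrow(sim_results_disk)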