nicebread/Ruscio2008_cor_generate.R

## Ruscio2008_cor_generate.R
## This code snippet is taken from Ruscio, J., & Kaczetow, W. (2008). Simulating Multivariate Nonnormal Data Using an Iterative Algorithm. Multivariate Behavioral Research, 43(3), 355–381.

## In the original code were some errors; this is a cleaned version.
## 2012 Felix Schönbrodt


#' @param Pop List with empirical samples
#' @param rho The correlation to be induced

#' @examples
#' Pop <- list(X=rnorm(1000), Y=rlnorm(3000))
#' r1 <- GenData(Pop, rho=.4)
#' plot(r1)
#' cor(r1, method="p")


################################################################################################################
GenData <- function(Pop, rho, N=1000, N.Factors = 0, Max.Trials = 5, Initial.Multiplier = 1, seed = NA)
{
# Initialize variables and (if applicable) set random number seed (step 1) -------------------------------------

	k <- length(Pop)
   Data <- matrix(0, nrow = N, ncol = k)            # Matrix to store the simulated data
   Iteration <- 0                                   # Iteration counter
   Best.RMSR <- 1                                   # Lowest RMSR correlation
   Trials.Without.Improvement <- 0                  # Trial counter
   if (!is.na(seed)) set.seed(seed)                    # If user specified a nonzero seed, set it
	   Distributions <- matrix(NA, nrow=N, ncol=k)

   Target.Corr <- matrix(c(1, rho, rho, 1), nrow=2)

# Generate distribution for each variable (step 2) -------------------------------------------------------------

   for (i in 1:k) {
	   Distributions[, i] <- sort(sample(Pop[[i]], N, replace = TRUE))
   }

#     This implementation of GenData bootstraps each variable's score distribution from a supplied data set.
#     Users should modify this block of the program, as needed, to generate the desired distribution(s).
#
#     For example, to sample from chi-square distributions with 2 df, replace the 2nd line in this block with:
#        Distributions[,i] <- sort(rchisq(N, df = 2))
#
#     Or, one can drop the loop and use a series of commands that samples variables from specified populations:
#        Distributions[,1] <- sort(rnorm(N, 0, 1))          # Standard normal distribution
#        Distributions[,2] <- sort(runif(N, 0, 1))          # Uniform distribution ranging from 0 - 1
#        Distributions[,3] <- sort(rlnorm(N, 0, 1))         # Log-normal distribution, log scale M = 0, SD = 1
#        Distributions[,4] <- sort(rexp(N, rate = 1))       # Exponential distribution with rate = 1
#        Distributions[,5] <- sort(rpois(N, lambda = 4))    # Poisson distribution with lambda = 4
#        Distributions[,6] <- sort(rbinom(N, 10, .25)       # Binominal distribution, size = 10 and p = .25
#        Distributions[,7] <- sort(rbinom(N, 2, .25)        # Binary distribution with p = .25
#
#     All of the commands shown above draw random samples from specified population distributions.  As an
#     alternative, one can reproduce distributions without sampling error.  For example, working with a
#     supplied data set, one can replace the 2nd line in this block with:
#        Disrributions[,i] <- Supplied.Data[,i]
#     Alternatively, idealized distributions can be reproduced.  For example, uniform quantiles can be
#     created and used to generate data from common distributions:
#        Uniform.Quantiles <- seq(from = 0, to = 1, length = (N + 2))[2:(N + 1)]   # quantiles 0, 1 dropped
#        Distributions[,1] <- qnorm(Uniform.Quantiles, 0, 1)      # Standard normal distribution
#        Distributions[,2] <- qunif(Uniform.Quantiles, 0, 1)      # Uniform distribution ranging from 0 to 1
#        Distributions[,3] <- qchisq(Uniform.Quantiles, df = 2)   # Chi-square distribution with 2 df
#
#     Note that when score distributions are generated from specified populations rather than bootstrapped from
#     a supplied data set, the user must provide the target correlation matrix (see the next block).  This is
#     true regardless of whether the distributions incorporate sampling error.

# Calculate and store a copy of the target correlation matrix (step 3) -----------------------------------------

   #Target.Corr <- cor(Supplied.Data)
   Intermediate.Corr <- Target.Corr

#     This implementation of GenData calculates the target correlation matrix from a supplied data set.
#     Alternatively, the user can modify the program to generate data with user-defined sample size, number of
#     variables, and target correlation matrix by redefining the function as follows:
#        GenData <- function(N, k, Target.Corr, N.Factors = 0, Max.Trials = 5, Initial.Multiplier = 1, seed = 0)
#     In this case, one would also remove the program lines that calculate N, k, and Target.Corr.
#     To generate data in which variables are uncorrelated, one would remove the “sort” function from step 2
#        and terminate the program before step 3 begins by returning the Distributions object as the data set.

# If number of latent factors was not specified, determine it through parallel analysis (step 4) ---------------

   if (N.Factors == 0)
   {
      Eigenvalues.Observed <- eigen(Intermediate.Corr)$values
      Eigenvalues.Random <- matrix(0, nrow = 100, ncol = k)
      Random.Data <- matrix(0, nrow = N, ncol = k)
      for (i in 1:100)
      {
         for (j in 1:k)
            Random.Data[,j] <- sample(Distributions[,j], size = N, replace = TRUE)
         Eigenvalues.Random[i,] <- eigen(cor(Random.Data))$values
      }
      Eigenvalues.Random <- apply(Eigenvalues.Random, 2, mean) # calculate mean eigenvalue for each factor
      N.Factors <- max(1, sum(Eigenvalues.Observed > Eigenvalues.Random))
   }

# Generate random normal data for shared and unique components, initialize factor loadings (steps 5, 6) --------

   Shared.Comp <- matrix(rnorm(N * N.Factors, 0, 1), nrow = N, ncol = N.Factors)
   Unique.Comp <- matrix(rnorm(N * k, 0, 1), nrow = N, ncol = k)
   Shared.Load <- matrix(0, nrow = k, ncol = N.Factors)
   Unique.Load <- matrix(0, nrow = k, ncol = 1)

# Begin loop that ends when specified number of iterations pass without improvement in RMSR correlation --------

   while (Trials.Without.Improvement < Max.Trials)
   {
      Iteration <- Iteration + 1

# Calculate factor loadings and apply to reproduce desired correlations (steps 7, 8) ---------------------------

      Fact.Anal <- Factor.Analysis(Intermediate.Corr, Corr.Matrix = TRUE, N.Factors = N.Factors)
      if (N.Factors == 1) {
		  Shared.Load[,1] <- Fact.Anal$loadings
	  } else {
		  Shared.Load <- Fact.Anal$loadings
	  }
      Shared.Load[Shared.Load > 1] <- 1
      Shared.Load[Shared.Load < -1] <- -1
      if (Shared.Load[1,1] < 0) Shared.Load <- Shared.Load * -1
      Shared.Load.sq <- Shared.Load * Shared.Load
      for (i in 1:k)
         if (sum(Shared.Load.sq[i,]) < 1) {
			 Unique.Load[i,1] <- (1 - sum(Shared.Load.sq[i,]))
		 } else {
			 Unique.Load[i,1] <- 0
		 }
      Unique.Load <- sqrt(Unique.Load)

      for (i in 1:k) {
         Data[,i] <- (Shared.Comp %*% t(Shared.Load))[,i] + Unique.Comp[,i] * Unique.Load[i,1]
	 }
      # the %*% operator = matrix multiplication, and the t() function = transpose (both used again in step 13)

	  # Replace normal with nonnormal distributions (step 9) ---------------------------------------------------------

      for (i in 1:k)
      {
         Data <- Data[sort.list(Data[,i]),]
         Data[,i] <- Distributions[,i]
      }

# Calculate RMSR correlation, compare to lowest value, take appropriate action (steps 10, 11, 12) --------------

      Reproduced.Corr <- cor(Data)
      Residual.Corr <- Target.Corr - Reproduced.Corr
      RMSR <- sqrt(sum(Residual.Corr[lower.tri(Residual.Corr)] * Residual.Corr[lower.tri(Residual.Corr)]) /
              (.5 * (k * k - k)))
      if (RMSR < Best.RMSR) {
         Best.RMSR <- RMSR
         Best.Corr <- Intermediate.Corr
         Best.Res <- Residual.Corr
         Intermediate.Corr <- Intermediate.Corr + Initial.Multiplier * Residual.Corr
         Trials.Without.Improvement <- 0
      } else {
         Trials.Without.Improvement <- Trials.Without.Improvement + 1
         Current.Multiplier <- Initial.Multiplier * .5 ^ Trials.Without.Improvement
         Intermediate.Corr <- Best.Corr + Current.Multiplier * Best.Res
      }
   }  # end of the while loop

# Construct the data set with the lowest RMSR correlation (step 13) --------------------------------------------

   Fact.Anal <- Factor.Analysis(Best.Corr, Corr.Matrix = TRUE, N.Factors = N.Factors)
   if (N.Factors == 1) {
	   Shared.Load[,1] <- Fact.Anal$loadings
   } else {
	   Shared.Load <- Fact.Anal$loadings
   }
   Shared.Load[Shared.Load > 1] <- 1
   Shared.Load[Shared.Load < -1] <- -1
   if (Shared.Load[1,1] < 0) {Shared.Load <- Shared.Load * -1}
   Shared.Load.sq <- Shared.Load * Shared.Load
   for (i in 1:k) {
      if (sum(Shared.Load.sq[i,]) < 1) {
		  Unique.Load[i,1] <- (1 - sum(Shared.Load.sq[i,]))
	  } else {
		  Unique.Load[i,1] <- 0
	  }
   }
   Unique.Load <- sqrt(Unique.Load)
   for (i in 1:k) {
	   Data[,i] <- (Shared.Comp %*% t(Shared.Load))[,i] + Unique.Comp[,i] * Unique.Load[i,1]
   }
   Data <- apply(Data, 2, scale) # standardizes each variable in the matrix
   for (i in 1:k)
   {
      Data <- Data[sort.list(Data[,i]),]
      Data[,i] <- Distributions[,i]
   }
   Data <- Data[sample(1:N, N, replace = FALSE), ] # randomize order of cases

# Report the results and return the simulated data set (step 14) -----------------------------------------------

   Iteration <- Iteration - Max.Trials
   cat("\nN =",N,", k =",k,",",Iteration,"Iterations,",N.Factors,"Factors, RMSR r =",round(Best.RMSR,3),"\n")
   return(Data)
}

################################################################################################################
Factor.Analysis <- function(Data, Corr.Matrix = FALSE, Max.Iter = 50, N.Factors = 0)
{
   Data <- as.matrix(Data)
   k <- dim(Data)[2]
   if (N.Factors == 0) N.Factors <- k
   if (!Corr.Matrix) Cor.Matrix <- cor(Data)
   else Cor.Matrix <- Data
   Criterion <- .001
   Old.H2 <- rep(99, k)
   H2 <- rep(0, k)
   Change <- 1
   Iter <- 0
   Factor.Loadings <- matrix(nrow = k, ncol = N.Factors)
   while ((Change >= Criterion) & (Iter < Max.Iter))
   {
      Iter <- Iter + 1
      Eig <- eigen(Cor.Matrix)
      L <- sqrt(Eig$values[1:N.Factors])
      for (i in 1:N.Factors)
         Factor.Loadings[,i] <- Eig$vectors[,i] * L[i]
      for (i in 1:k)
         H2[i] <- sum(Factor.Loadings[i,] * Factor.Loadings[i,])
      Change <- max(abs(Old.H2 - H2))
      Old.H2 <- H2
      diag(Cor.Matrix) <- H2
   }
   if (N.Factors == k) N.Factors <- sum(Eig$values > 1)
   return(list(loadings = Factor.Loadings[,1:N.Factors], factors = N.Factors))
}
	## This code snippet is taken from Ruscio, J., & Kaczetow, W. (2008). Simulating Multivariate Nonnormal Data Using an Iterative Algorithm. Multivariate Behavioral Research, 43(3), 355–381.

	## In the original code were some errors; this is a cleaned version.
	## 2012 Felix Schönbrodt



	#' @param Pop List with empirical samples
	#' @param rho The correlation to be induced

	#' @examples
	#' Pop <- list(X=rnorm(1000), Y=rlnorm(3000))
	#' r1 <- GenData(Pop, rho=.4)
	#' plot(r1)
	#' cor(r1, method="p")



	################################################################################################################
	GenData <- function(Pop, rho, N=1000, N.Factors = 0, Max.Trials = 5, Initial.Multiplier = 1, seed = NA)
	{
	# Initialize variables and (if applicable) set random number seed (step 1) -------------------------------------

	k <- length(Pop)
	Data <- matrix(0, nrow = N, ncol = k) # Matrix to store the simulated data
	Iteration <- 0 # Iteration counter
	Best.RMSR <- 1 # Lowest RMSR correlation
	Trials.Without.Improvement <- 0 # Trial counter
	if (!is.na(seed)) set.seed(seed) # If user specified a nonzero seed, set it
	Distributions <- matrix(NA, nrow=N, ncol=k)

	Target.Corr <- matrix(c(1, rho, rho, 1), nrow=2)

	# Generate distribution for each variable (step 2) -------------------------------------------------------------

	for (i in 1:k) {
	Distributions[, i] <- sort(sample(Pop[[i]], N, replace = TRUE))
	}

	# This implementation of GenData bootstraps each variable's score distribution from a supplied data set.
	# Users should modify this block of the program, as needed, to generate the desired distribution(s).
	#
	# For example, to sample from chi-square distributions with 2 df, replace the 2nd line in this block with:
	# Distributions[,i] <- sort(rchisq(N, df = 2))
	#
	# Or, one can drop the loop and use a series of commands that samples variables from specified populations:
	# Distributions[,1] <- sort(rnorm(N, 0, 1)) # Standard normal distribution
	# Distributions[,2] <- sort(runif(N, 0, 1)) # Uniform distribution ranging from 0 - 1
	# Distributions[,3] <- sort(rlnorm(N, 0, 1)) # Log-normal distribution, log scale M = 0, SD = 1
	# Distributions[,4] <- sort(rexp(N, rate = 1)) # Exponential distribution with rate = 1
	# Distributions[,5] <- sort(rpois(N, lambda = 4)) # Poisson distribution with lambda = 4
	# Distributions[,6] <- sort(rbinom(N, 10, .25) # Binominal distribution, size = 10 and p = .25
	# Distributions[,7] <- sort(rbinom(N, 2, .25) # Binary distribution with p = .25
	#
	# All of the commands shown above draw random samples from specified population distributions. As an
	# alternative, one can reproduce distributions without sampling error. For example, working with a
	# supplied data set, one can replace the 2nd line in this block with:
	# Disrributions[,i] <- Supplied.Data[,i]
	# Alternatively, idealized distributions can be reproduced. For example, uniform quantiles can be
	# created and used to generate data from common distributions:
	# Uniform.Quantiles <- seq(from = 0, to = 1, length = (N + 2))[2:(N + 1)] # quantiles 0, 1 dropped
	# Distributions[,1] <- qnorm(Uniform.Quantiles, 0, 1) # Standard normal distribution
	# Distributions[,2] <- qunif(Uniform.Quantiles, 0, 1) # Uniform distribution ranging from 0 to 1
	# Distributions[,3] <- qchisq(Uniform.Quantiles, df = 2) # Chi-square distribution with 2 df
	#
	# Note that when score distributions are generated from specified populations rather than bootstrapped from
	# a supplied data set, the user must provide the target correlation matrix (see the next block). This is
	# true regardless of whether the distributions incorporate sampling error.

	# Calculate and store a copy of the target correlation matrix (step 3) -----------------------------------------

	#Target.Corr <- cor(Supplied.Data)
	Intermediate.Corr <- Target.Corr

	# This implementation of GenData calculates the target correlation matrix from a supplied data set.
	# Alternatively, the user can modify the program to generate data with user-defined sample size, number of
	# variables, and target correlation matrix by redefining the function as follows:
	# GenData <- function(N, k, Target.Corr, N.Factors = 0, Max.Trials = 5, Initial.Multiplier = 1, seed = 0)
	# In this case, one would also remove the program lines that calculate N, k, and Target.Corr.
	# To generate data in which variables are uncorrelated, one would remove the “sort” function from step 2
	# and terminate the program before step 3 begins by returning the Distributions object as the data set.

	# If number of latent factors was not specified, determine it through parallel analysis (step 4) ---------------

	if (N.Factors == 0)
	{
	Eigenvalues.Observed <- eigen(Intermediate.Corr)$values
	Eigenvalues.Random <- matrix(0, nrow = 100, ncol = k)
	Random.Data <- matrix(0, nrow = N, ncol = k)
	for (i in 1:100)
	{
	for (j in 1:k)
	Random.Data[,j] <- sample(Distributions[,j], size = N, replace = TRUE)
	Eigenvalues.Random[i,] <- eigen(cor(Random.Data))$values
	}
	Eigenvalues.Random <- apply(Eigenvalues.Random, 2, mean) # calculate mean eigenvalue for each factor
	N.Factors <- max(1, sum(Eigenvalues.Observed > Eigenvalues.Random))
	}

	# Generate random normal data for shared and unique components, initialize factor loadings (steps 5, 6) --------

	Shared.Comp <- matrix(rnorm(N * N.Factors, 0, 1), nrow = N, ncol = N.Factors)
	Unique.Comp <- matrix(rnorm(N * k, 0, 1), nrow = N, ncol = k)
	Shared.Load <- matrix(0, nrow = k, ncol = N.Factors)
	Unique.Load <- matrix(0, nrow = k, ncol = 1)

	# Begin loop that ends when specified number of iterations pass without improvement in RMSR correlation --------

	while (Trials.Without.Improvement < Max.Trials)
	{
	Iteration <- Iteration + 1

	# Calculate factor loadings and apply to reproduce desired correlations (steps 7, 8) ---------------------------

	Fact.Anal <- Factor.Analysis(Intermediate.Corr, Corr.Matrix = TRUE, N.Factors = N.Factors)
	if (N.Factors == 1) {
	Shared.Load[,1] <- Fact.Anal$loadings
	} else {
	Shared.Load <- Fact.Anal$loadings
	}
	Shared.Load[Shared.Load > 1] <- 1
	Shared.Load[Shared.Load < -1] <- -1
	if (Shared.Load[1,1] < 0) Shared.Load <- Shared.Load * -1
	Shared.Load.sq <- Shared.Load * Shared.Load
	for (i in 1:k)
	if (sum(Shared.Load.sq[i,]) < 1) {
	Unique.Load[i,1] <- (1 - sum(Shared.Load.sq[i,]))
	} else {
	Unique.Load[i,1] <- 0
	}
	Unique.Load <- sqrt(Unique.Load)

	for (i in 1:k) {
	Data[,i] <- (Shared.Comp %% t(Shared.Load))[,i] + Unique.Comp[,i] Unique.Load[i,1]
	}
	# the %*% operator = matrix multiplication, and the t() function = transpose (both used again in step 13)

	# Replace normal with nonnormal distributions (step 9) ---------------------------------------------------------

	for (i in 1:k)
	{
	Data <- Data[sort.list(Data[,i]),]
	Data[,i] <- Distributions[,i]
	}

	# Calculate RMSR correlation, compare to lowest value, take appropriate action (steps 10, 11, 12) --------------

	Reproduced.Corr <- cor(Data)
	Residual.Corr <- Target.Corr - Reproduced.Corr
	RMSR <- sqrt(sum(Residual.Corr[lower.tri(Residual.Corr)] * Residual.Corr[lower.tri(Residual.Corr)]) /
	(.5 * (k * k - k)))
	if (RMSR < Best.RMSR) {
	Best.RMSR <- RMSR
	Best.Corr <- Intermediate.Corr
	Best.Res <- Residual.Corr
	Intermediate.Corr <- Intermediate.Corr + Initial.Multiplier * Residual.Corr
	Trials.Without.Improvement <- 0
	} else {
	Trials.Without.Improvement <- Trials.Without.Improvement + 1
	Current.Multiplier <- Initial.Multiplier * .5 ^ Trials.Without.Improvement
	Intermediate.Corr <- Best.Corr + Current.Multiplier * Best.Res
	}
	} # end of the while loop

	# Construct the data set with the lowest RMSR correlation (step 13) --------------------------------------------

	Fact.Anal <- Factor.Analysis(Best.Corr, Corr.Matrix = TRUE, N.Factors = N.Factors)
	if (N.Factors == 1) {
	Shared.Load[,1] <- Fact.Anal$loadings
	} else {
	Shared.Load <- Fact.Anal$loadings
	}
	Shared.Load[Shared.Load > 1] <- 1
	Shared.Load[Shared.Load < -1] <- -1
	if (Shared.Load[1,1] < 0) {Shared.Load <- Shared.Load * -1}
	Shared.Load.sq <- Shared.Load * Shared.Load
	for (i in 1:k) {
	if (sum(Shared.Load.sq[i,]) < 1) {
	Unique.Load[i,1] <- (1 - sum(Shared.Load.sq[i,]))
	} else {
	Unique.Load[i,1] <- 0
	}
	}
	Unique.Load <- sqrt(Unique.Load)
	for (i in 1:k) {
	Data[,i] <- (Shared.Comp %% t(Shared.Load))[,i] + Unique.Comp[,i] Unique.Load[i,1]
	}
	Data <- apply(Data, 2, scale) # standardizes each variable in the matrix
	for (i in 1:k)
	{
	Data <- Data[sort.list(Data[,i]),]
	Data[,i] <- Distributions[,i]
	}
	Data <- Data[sample(1:N, N, replace = FALSE), ] # randomize order of cases

	# Report the results and return the simulated data set (step 14) -----------------------------------------------

	Iteration <- Iteration - Max.Trials
	cat("\nN =",N,", k =",k,",",Iteration,"Iterations,",N.Factors,"Factors, RMSR r =",round(Best.RMSR,3),"\n")
	return(Data)
	}

	################################################################################################################
	Factor.Analysis <- function(Data, Corr.Matrix = FALSE, Max.Iter = 50, N.Factors = 0)
	{
	Data <- as.matrix(Data)
	k <- dim(Data)[2]
	if (N.Factors == 0) N.Factors <- k
	if (!Corr.Matrix) Cor.Matrix <- cor(Data)
	else Cor.Matrix <- Data
	Criterion <- .001
	Old.H2 <- rep(99, k)
	H2 <- rep(0, k)
	Change <- 1
	Iter <- 0
	Factor.Loadings <- matrix(nrow = k, ncol = N.Factors)
	while ((Change >= Criterion) & (Iter < Max.Iter))
	{
	Iter <- Iter + 1
	Eig <- eigen(Cor.Matrix)
	L <- sqrt(Eig$values[1:N.Factors])
	for (i in 1:N.Factors)
	Factor.Loadings[,i] <- Eig$vectors[,i] * L[i]
	for (i in 1:k)
	H2[i] <- sum(Factor.Loadings[i,] * Factor.Loadings[i,])
	Change <- max(abs(Old.H2 - H2))
	Old.H2 <- H2
	diag(Cor.Matrix) <- H2
	}
	if (N.Factors == k) N.Factors <- sum(Eig$values > 1)
	return(list(loadings = Factor.Loadings[,1:N.Factors], factors = N.Factors))
	}