Skip to content

Instantly share code, notes, and snippets.

@GabrielHoffman
Last active July 22, 2020 14:37
Show Gist options
  • Save GabrielHoffman/c9719d5545bf931ba67dbabf839c9f64 to your computer and use it in GitHub Desktop.
Save GabrielHoffman/c9719d5545bf931ba67dbabf839c9f64 to your computer and use it in GitHub Desktop.
#' @title Sparse Principal Component Analysis (spca).
#
#' @description Implementation of SPCA, using variable projection as an optimization strategy.
#
#' @details
#' Adapted from sparsepca package to use initial values to speed up convergence.
#'
#' Sparse principal component analysis is a modern variant of PCA. Specifically, SPCA attempts to find sparse
#' weight vectors (loadings), i.e., a weight vector with only a few 'active' (nonzero) values. This approach
#' leads to an improved interpretability of the model, because the principal components are formed as a
#' linear combination of only a few of the original variables. Further, SPCA avoids overfitting in a
#' high-dimensional data setting where the number of variables \eqn{p} is greater than the number of
#' observations \eqn{n}.
#'
#' Such a parsimonious model is obtained by introducing prior information like sparsity promoting regularizers.
#' More concretely, given an \eqn{(n,p)} data matrix \eqn{X}, SPCA attempts to minimize the following
#' objective function:
#'
#' \deqn{ f(A,B) = \frac{1}{2} \| X - X B A^\top \|^2_F + \psi(B) }
#'
#' where \eqn{B} is the sparse weight (loadings) matrix and \eqn{A} is an orthonormal matrix.
#' \eqn{\psi} denotes a sparsity inducing regularizer such as the LASSO (\eqn{\ell_1}{l1} norm) or the elastic net
#' (a combination of the \eqn{\ell_1}{l1} and \eqn{\ell_2}{l2} norm). The principal components \eqn{Z} are formed as
#'
#' \deqn{ Z = X B }{Z = X * B}
#'
#' and the data can be approximately rotated back as
#'
#' \deqn{ \tilde{X} = Z A^\top }{ X = Z t(A)}
#'
#' The print and summary method can be used to present the results in a nice format.
#'
#' @param X array_like; \cr
#' a real \eqn{(n, p)} input matrix (or data frame) to be decomposed.
#'
#' @param k integer; \cr
#' specifies the target rank, i.e., the number of components to be computed.
#'
#' @param alpha float; \cr
#' Sparsity controlling parameter. Higher values lead to sparser components.
#'
#' @param beta float; \cr
#' Amount of ridge shrinkage to apply in order to improve conditioning.
#'
#' @param center bool; \cr
#' logical value which indicates whether the variables should be
#' shifted to be zero centered (TRUE by default).
#'
#' @param scale bool; \cr
#' logical value which indicates whether the variables should
#' be scaled to have unit variance (FALSE by default).
#'
#' @param max_iter integer; \cr
#' maximum number of iterations to perform before exiting.
#'
#' @param tol float; \cr
#' stopping tolerance for the convergence criterion.
#'
#' @param verbose bool; \cr
#' logical value which indicates whether progress is printed.
#'
#' @param init initial values for parameter estimates used for "warm start" to accelerate convergence.
#'
#'
#'@return \code{spca} returns a list containing the following three components:
#'\item{loadings}{ array_like; \cr
#' sparse loadings (weight) vector; \eqn{(p, k)} dimensional array.
#'}
#'
#'\item{transform}{ array_like; \cr
#' the approximated inverse transform; \eqn{(p, k)} dimensional array.
#'}
#'
#'\item{scores}{ array_like; \cr
#' the principal component scores; \eqn{(n, k)} dimensional array.
#'}
#'
#'\item{eigenvalues}{ array_like; \cr
#' the approximated eigenvalues; \eqn{(k)} dimensional array.
#'}
#'
#'\item{center, scale}{ array_like; \cr
#' the centering and scaling used.
#'}
#'
#'
#' @references
#' \itemize{
#'
#' \item [1] N. B. Erichson, P. Zheng, K. Manohar, S. Brunton, J. N. Kutz, A. Y. Aravkin.
#' "Sparse Principal Component Analysis via Variable Projection."
#' Submitted to IEEE Journal of Selected Topics on Signal Processing (2018).
#' (available at arXiv: \url{https://arxiv.org/abs/1804.00341}).
#' }
#'
#' @author N. Benjamin Erichson, Peng Zheng, and Sasha Aravkin. Very slightly adapted by Gabriel Hoffman
#'
#' @seealso \code{\link{spca_path}}
#'
#' @examples
#'
#' # Create artificial data
#' m <- 10000
#' V1 <- rnorm(m, 0, 290)
#' V2 <- rnorm(m, 0, 300)
#' V3 <- -0.1*V1 + 0.1*V2 + rnorm(m,0,100)
#'
#' X <- cbind(V1,V1,V1,V1, V2,V2,V2,V2, V3,V3)
#' X <- X + matrix(rnorm(length(X),0,1), ncol = ncol(X), nrow = nrow(X))
#'
#' # Compute SPCA
#' out <- .spca(X, k=3, alpha=1e-3, beta=1e-3, center = TRUE, scale = FALSE, verbose=0)
#' print(out)
#' summary(out)
#'
.spca <- function(X, k=NULL, alpha=1e-4, beta=1e-4, center=TRUE, scale=FALSE, max_iter=1000, tol=1e-5, verbose=TRUE, init=NULL){
# Sparse PCA via variable projection (Erichson et al., 2018), adapted from the
# sparsepca package so that `init` (a previous fit's $loadings and $transform)
# can warm-start the solver. Minimizes
#   0.5*||X - X B t(A)||_F^2 + alpha*||B||_1 + 0.5*beta*||B||_F^2
# over a sparse loadings matrix B (p x k) and an orthonormal matrix A.
# Returns a list of class "spca" with: loadings (B), transform (A),
# scores (X %*% B), eigenvalues, objective (per-iteration trace), sdev, var,
# and the centering/scaling applied.
X <- as.matrix(X)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Checks
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Rows containing any NA are dropped entirely (listwise deletion).
if (any(is.na(X))) {
warning("Missing values are omitted: na.omit(X).")
X <- stats::na.omit(X)
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Init rpca object
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
spcaObj = list(loadings = NULL,
transform = NULL,
scores = NULL,
eigenvalues = NULL,
center = center,
scale = scale)
n <- nrow(X)
p <- ncol(X)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Set target rank
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# k defaults to, and is silently clamped at, the maximum possible rank.
if(is.null(k)) k <- min(n,p)
if(k > min(n,p)) k <- min(n,p)
if(k<1) stop("Target rank is not valid!")
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Center/Scale data
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# When requested, the column means / column scales actually used are stored
# on the returned object; otherwise the slot is left as FALSE.
if(center == TRUE) {
spcaObj$center <- colMeans(X)
X <- sweep(X, MARGIN = 2, STATS = spcaObj$center, FUN = "-", check.margin = TRUE)
} else { spcaObj$center <- FALSE }
if(scale == TRUE) {
# Root-mean-square of each (already centered, if center=TRUE) column,
# with an n-1 denominator. Near-zero scales are clamped to 1 to avoid
# division blow-up; the complex branch handles complex-valued X.
spcaObj$scale <- sqrt(colSums(X**2) / (n-1))
if(is.complex(spcaObj$scale)) { spcaObj$scale[Re(spcaObj$scale) < 1e-8 ] <- 1+0i
} else {spcaObj$scale[spcaObj$scale < 1e-8] <- 1}
X <- sweep(X, MARGIN = 2, STATS = spcaObj$scale, FUN = "/", check.margin = TRUE)
} else { spcaObj$scale <- FALSE }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Compute SVD for initialization of the Variable Projection Solver
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# A single SVD of X is taken up front; all subsequent iterations work in the
# right-singular-vector basis (V, d) rather than touching X again.
svd_init <- svd(X)
Dmax <- svd_init$d[1] # l2 norm
if( is.null(init) ){
# Cold start: both A and B begin at the leading k right singular vectors.
A <- svd_init$v[,1:k]
B <- svd_init$v[,1:k]
V <- svd_init$v
}else{
# Warm start: reuse the loadings/transform from a previous fit (e.g. the
# previous alpha on a regularization path) to accelerate convergence.
# spcaObj$loadings <- B
# spcaObj$transform <- A
B <- init$loadings
A <- init$transform
V <- svd_init$v
}
# VD = V diag(d) and VD2 = V diag(d^2); with these, X'X = VD2 %*% t(V) and
# the gradient/residual below never need X itself.
VD = sweep(V, MARGIN = 2, STATS = svd_init$d, FUN = "*", check.margin = TRUE)
VD2 = sweep(V, MARGIN = 2, STATS = svd_init$d**2, FUN = "*", check.margin = TRUE)
#--------------------------------------------------------------------
# Set Tuning Parameters
#--------------------------------------------------------------------
# alpha/beta are rescaled by the squared spectral norm so the user-facing
# values are comparable across data sets. nu is the proximal-gradient step
# size 1/(Dmax^2 + beta); kappa is the resulting soft-threshold level.
alpha <- alpha * Dmax**2
beta <- beta * Dmax**2
nu <- 1.0 / (Dmax**2 + beta)
kappa <- nu * alpha
obj <- c()
improvement <- Inf
#--------------------------------------------------------------------
# Apply Variable Projection Solver
#--------------------------------------------------------------------
# Alternate until the relative objective decrease falls below `tol` or
# `max_iter` is reached:
#   1) A-step: orthogonal Procrustes — given B, the optimal orthonormal A is
#      U t(V') from the SVD of X'X B.
#   2) B-step: one proximal gradient step on the smooth part, followed by
#      l1 soft-thresholding at level kappa.
noi <- 1
while (noi <= max_iter && improvement > tol) {
# Update A: X'XB = UDV'
# Z <- VD2 %*% (t(V) %*% B)
Z <- VD2 %*% crossprod(V, B)
svd_update <- svd(Z)
# A <- svd_update$u %*% t(svd_update$v)
A <- tcrossprod(svd_update$u, svd_update$v)
# Proximal Gradient Descent to Update B:
# grad <- VD2 %*% (t(V) %*% (A - B)) - beta * B
grad <- VD2 %*% crossprod(V, A - B) - beta * B
B_temp <- B + nu * grad
# l1 soft-threshold
# Entries within [-kappa, kappa] are set exactly to zero; the rest are
# shrunk toward zero by kappa. This is what produces sparse loadings.
idxH <- which(B_temp > kappa)
idxL <- which(B_temp <= -kappa)
B = matrix(0, nrow = nrow(B_temp), ncol = ncol(B_temp))
B[idxH] <- B_temp[idxH] - kappa
B[idxL] <- B_temp[idxL] + kappa
# compute residual
# R <- t(VD) - (t(VD) %*% B) %*% t(A)
# Residual expressed in the V basis: since U is orthonormal,
# ||X - X B t(A)||_F = ||D t(V) - D t(V) B t(A)||_F.
R <- t(VD) - tcrossprod(crossprod(VD, B), A)
# compute objective function
# Note: alpha and beta here are the rescaled values from above.
obj <- c(obj, 0.5 * sum(R**2) + alpha * sum(abs(B)) + 0.5 * beta * sum(B**2))
# Break if obj is not improving anymore
if(noi > 1){
improvement <- (obj[noi-1] - obj[noi]) / obj[noi]
}
# Trace
# NOTE(review): `(noi-1) %% 1 == 0` is always TRUE, so this gates on
# `verbose` alone — the modulo looks like a leftover print-every-m-iters
# knob with m hard-coded to 1.
if(verbose > 0 && (noi-1) %% 1 == 0) {
print(sprintf("Iteration: %4d, Objective: %1.5e, Relative improvement %1.5e", noi, obj[noi], improvement))
}
# Next iter
noi <- noi + 1
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Update spca object
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
spcaObj$loadings <- B
spcaObj$transform <- A
spcaObj$scores <- X %*% B
# Eigenvalues come from the singular values of the final A-step SVD.
# NOTE(review): `svd_update` is undefined if the loop body never ran
# (max_iter < 1) — this would error; confirm max_iter >= 1 is assumed.
spcaObj$eigenvalues <- svd_update$d / (n - 1)
spcaObj$objective <- obj
rownames(spcaObj$loadings) = colnames(X)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Explained variance and explained variance ratio
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
spcaObj$sdev <- sqrt( spcaObj$eigenvalues )
# Total variance of the (centered/scaled) data; real and imaginary parts
# are summed separately for complex input.
spcaObj$var <- sum( apply( Re(X) , 2, stats::var ) )
if(is.complex(X)) spcaObj$var <- Re(spcaObj$var + sum( apply( Im(X) , 2, stats::var ) ))
class(spcaObj) <- "spca"
return( spcaObj )
}
#' Class spcaPath
#'
#' Class \code{spcaPath} stores result of spca_path()
#'
#' @name spcaPath-class
#' @rdname spcaPath-class
#' @exportClass spcaPath
# An spcaPath behaves as a list (one .spca fit per alpha value) and carries
# the non-zero threshold `eps` used when counting active features.
setClass(
  Class = "spcaPath",
  contains = "list",
  slots = c(eps = "numeric")
)
#' Fit sparse PCA path
#'
#' Fit sparse PCA path for many levels of sparsity
#'
#' @param X array_like; \cr
#' a real \eqn{(n, p)} input matrix (or data frame) to be decomposed.
#'
#' @param k integer; \cr
#' specifies the target rank, i.e., the number of components to be computed.
#'
#' @param alpha array; \cr
#' array of values for sparsity controlling parameter. Higher values lead to sparser components.
#'
#' @param beta float; \cr
#' Amount of ridge shrinkage to apply in order to improve conditioning.
#'
#' @param center bool; \cr
#' logical value which indicates whether the variables should be
#' shifted to be zero centered (TRUE by default).
#'
#' @param scale bool; \cr
#' logical value which indicates whether the variables should
#' be scaled to have unit variance (FALSE by default).
#'
#' @param max_iter integer; \cr
#' maximum number of iterations to perform before exiting.
#'
#' @param tol float; \cr
#' stopping tolerance for the convergence criterion.
#'
#' @param verbose bool; \cr
#' logical value which indicates whether progress is printed.
#'
#' @param eps values greater than eps are considered non-zero
#'
#' @description Adapted from sparsepca package to use a "warm start" approach that uses the parameter estimates from the previous value of alpha as initial values
#'
#' @seealso \code{\link{plotPath}}
#' @examples
#'
#' n = 10000
#' p = 100
#' X = matrix(rnorm(n*p), n, p)
#'
#' res = spca_path(X, k=2, center=TRUE, scale=TRUE, alpha=10^-seq(1, 4, length.out=10))
#'
#' plotPath( res )
#'
#' @export
spca_path <- function(X, k=NULL, alpha=10^-seq(1, 4, length.out=100), beta=1e-4, center=TRUE, scale=FALSE, max_iter=1000, tol=1e-5, eps=1e-8, verbose=TRUE){
  # Fit a sparse PCA regularization path over the given alpha values.
  #
  # Fits are run in ascending alpha order so the least-sparse (fullest) model
  # comes first; each subsequent, sparser fit is warm-started from the
  # previous fit's loadings/transform. Returns an spcaPath object (a list of
  # .spca results, one per fitted alpha, with $alpha recorded on each).
  alpha <- sort(alpha)
  spcaRes <- list()
  if (verbose) message("Run sparse PCA regularization path")
  if (verbose) message(sprintf("%-8s%-10s%-8s", "index", "alpha", "# active"))
  for (i in seq_along(alpha)) {
    # Warm start: reuse estimates from the previous (denser) solution.
    init <- if (i > 1) spcaRes[[i - 1]] else NULL
    # Bug fix: max_iter was previously accepted but never forwarded to .spca.
    spcaRes[[i]] <- .spca(X = X, k = k, alpha = alpha[i], beta = beta,
                          center = center, scale = scale, max_iter = max_iter,
                          tol = tol, verbose = FALSE, init = init)
    spcaRes[[i]]$alpha <- alpha[i]
    count <- countNonzero(spcaRes[[i]], eps)
    # Bug fix: the progress line was printed unconditionally; it now
    # respects `verbose`, matching the header messages above.
    if (verbose) {
      message(sprintf("%-8s%-10s%-8s", i,
                      format(alpha[i], digits = 3, scientific = TRUE), count))
    }
    # Once no feature is active, any larger alpha stays empty — stop early.
    if (count == 0) break
  }
  new("spcaPath", spcaRes, eps = eps)
}
# Count the features (rows of the loadings matrix) that are "active", i.e.
# whose largest absolute loading across all components exceeds `eps`.
countNonzero <- function(res, eps = 1e-8) {
  largest_per_feature <- apply(abs(res$loadings), 1, max)
  sum(largest_per_feature > eps)
}
#' Plot sparse PCA regularization path
#'
#' Plot sparse PCA regularization path
#'
#' @param regPath object of class \code{spcaPath}, as returned by \code{spca_path()}
#' @param eps values greater than eps are considered non-zero
#'
#' @seealso \code{\link{spca_path}}
#' @examples
#'
#' n = 10000
#' p = 100
#' X = matrix(rnorm(n*p), n, p)
#'
#' regPath = spca_path(X, k=2, center=TRUE, scale=TRUE, alpha=10^-seq(1, 4, length.out=10))
#'
#' plotPath( regPath )
#'
#' @import ggplot2
#' @export
# Plot the number of active features against alpha for each fit on a sparse
# PCA regularization path produced by spca_path().
plotPath <- function(regPath, eps = 1e-8) {
  # Only accept results produced by spca_path()
  if (!is(regPath, "spcaPath")) {
    stop("Object must be spcaPath")
  }
  # Dummy bindings so R CMD check does not flag the aes() columns as globals
  alpha <- nactive <- NA
  # One row per regularization level: alpha and its count of active features
  path_df <- do.call(rbind, lapply(regPath, function(fit) {
    data.frame(alpha = fit$alpha, nactive = countNonzero(fit, eps))
  }))
  ggplot(path_df, aes(alpha, nactive)) +
    geom_point() +
    geom_line() +
    theme_bw() +
    theme(aspect.ratio = 1, plot.title = element_text(hjust = 0.5)) +
    ylab("# active features") +
    ggtitle("Sparse PCA regularization path")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment