# First attempt at GEKPLS
using Statistics
using ScikitLearn
using LinearAlgebra
@sk_import cross_decomposition: PLSRegression
####### START OF ALL FUNCTIONS ##########
function _ge_compute_pls(X, y, n_comp, grads, delta_x, xlimits, extra_points)
"""
Gradient-enhanced PLS-coefficients.
Parameters
----------
X: [n_obs,dim] - The input variables.
y: [n_obs,ny] - The output variable
n_comp: int - Number of principal components used.
gradients: - The gradient values. Matrix size (n_obs,dim)
delta_x: real - The step used in the First Order Taylor Approximation
xlimits: [dim, 2]- The upper and lower var bounds.
extra_points: int - The number of extra points per each training point.
Returns
-------
Coeff_pls: [dim, n_comp] - The PLS-coefficients.
X: Concatenation of XX: [extra_points*nt, dim] - Extra points added (when extra_points > 0) and X
y: Concatenation of yy[extra_points*nt, 1]- Extra points added (when extra_points > 0) and y
"""
    # this function is equivalent to a combination of
    # https://github.com/SMTorg/smt/blob/f124c01ffa78c04b80221dded278a20123dac742/smt/utils/kriging_utils.py#L1036
    # and https://github.com/SMTorg/smt/blob/f124c01ffa78c04b80221dded278a20123dac742/smt/surrogate_models/gekpls.py#L48
    nt, dim = size(X)
    XX = zeros(0, dim)
    yy = zeros(0, size(y)[2])
    _pls = PLSRegression(n_comp)
    coeff_pls = zeros((dim, n_comp, nt))
    for i in 1:nt
        if dim >= 3 #to do ... this is consistent with SMT but inefficient. Move outside for loop and evaluate dim >= 3 just once.
            bb_vals = circshift(boxbehnken(dim, 1), 1)
            _X = zeros((size(bb_vals)[1], dim))
            _y = zeros((size(bb_vals)[1], 1))
            bb_vals = bb_vals .* (delta_x * (xlimits[:, 2] - xlimits[:, 1]))' #smt calls this sign. I've called it bb_vals
            _X = X[i, :]' .+ bb_vals
            bb_vals = bb_vals .* grads[i, :]'
            _y = y[i, :] .+ sum(bb_vals, dims=2)
        else
            println("GEKPLS for less than 3 dimensions is coming soon")
        end
        _pls.fit(_X, _y)
        coeff_pls[:, :, i] = _pls.x_rotations_ #size of _pls.x_rotations_ is (dim, n_comp)
        if extra_points != 0
            start_index = max(1, length(coeff_pls[:, 1, i]) - extra_points + 1) #todo: evaluate just once
            max_coeff = sortperm(broadcast(abs, coeff_pls[:, 1, i]))[start_index:end]
            for ii in max_coeff
                XX = [XX; transpose(X[i, :])]
                XX[end, ii] += delta_x * (xlimits[ii, 2] - xlimits[ii, 1])
                yy = [yy; y[i]]
                yy[end] += grads[i, ii] * delta_x * (xlimits[ii, 2] - xlimits[ii, 1])
            end
        end
    end
    if extra_points != 0
        X = [X; XX]
        y = [y; yy]
    end
    pls_mean = mean(broadcast(abs, coeff_pls), dims=3)
    return pls_mean, X, y
end
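# Illustrative usage sketch (mirrors the training section further below): with nt = 7 training
# points, dim = 3 and extra_points = 2, the returned inputs/outputs grow to 7 + 2*7 = 21 rows.
# pls_mean_ex, X_aug and y_aug are illustrative names, not variables defined in this script:
#   pls_mean_ex, X_aug, y_aug = _ge_compute_pls(X, y, 2, grads, 0.0001, xlimits, 2)
#   size(X_aug)       #(21, 3)
#   size(pls_mean_ex) #(3, 2, 1)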
######start of bbdesign######
#
# Adapted from 'ExperimentalDesign.jl: Design of Experiments in Julia'
# https://github.com/phrb/ExperimentalDesign.jl
# MIT License
# ExperimentalDesign.jl: Design of Experiments in Julia
# Copyright (C) 2019 Pedro Bruel <pedro.bruel@gmail.com>
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
function boxbehnken(matrix_size::Int)
    boxbehnken(matrix_size, matrix_size)
end
function boxbehnken(matrix_size::Int, center::Int)
    @assert matrix_size >= 3
    A_fact = explicit_fullfactorial(Tuple([-1, 1] for i = 1:2))
    rows = floor(Int, (0.5 * matrix_size * (matrix_size - 1)) * size(A_fact)[1])
    A = zeros(rows, matrix_size)
    l = 0
    for i in 1:matrix_size-1
        for j in i+1:matrix_size
            l = l + 1
            A[max(0, (l - 1) * size(A_fact)[1])+1:l*size(A_fact)[1], i] = A_fact[:, 1]
            A[max(0, (l - 1) * size(A_fact)[1])+1:l*size(A_fact)[1], j] = A_fact[:, 2]
        end
    end
    if center == matrix_size
        if matrix_size <= 16
            points = [0, 0, 3, 3, 6, 6, 6, 8, 9, 10, 12, 12, 13, 14, 15, 16]
            center = points[matrix_size]
        end
    end
    A = transpose(hcat(transpose(A), transpose(zeros(center, matrix_size))))
end
function explicit_fullfactorial(factors::Tuple)
    explicit_fullfactorial(fullfactorial(factors))
end
function explicit_fullfactorial(iterator::Base.Iterators.ProductIterator)
    hcat(vcat.(collect(iterator)...)...)
end
function fullfactorial(factors::Tuple)
    Base.Iterators.product(factors...)
end
######end of bb design######
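# Illustrative note on the design above: for 3 factors, boxbehnken(3) builds the
# 3 factor pairs x 4 factorial runs = 12 edge points and then appends 3 center points
# (taken from the `points` lookup table), so it returns a 15x3 matrix with entries in {-1, 0, 1}:
#   bb = boxbehnken(3) #bb is an illustrative name
#   size(bb) #(15, 3)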
function standardization(X,y)
"""
We substract the mean from each variable. Then, we divide the values of each
variable by its standard deviation.
Parameters
----------
X - The input variables.
y - The output variable.
Returns
-------
X: [n_obs, dim]
The standardized input matrix.
y: [n_obs, 1]
The standardized output vector.
X_offset: The mean (or the min if scale_X_to_unit=True) of each input variable.
y_mean: The mean of the output variable.
X_scale: The standard deviation of each input variable.
y_std: The standard deviation of the output variable.
"""
#Equivalent of https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L21
X_offset = mean(X, dims = 1)
X_scale = std(X, dims = 1)
X_scale = map(x -> (x==0.0) ? x=1 : x=x, X_scale) #to prevent division by 0 below
y_mean = mean(y)
y_std = std(y)
y_std = map(y -> (y==0) ? y=1 : y=y, y_std) #to prevent division by 0 below
X = (X.-X_offset) ./ X_scale
y = (y .- y_mean) ./ y_std
return X, y, X_offset, y_mean, X_scale, y_std
end
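# Illustrative note: the returned statistics let the caller undo the standardization later;
# the prediction section at the bottom of this script applies exactly these transforms:
#   X_norma = (X_new .- X_offset) ./ X_scale   #standardize new inputs
#   y_orig  = y_mean .+ y_std .* y_norma       #map standardized outputs back to the original scale
# (X_new, X_norma, y_norma and y_orig are illustrative names, not variables defined here)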
function cross_distances(X)
"""
Computes the nonzero componentwise cross-distances between the vectors
in X
Parameters
----------
X: [n_obs, dim]
Returns
-------
D: [n_obs * (n_obs - 1) / 2, dim]
- The cross-distances between the vectors in X.
ij: [n_obs * (n_obs - 1) / 2, 2]
- The indices i and j of the vectors in X associated to the cross-
distances in D.
"""
# equivalent of https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L86
n_samples, n_features = size(X)
n_nonzero_cross_dist = ( n_samples * (n_samples - 1) ) ÷ 2
ij = zeros((n_nonzero_cross_dist, 2))
D = zeros((n_nonzero_cross_dist, n_features))
ll_1 = 0
for k in 1:n_samples - 1
ll_0 = ll_1 + 1
ll_1 = ll_0 + n_samples - k - 1
ij[ll_0:ll_1, 1] .= k
ij[ll_0:ll_1, 2] = k+1:1:n_samples
D[ll_0:ll_1, :] = -(X[(k + 1) : n_samples,:] .- X[k,:]')
end
return D, Int.(ij)
end
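# Illustrative sketch: for three 2-D points, cross_distances returns the 3 pairwise
# componentwise differences together with the index pairs (1,2), (1,3), (2,3)
# (D_ex and ij_ex are illustrative names):
#   D_ex, ij_ex = cross_distances([0.0 0.0; 1.0 0.0; 0.0 2.0])
#   D_ex  #[-1.0 0.0; 0.0 -2.0; 1.0 -2.0]
#   ij_ex #[1 2; 1 3; 2 3]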
function componentwise_distance_PLS(D, corr, n_comp, coeff_pls)
"""
Computes the nonzero componentwise cross-spatial-correlation-distance
between the vectors in X.
Equivalent of https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L1257
with some simplifications (removed theta and return_derivative as it's not required for GEKPLS)
Parameters
----------
D: [n_obs * (n_obs - 1) / 2, dim]
- The L1 cross-distances between the vectors in X.
corr: str
- Name of the correlation function used.
squar_exp or abs_exp.
n_comp: int
- Number of principal components used.
coeff_pls: [dim, n_comp]
- The PLS-coefficients.
Returns
-------
D_corr: [n_obs * (n_obs - 1) / 2, n_comp]
- The componentwise cross-spatial-correlation-distance between the
vectors in X.
"""
#equivalent of https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L1257
#todo
#figure out how to handle this computation in the case of very large matrices
#similar to what SMT has done
#at https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L1257
D_corr = zeros((size(D)[1], n_comp))
if corr == "squar_exp"
D_corr = D.^2 * coeff_pls.^2
else #abs_exp
D_corr = abs.(D) * abs.(coeff_pls)
end
return D_corr
end
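# Illustrative note on shapes: in the squar_exp branch, D.^2 is (n_pairs, dim) and
# coeff_pls.^2 is (dim, n_comp), so the matrix product D_corr is (n_pairs, n_comp),
# i.e. every pairwise distance is projected onto the n_comp PLS directions.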
function squar_exp(theta, d)
    #squared-exponential (Gaussian) kernel: r(d) = exp(-sum_k theta_k * d_k)
    n_components = size(d)[2]
    theta = reshape(theta, (1, n_components))
    return exp.(-sum(theta .* d, dims=2))
end
function differences(A,B)
    #equivalent of https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L392
    #to have behavior comparable to SMT,
    #ensure that you send in as
    #differences(B,A)
    #THIS FUNCTION NEEDS TO BE REWRITTEN
    C = zeros(0, size(A)[2]) #empty matrix with the same number of columns as A (was hard-coded to 3 columns)
    for row in eachrow(B)
        C = vcat(C, -A .+ transpose(row))
    end
    return C
end
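# Illustrative sketch of the argument order: this version iterates over the rows of B and
# stacks blocks of row(B) .- A, so differences(A, B) here matches SMT's differences(B, A):
#   differences([1.0 1.0 1.0], [0.0 0.0 0.0; 2.0 2.0 2.0])
#   #returns [-1.0 -1.0 -1.0; 1.0 1.0 1.0]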
function _reduced_likelihood_function(theta, kernel_type, d, nt, ij, y_norma, noise=0.0)
"""
This function is a loose translation of SMT code from
https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/surrogate_models/krg_based.py#L247
It determines the BLUP parameters and evaluates the reduced likelihood function for the given theta.
Parameters
----------
theta: array containing the parameters at which the Gaussian Process model parameters should be determined.
kernel_type: name of the correlation function.
d: The componentwise cross-spatial-correlation-distance between the vectors in X.
nt: number of training points
ij: The indices i and j of the vectors in X associated to the cross-distances in D.
y_norma: Standardized y values
noise: noise hyperparameter - increasing noise reduces reduced_likelihood_function_value
Returns
-------
reduced_likelihood_function_value: real
- The value of the reduced likelihood function associated with the given autocorrelation parameters theta.
beta: Generalized least-squares regression weights
gamma: Gaussian Process weights.
"""
#equivalent of https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/surrogate_models/krg_based.py#L247
reduced_likelihood_function_value = -Inf
nugget = 1000000.0 * eps() #a jitter for numerical stability; reducing the multiple from 1000000.0 results in positive definite error for Cholesky decomposition;
if kernel_type =="squar_exp" #todo - add other kernel types like abs_exp, matern52 etc.
r = squar_exp(theta,d)
end
R = (I + zeros(nt,nt)) .* (1.0 + nugget + noise)
for k in 1:size(ij)[1] #todo - speed up using @inbounds / @simd where required
R[ij[k,1],ij[k,2]]=r[k]
R[ij[k,2],ij[k,1]]=r[k]
end
C = cholesky(R).L #todo - #values diverge at this point from SMT code; verify impact
F = ones(nt,1) #todo - examine if this should be a parameter for this function
Ft = C\F
Q, G = qr(Ft)
Q = Array(Q)
Yt = C\y_norma
#todo - in smt, they check if the matrix is ill-conditioned using SVD. Verify and include if necessary
beta = G \ [(transpose(Q) ⋅ Yt)]
rho = Yt .- (Ft .* beta)
gamma = transpose(C) \ rho
sigma2 = sum((rho).^2, dims=1)/nt
detR = prod(diag(C).^(2.0/nt))
reduced_likelihood_function_value = - nt * log10(sum(sigma2)) - nt * log10(detR)
return beta, gamma, reduced_likelihood_function_value
end
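# Illustrative note on the score returned above: with C the Cholesky factor of R,
# prod(diag(C).^(2/nt)) equals det(R)^(1/nt), so the returned value is
#   -nt*log10(sum(sigma2)) - log10(det(R))
# i.e. a base-10 concentrated log-likelihood (up to constant terms); larger values indicate a better theta.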
####### END OF ALL FUNCTIONS ##########
####### START OF DEFINING INPUT VARIABLES FOR TRAINING ##########
X = [
[ 1. 2. 3.]
[ 4. 5. 6.]
[ 7. 8. 9.]
[10. 11. 12.]
[ 2. 1. 2.]
[ 1. 1. 2.]
[ 2. 2. 1.]
]
y = transpose([ 14. 77. 194. 365. 9. 4. 9. ])
n_comp = 2
grads = [
[ 2. 4. 6.]
[ 8. 10. 12.]
[ 14. 16. 18.]
[ 20. 22. 24.]
[ 4. 2. 4. ]
[ 2. 2. 4. ]
[ 4. 4. 2. ]
]
delta_x = 0.0001
xlimits = [[-10. 10.]
[-10. 10.]
[-10. 10.]]
extra_points = 2
####### END OF DEFINING INPUT VARIABLES FOR TRAINING ##########
####### START OF TRAINING ##########
pls_mean, X_after_PLS, y_after_PLS = _ge_compute_pls(X, y, n_comp, grads, delta_x, xlimits, extra_points)
X_after_std, y_after_std, X_offset, y_mean, X_scale, y_std = standardization(X_after_PLS, y_after_PLS)
D, ij = cross_distances(X_after_std)
pls_mean_reshaped = reshape(pls_mean, (3, 2)) #(dim, n_comp); drops the trailing singleton dimension left by mean(...; dims=3)
d = componentwise_distance_PLS(D, "squar_exp", n_comp, pls_mean_reshaped)
theta = [0.01, 0.01]
nt, nd = size(X_after_PLS)
beta, gamma, reduced_likelihood_function_value = _reduced_likelihood_function(theta, "squar_exp", d, nt, ij, y_after_std)
####### END OF TRAINING ##########
####### START OF PREDICTING ##########
X_test = reshape([2.0,2.0,3.0],(1,3))
#X_test = [2.0 2.0 2.0; 2.5 2.5 2.5] #todo fix error in multi-row case
n_eval, n_features_x = size(X_test)
X_cont = (X_test .- X_offset) ./ X_scale
dx = differences(X_cont, X_after_std)
pred_d = componentwise_distance_PLS(dx, "squar_exp", n_comp, pls_mean_reshaped)
r = squar_exp(theta, pred_d)
f = ones(n_eval,1)
y_ = (f ⋅ beta) + (r ⋅ gamma)
y_pred = y_mean + y_std * y_ #predicted value on the original (unstandardized) scale
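# Illustrative note: the two terms above form the standard Kriging mean predictor,
# y_hat = f(x)'*beta + r(x)'*gamma, computed on standardized data and then mapped back
# to the original scale with y_mean and y_std.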
####### END OF PREDICTING ##########