# First attempt at GEKPLS
using Statistics
using ScikitLearn
using LinearAlgebra
@sk_import cross_decomposition: PLSRegression
####### START OF ALL FUNCTIONS ##########
function _ge_compute_pls(X, y, n_comp, grads, delta_x, xlimits, extra_points)
"""
Gradient-enhanced PLS-coefficients.
Parameters
----------
X: [n_obs,dim] - The input variables.
y: [n_obs,ny] - The output variable
n_comp: int - Number of principal components used.
gradients: - The gradient values. Matrix size (n_obs,dim)
delta_x: real - The step used in the First Order Taylor Approximation
xlimits: [dim, 2]- The upper and lower var bounds.
extra_points: int - The number of extra points per each training point.
Returns
-------
Coeff_pls: [dim, n_comp] - The PLS-coefficients.
X: Concatenation of XX: [extra_points*nt, dim] - Extra points added (when extra_points > 0) and X
y: Concatenation of yy[extra_points*nt, 1]- Extra points added (when extra_points > 0) and y
"""
    # this function is equivalent to a combination of
    # https://github.com/SMTorg/smt/blob/f124c01ffa78c04b80221dded278a20123dac742/smt/utils/kriging_utils.py#L1036
    # and https://github.com/SMTorg/smt/blob/f124c01ffa78c04b80221dded278a20123dac742/smt/surrogate_models/gekpls.py#L48
    nt, dim = size(X)
    XX = zeros(0, dim)
    yy = zeros(0, size(y)[2])
    _pls = PLSRegression(n_comp)
    coeff_pls = zeros((dim, n_comp, nt))
    for i in 1:nt
        if dim >= 3 #to do ... this is consistent with SMT but inefficient. Move outside for loop and evaluate dim >= 3 just once.
            bb_vals = circshift(boxbehnken(dim, 1), 1)
            _X = zeros((size(bb_vals)[1], dim))
            _y = zeros((size(bb_vals)[1], 1))
            bb_vals = bb_vals .* (delta_x * (xlimits[:, 2] - xlimits[:, 1]))' #smt calls this sign. I've called it bb_vals
            _X = X[i, :]' .+ bb_vals
            bb_vals = bb_vals .* grads[i, :]'
            _y = y[i, :] .+ sum(bb_vals, dims=2)
        else
            println("GEKPLS for less than 3 dimensions is coming soon")
        end
        _pls.fit(_X, _y)
        coeff_pls[:, :, i] = _pls.x_rotations_ #size of _pls.x_rotations_ is (dim, n_comp)
        if extra_points != 0
            start_index = max(1, length(coeff_pls[:, 1, i]) - extra_points + 1) #todo: evaluate just once
            max_coeff = sortperm(broadcast(abs, coeff_pls[:, 1, i]))[start_index:end]
            for ii in max_coeff
                XX = [XX; transpose(X[i, :])]
                XX[end, ii] += delta_x * (xlimits[ii, 2] - xlimits[ii, 1])
                yy = [yy; y[i]]
                yy[end] += grads[i, ii] * delta_x * (xlimits[ii, 2] - xlimits[ii, 1])
            end
        end
    end
    if extra_points != 0
        X = [X; XX]
        y = [y; yy]
    end
    pls_mean = mean(broadcast(abs, coeff_pls), dims=3)
    return pls_mean, X, y
end
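# Illustrative usage sketch (mirrors the training section further below): with nt = 7 training
# points, dim = 3 and extra_points = 2, the returned inputs/outputs grow to 7 + 2*7 = 21 rows.
# pls_mean_ex, X_aug and y_aug are illustrative names, not variables defined in this script:
#   pls_mean_ex, X_aug, y_aug = _ge_compute_pls(X, y, 2, grads, 0.0001, xlimits, 2)
#   size(X_aug)       #(21, 3)
#   size(pls_mean_ex) #(3, 2, 1)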
######start of bbdesign######
#
# Adapted from 'ExperimentalDesign.jl: Design of Experiments in Julia'
# https://github.com/phrb/ExperimentalDesign.jl
# MIT License
# ExperimentalDesign.jl: Design of Experiments in Julia
# Copyright (C) 2019 Pedro Bruel <pedro.bruel@gmail.com>
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
function boxbehnken(matrix_size::Int)
    boxbehnken(matrix_size, matrix_size)
end
function boxbehnken(matrix_size::Int, center::Int)
    @assert matrix_size >= 3
    A_fact = explicit_fullfactorial(Tuple([-1, 1] for i = 1:2))
    rows = floor(Int, (0.5 * matrix_size * (matrix_size - 1)) * size(A_fact)[1])
    A = zeros(rows, matrix_size)
    l = 0
    for i in 1:matrix_size-1
        for j in i+1:matrix_size
            l = l + 1
            A[max(0, (l - 1) * size(A_fact)[1])+1:l*size(A_fact)[1], i] = A_fact[:, 1]
            A[max(0, (l - 1) * size(A_fact)[1])+1:l*size(A_fact)[1], j] = A_fact[:, 2]
        end
    end
    if center == matrix_size
        if matrix_size <= 16
            points = [0, 0, 3, 3, 6, 6, 6, 8, 9, 10, 12, 12, 13, 14, 15, 16]
            center = points[matrix_size]
        end
    end
    A = transpose(hcat(transpose(A), transpose(zeros(center, matrix_size))))
end
function explicit_fullfactorial(factors::Tuple)
    explicit_fullfactorial(fullfactorial(factors))
end
function explicit_fullfactorial(iterator::Base.Iterators.ProductIterator)
    hcat(vcat.(collect(iterator)...)...)
end
function fullfactorial(factors::Tuple)
    Base.Iterators.product(factors...)
end
######end of bb design######
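# Illustrative note on the design above: for 3 factors, boxbehnken(3) builds the
# 3 factor pairs x 4 factorial runs = 12 edge points and then appends 3 center points
# (taken from the `points` lookup table), so it returns a 15x3 matrix with entries in {-1, 0, 1}:
#   bb = boxbehnken(3) #bb is an illustrative name
#   size(bb) #(15, 3)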
function standardization(X,y)
"""
We substract the mean from each variable. Then, we divide the values of each
variable by its standard deviation.
Parameters
----------
X - The input variables.
y - The output variable.
Returns
-------
X: [n_obs, dim]
The standardized input matrix.
y: [n_obs, 1]
The standardized output vector.
X_offset: The mean (or the min if scale_X_to_unit=True) of each input variable.
y_mean: The mean of the output variable.
X_scale: The standard deviation of each input variable.
y_std: The standard deviation of the output variable.
"""
#Equivalent of https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L21
X_offset = mean(X, dims = 1)
X_scale = std(X, dims = 1)
X_scale = map(x -> (x==0.0) ? x=1 : x=x, X_scale) #to prevent division by 0 below
y_mean = mean(y)
y_std = std(y)
y_std = map(y -> (y==0) ? y=1 : y=y, y_std) #to prevent division by 0 below
X = (X.-X_offset) ./ X_scale
y = (y .- y_mean) ./ y_std
return X, y, X_offset, y_mean, X_scale, y_std
end
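# Illustrative note: the returned statistics let the caller undo the standardization later;
# the prediction section at the bottom of this script applies exactly these transforms:
#   X_norma = (X_new .- X_offset) ./ X_scale   #standardize new inputs
#   y_orig  = y_mean .+ y_std .* y_norma       #map standardized outputs back to the original scale
# (X_new, X_norma, y_norma and y_orig are illustrative names, not variables defined here)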
function cross_distances(X)
"""
Computes the nonzero componentwise cross-distances between the vectors
in X
Parameters
----------
X: [n_obs, dim]
Returns
-------
D: [n_obs * (n_obs - 1) / 2, dim]
- The cross-distances between the vectors in X.
ij: [n_obs * (n_obs - 1) / 2, 2]
- The indices i and j of the vectors in X associated to the cross-
distances in D.
"""
# equivalent of https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L86
n_samples, n_features = size(X)
n_nonzero_cross_dist = ( n_samples * (n_samples - 1) ) ÷ 2
ij = zeros((n_nonzero_cross_dist, 2))
D = zeros((n_nonzero_cross_dist, n_features))
ll_1 = 0
for k in 1:n_samples - 1
ll_0 = ll_1 + 1
ll_1 = ll_0 + n_samples - k - 1
ij[ll_0:ll_1, 1] .= k
ij[ll_0:ll_1, 2] = k+1:1:n_samples
D[ll_0:ll_1, :] = -(X[(k + 1) : n_samples,:] .- X[k,:]')
end
return D, Int.(ij)
end
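# Illustrative sketch: for three 2-D points, cross_distances returns the 3 pairwise
# componentwise differences together with the index pairs (1,2), (1,3), (2,3)
# (D_ex and ij_ex are illustrative names):
#   D_ex, ij_ex = cross_distances([0.0 0.0; 1.0 0.0; 0.0 2.0])
#   D_ex  #[-1.0 0.0; 0.0 -2.0; 1.0 -2.0]
#   ij_ex #[1 2; 1 3; 2 3]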
function componentwise_distance_PLS(D, corr, n_comp, coeff_pls)
"""
Computes the nonzero componentwise cross-spatial-correlation-distance
between the vectors in X.
Equivalent of https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L1257
with some simplifications (removed theta and return_derivative as it's not required for GEKPLS)
Parameters
----------
D: [n_obs * (n_obs - 1) / 2, dim]
- The L1 cross-distances between the vectors in X.
corr: str
- Name of the correlation function used.
squar_exp or abs_exp.
n_comp: int
- Number of principal components used.
coeff_pls: [dim, n_comp]
- The PLS-coefficients.
Returns
-------
D_corr: [n_obs * (n_obs - 1) / 2, n_comp]
- The componentwise cross-spatial-correlation-distance between the
vectors in X.
"""
#equivalent of https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L1257
#todo
#figure out how to handle this computation in the case of very large matrices
#similar to what SMT has done
#at https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L1257
D_corr = zeros((size(D)[1], n_comp))
if corr == "squar_exp"
D_corr = D.^2 * coeff_pls.^2
else #abs_exp
D_corr = abs.(D) * abs.(coeff_pls)
end
return D_corr
end
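# Illustrative note on shapes: in the squar_exp branch, D.^2 is (n_pairs, dim) and
# coeff_pls.^2 is (dim, n_comp), so the matrix product D_corr is (n_pairs, n_comp),
# i.e. every pairwise distance is projected onto the n_comp PLS directions.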
function squar_exp(theta, d)
    #squared-exponential (Gaussian) kernel: r(d) = exp(-sum_k theta_k * d_k)
    n_components = size(d)[2]
    theta = reshape(theta, (1, n_components))
    return exp.(-sum(theta .* d, dims=2))
end
function differences(A,B)
    #equivalent of https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/utils/kriging_utils.py#L392
    #to have behavior comparable to SMT,
    #ensure that you send in as
    #differences(B,A)
    #THIS FUNCTION NEEDS TO BE REWRITTEN
    C = zeros(0, size(A)[2]) #empty matrix with the same number of columns as A (was hard-coded to 3 columns)
    for row in eachrow(B)
        C = vcat(C, -A .+ transpose(row))
    end
    return C
end
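# Illustrative sketch of the argument order: this version iterates over the rows of B and
# stacks blocks of row(B) .- A, so differences(A, B) here matches SMT's differences(B, A):
#   differences([1.0 1.0 1.0], [0.0 0.0 0.0; 2.0 2.0 2.0])
#   #returns [-1.0 -1.0 -1.0; 1.0 1.0 1.0]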
function _reduced_likelihood_function(theta, kernel_type, d, nt, ij, y_norma, noise=0.0)
"""
This function is a loose translation of SMT code from
https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/surrogate_models/krg_based.py#L247
It determines the BLUP parameters and evaluates the reduced likelihood function for the given theta.
Parameters
----------
theta: array containing the parameters at which the Gaussian Process model parameters should be determined.
kernel_type: name of the correlation function.
d: The componentwise cross-spatial-correlation-distance between the vectors in X.
nt: number of training points
ij: The indices i and j of the vectors in X associated to the cross-distances in D.
y_norma: Standardized y values
noise: noise hyperparameter - increasing noise reduces reduced_likelihood_function_value
Returns
-------
reduced_likelihood_function_value: real
- The value of the reduced likelihood function associated with the given autocorrelation parameters theta.
beta: Generalized least-squares regression weights
gamma: Gaussian Process weights.
"""
#equivalent of https://github.com/SMTorg/smt/blob/4a4df255b9259965439120091007f9852f41523e/smt/surrogate_models/krg_based.py#L247
reduced_likelihood_function_value = -Inf
nugget = 1000000.0 * eps() #a jitter for numerical stability; reducing the multiple from 1000000.0 results in positive definite error for Cholesky decomposition;
if kernel_type =="squar_exp" #todo - add other kernel types like abs_exp, matern52 etc.
r = squar_exp(theta,d)
end
R = (I + zeros(nt,nt)) .* (1.0 + nugget + noise)
for k in 1:size(ij)[1] #todo - speed up using @inbounds / @simd where required
R[ij[k,1],ij[k,2]]=r[k]
R[ij[k,2],ij[k,1]]=r[k]
end
C = cholesky(R).L #todo - #values diverge at this point from SMT code; verify impact
F = ones(nt,1) #todo - examine if this should be a parameter for this function
Ft = C\F
Q, G = qr(Ft)
Q = Array(Q)
Yt = C\y_norma
#todo - in smt, they check if the matrix is ill-conditioned using SVD. Verify and include if necessary
beta = G \ [(transpose(Q) ⋅ Yt)]
rho = Yt .- (Ft .* beta)
gamma = transpose(C) \ rho
sigma2 = sum((rho).^2, dims=1)/nt
detR = prod(diag(C).^(2.0/nt))
reduced_likelihood_function_value = - nt * log10(sum(sigma2)) - nt * log10(detR)
return beta, gamma, reduced_likelihood_function_value
end
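# Illustrative note on the score returned above: with C the Cholesky factor of R,
# prod(diag(C).^(2/nt)) equals det(R)^(1/nt), so the returned value is
#   -nt*log10(sum(sigma2)) - log10(det(R))
# i.e. a base-10 concentrated log-likelihood (up to constant terms); larger values indicate a better theta.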
####### END OF ALL FUNCTIONS ##########
####### START OF DEFINING INPUT VARIABLES FOR TRAINING ##########
X = [
[ 1. 2. 3.]
[ 4. 5. 6.]
[ 7. 8. 9.]
[10. 11. 12.]
[ 2. 1. 2.]
[ 1. 1. 2.]
[ 2. 2. 1.]
]
y = transpose([ 14. 77. 194. 365. 9. 4. 9. ])
n_comp = 2
grads = [
[ 2. 4. 6.]
[ 8. 10. 12.]
[ 14. 16. 18.]
[ 20. 22. 24.]
[ 4. 2. 4. ]
[ 2. 2. 4. ]
[ 4. 4. 2. ]
]
delta_x = 0.0001
xlimits = [[-10. 10.]
[-10. 10.]
[-10. 10.]]
extra_points = 2
####### END OF DEFINING INPUT VARIABLES FOR TRAINING ##########
####### START OF TRAINING ##########
pls_mean, X_after_PLS, y_after_PLS = _ge_compute_pls(X, y, n_comp, grads, delta_x, xlimits, extra_points)
X_after_std, y_after_std, X_offset, y_mean, X_scale, y_std = standardization(X_after_PLS, y_after_PLS)
D, ij = cross_distances(X_after_std)
pls_mean_reshaped = reshape(pls_mean, (3, 2)) #(dim, n_comp); drops the trailing singleton dimension left by mean(...; dims=3)
d = componentwise_distance_PLS(D, "squar_exp", n_comp, pls_mean_reshaped)
theta = [0.01, 0.01]
nt, nd = size(X_after_PLS)
beta, gamma, reduced_likelihood_function_value = _reduced_likelihood_function(theta, "squar_exp", d, nt, ij, y_after_std)
####### END OF TRAINING ##########
####### START OF PREDICTING ##########
X_test = reshape([2.0,2.0,3.0],(1,3))
#X_test = [2.0 2.0 2.0; 2.5 2.5 2.5] #todo fix error in multi-row case
n_eval, n_features_x = size(X_test)
X_cont = (X_test .- X_offset) ./ X_scale
dx = differences(X_cont, X_after_std)
pred_d = componentwise_distance_PLS(dx, "squar_exp", n_comp, pls_mean_reshaped)
r = squar_exp(theta, pred_d)
f = ones(n_eval,1)
y_ = (f ⋅ beta) + (r ⋅ gamma)
y_pred = y_mean + y_std * y_ #predicted value on the original (unstandardized) scale
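# Illustrative note: the two terms above form the standard Kriging mean predictor,
# y_hat = f(x)'*beta + r(x)'*gamma, computed on standardized data and then mapped back
# to the original scale with y_mean and y_std.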
####### END OF PREDICTING ##########