Last active
February 5, 2023 10:34
-
-
Save youheiakimoto/2fb26c0ace43c22b8f19c7796e69e108 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division # use // for integer division | |
from __future__ import absolute_import # use from . import | |
from __future__ import print_function # print function | |
from __future__ import unicode_literals # all the strings are unicode | |
__author__ = 'Youhei Akimoto' | |
import warnings | |
from math import sqrt, exp, log, ceil, floor | |
import numpy as np | |
from numpy.random import randn | |
from numpy.linalg import norm, eigh | |
class VkdCma(object): | |
"""O(N*k^2 + k^3) Time/Space Variant of CMA-ES with C = D * (I + V * V^T) * D | |
References | |
---------- | |
[1] Youhei Akimoto and Nikolaus Hansen. | |
Online Model Selection for Restricted Covariance Matrix Adaptation. | |
In Proc. of PPSN 2016, pp. 3--13 (2016) | |
[2] Youhei Akimoto and Nikolaus Hansen. | |
Projection-Based Restricted Covariance Matrix Adaptation for High | |
Dimension. In Proc. of GECCO 2016, pp. 197--204 (2016) | |
""" | |
def __init__(self, func, xmean0, sigma0, **kwargs): | |
# ES Parameters | |
self.N = len(xmean0) | |
self.lam = kwargs.get('lam', int(4 + floor(3 * log(self.N)))) | |
wtemp = np.array([ | |
np.log(np.float(self.lam + 1) / 2.0) - np.log(1 + i) | |
for i in range(self.lam // 2) | |
]) | |
self.w = kwargs.get('w', wtemp / np.sum(wtemp)) | |
self.sqrtw = np.sqrt(self.w) | |
self.mueff = 1.0 / (self.w**2).sum() | |
self.mu = self.w.shape[0] | |
self.neval = 0 | |
# Arguments | |
self.func = func | |
self.xmean = np.array(xmean0) | |
if isinstance(sigma0, np.ndarray): | |
self.sigma = np.exp(np.log(sigma0).mean()) | |
self.D = sigma0 / self.sigma | |
else: | |
self.sigma = sigma0 | |
self.D = np.ones(self.N) | |
# VkD Static Parameters | |
self.k = kwargs.get('k_init', 0) # alternatively, self.w.shape[0] | |
self.kmin = kwargs.get('kmin', 0) | |
self.kmax = kwargs.get('kmax', self.N - 1) | |
assert (0 <= self.kmin <= self.kmax < self.N) | |
self.k_inc_cond = kwargs.get('k_inc_cond', 30.0) | |
self.k_dec_cond = kwargs.get('k_dec_cond', self.k_inc_cond) | |
self.k_adapt_factor = kwargs.get('k_adapt_factor', 1.414) | |
self.factor_sigma_slope = kwargs.get('factor_sigma_slope', 0.1) | |
self.factor_diag_slope = kwargs.get( | |
'factor_diag_slope', 1.0) # 0.3 in PPSN (due to cc change) | |
self.opt_conv = 0.5 * min(1, self.lam / self.N) | |
self.accepted_slowdown = max(1., self.k_inc_cond / 10.) | |
self.k_adapt_decay = 1.0 / self.N | |
self.k_adapt_wait = 2.0 / self.k_adapt_decay - 1 | |
# VkD Dynamic Parameters | |
self.k_active = 0 | |
self.last_log_sigma = np.log(self.sigma) | |
self.last_log_d = 2.0 * np.log(self.D) | |
self.last_log_cond_corr = np.zeros(self.N) | |
self.ema_log_sigma = ExponentialMovingAverage( | |
decay=self.opt_conv / self.accepted_slowdown, dim=1) | |
self.ema_log_d = ExponentialMovingAverage( | |
decay=self.k_adapt_decay, dim=self.N) | |
self.ema_log_s = ExponentialMovingAverage( | |
decay=self.k_adapt_decay, dim=self.N) | |
self.itr_after_k_inc = 0 | |
# CMA Learning Rates | |
self.cm = kwargs.get('cm', 1.0) | |
(self.cone, self.cmu, self.cc) = self._get_learning_rate(self.k) | |
# TPA Parameters | |
self.cs = kwargs.get('cs', 0.3) | |
self.ds = kwargs.get('ds', np.sqrt(self.N)) # or 4 - 3/N | |
self.flg_injection = False | |
self.ps = 0 | |
# Initialize Dynamic Parameters | |
self.V = np.zeros((self.k, self.N)) | |
self.S = np.zeros(self.N) | |
self.pc = np.zeros(self.N) | |
self.dx = np.zeros(self.N) | |
self.U = np.zeros((self.N, self.k + self.mu + 1)) | |
self.arx = np.zeros((self.lam, self.N)) | |
self.arf = np.zeros(self.lam) | |
# Stopping Condition | |
self.fhist_len = 20 + self.N // self.lam | |
self.tolf_checker = TolfChecker(self.fhist_len) | |
self.ftarget = kwargs.get('ftarget', 1e-8) | |
self.maxeval = kwargs.get('maxeval', 5e3 * self.N * self.lam) | |
self.tolf = kwargs.get('tolf', abs(self.ftarget) / 1e5) | |
self.tolfrel = kwargs.get('tolfrel', 1e-12) | |
self.minstd = kwargs.get('minstd', 1e-12) | |
self.minstdrel = kwargs.get('minstdrel', 1e-12) | |
self.maxconds = kwargs.get('maxconds', 1e12) | |
self.maxcondd = kwargs.get('maxcondd', 1e6) | |
# Other Options | |
self.batch_evaluation = kwargs.get('batch_evaluation', False) | |
def run(self): | |
itr = 0 | |
satisfied = False | |
while not satisfied: | |
itr += 1 | |
self._onestep() | |
satisfied, condition = self._check() | |
if itr % 20 == 0: | |
print(itr, self.neval, self.arf.min(), self.sigma) | |
if satisfied: | |
print(condition) | |
return self.xmean | |
def _onestep(self): | |
# ====================================================================== | |
# VkD-CMA (GECCO 2016) | |
k = self.k | |
ka = self.k_active | |
# Sampling | |
if True: | |
# Sampling with two normal vectors | |
# Available only if S >= 0 | |
arzd = randn(self.lam, self.N) | |
arzv = randn(self.lam, ka) | |
ary = (arzd + np.dot(arzv * np.sqrt(self.S[:ka]), self.V[:ka]) | |
) * self.D | |
else: | |
# Sampling with one normal vectors | |
# Available even if S < 0 as long as V are orthogonal to each other | |
arz = randn(self.lam, self.N) | |
ary = arz + np.dot( | |
np.dot(arz, self.V[:ka].T) * | |
(np.sqrt(1.0 + self.S[:ka]) - 1.0), self.V[:ka]) | |
ary *= self.D | |
# Injection | |
if self.flg_injection: | |
mnorm = self._mahalanobis_square_norm(self.dx) | |
dy = (norm(randn(self.N)) / sqrt(mnorm)) * self.dx | |
ary[0] = dy | |
ary[1] = -dy | |
self.arx = self.xmean + self.sigma * ary | |
# Evaluation | |
if self.batch_evaluation: | |
self.arf = self.func(self.arx) | |
self.neval += self.lam | |
else: | |
self.arf = np.zeros(self.lam) | |
for i in range(self.lam): | |
self.arf[i] = self.func(self.arx[i]) | |
self.neval += 1 | |
idx = np.argsort(self.arf) | |
if not np.all(self.arf[idx[1:]] - self.arf[idx[:-1]] > 0.): | |
warnings.warn("assumed no tie, but there exists", RuntimeWarning) | |
sary = ary[idx[:self.mu]] | |
# Update xmean | |
self.dx = np.dot(self.w, sary) | |
self.dz = self._inv_sqrt_ivv(self.dx / self.D) # For flg_dev_kadapt | |
self.xmean += (self.cm * self.sigma) * self.dx | |
# TPA (PPSN 2014 version) | |
if self.flg_injection: | |
alpha_act = np.where(idx == 1)[0][0] - np.where(idx == 0)[0][0] | |
alpha_act /= float(self.lam - 1) | |
self.ps += self.cs * (alpha_act - self.ps) | |
self.sigma *= exp(self.ps / self.ds) | |
hsig = self.ps < 0.5 | |
else: | |
self.flg_injection = True | |
hsig = True | |
# Cumulation | |
self.pc = (1 - self.cc) * self.pc + hsig * sqrt(self.cc * (2 - self.cc) | |
* self.mueff) * self.dx | |
# Update V, S and D | |
# Cov = D(alpha**2 * I + UU^t)D | |
if self.cmu == 0.0: | |
rankU = ka + 1 | |
alpha = sqrt( | |
abs(1 - self.cmu - self.cone + self.cone * (1 - hsig) * self.cc | |
* (2 - self.cc))) | |
self.U[:, :ka] = (self.V[:ka].T * (np.sqrt(self.S[:ka]) * alpha)) | |
self.U[:, rankU - 1] = sqrt(self.cone) * (self.pc / self.D) | |
elif self.cone == 0.0: | |
rankU = ka + self.mu | |
alpha = sqrt( | |
abs(1 - self.cmu - self.cone + self.cone * (1 - hsig) * self.cc | |
* (2 - self.cc))) | |
self.U[:, :ka] = (self.V[:ka].T * (np.sqrt(self.S[:ka]) * alpha)) | |
self.U[:, ka:rankU] = sqrt(self.cmu) * self.sqrtw * (sary / | |
self.D).T | |
else: | |
rankU = ka + self.mu + 1 | |
alpha = sqrt( | |
abs(1 - self.cmu - self.cone + self.cone * (1 - hsig) * self.cc | |
* (2 - self.cc))) | |
self.U[:, :ka] = (self.V[:ka].T * (np.sqrt(self.S[:ka]) * alpha)) | |
self.U[:, ka:rankU - 1] = sqrt(self.cmu) * self.sqrtw * (sary / | |
self.D).T | |
self.U[:, rankU - 1] = sqrt(self.cone) * (self.pc / self.D) | |
if self.N > rankU: | |
# O(Nk^2 + k^3) | |
DD, R = eigh(np.dot(self.U[:, :rankU].T, self.U[:, :rankU])) | |
idxeig = np.argsort(DD)[::-1] | |
gamma = 0 if rankU <= k else DD[idxeig[k:]].sum() / (self.N - k) | |
beta = alpha * alpha + gamma | |
self.k_active = ka = min(np.sum(DD >= 0), k) | |
self.S[:ka] = (DD[idxeig[:ka]] - gamma) / beta | |
self.V[:ka] = (np.dot(self.U[:, :rankU], R[:, idxeig[:ka]]) / | |
np.sqrt(DD[idxeig[:ka]])).T | |
else: | |
# O(N^3 + N^2(k+mu+1)) | |
# If this is the case, the standard CMA is preferred | |
DD, L = eigh(np.dot(self.U[:, :rankU], self.U[:, :rankU].T)) | |
idxeig = np.argsort(DD)[::-1] | |
gamma = 0 if rankU <= k else DD[idxeig[k:]].sum() / (self.N - k) | |
beta = alpha * alpha + gamma | |
self.k_active = ka = min(np.sum(DD >= 0), k) | |
self.S[:ka] = (DD[idxeig[:ka]] - gamma) / beta | |
self.V[:ka] = L[:, idxeig[:ka]].T | |
self.D *= np.sqrt( | |
(alpha * alpha + np.sum( | |
self.U[:, :rankU] * self.U[:, :rankU], axis=1)) / | |
(1.0 + np.dot(self.S[:ka], self.V[:ka] * self.V[:ka]))) | |
# Covariance Normalization by Its Determinant | |
gmean_eig = np.exp(self._get_log_determinant_of_cov() / self.N / 2.0) | |
self.D /= gmean_eig | |
self.pc /= gmean_eig | |
# ====================================================================== | |
# k-Adaptation (PPSN 2016) | |
self.itr_after_k_inc += 1 | |
# Exponential Moving Average | |
self.ema_log_sigma.update(log(self.sigma) - self.last_log_sigma) | |
self.lnsigma_change = self.ema_log_sigma.M / (self.opt_conv / | |
self.accepted_slowdown) | |
self.last_log_sigma = log(self.sigma) | |
self.ema_log_d.update(2. * np.log(self.D) + np.log(1 + np.dot( | |
self.S[:self.k], self.V[:self.k]**2)) - self.last_log_d) | |
self.lndiag_change = self.ema_log_d.M / (self.cmu + self.cone) | |
self.last_log_d = 2. * np.log( | |
self.D) + np.log(1 + np.dot(self.S[:self.k], self.V[:self.k]**2)) | |
self.ema_log_s.update(np.log(1 + self.S) - self.last_log_cond_corr) | |
self.lnlambda_change = self.ema_log_s.M / (self.cmu + self.cone) | |
self.last_log_cond_corr = np.log(1 + self.S) | |
# Check for adaptation condition | |
flg_k_increase = self.itr_after_k_inc > self.k_adapt_wait | |
flg_k_increase *= self.k < self.kmax | |
flg_k_increase *= np.all((1 + self.S[:self.k]) > self.k_inc_cond) | |
flg_k_increase *= ( | |
np.abs(self.lnsigma_change) < self.factor_sigma_slope) | |
flg_k_increase *= np.all( | |
np.abs(self.lndiag_change) < self.factor_diag_slope) | |
# print(self.itr_after_k_inc > self.k_adapt_wait, self.k < self.kmax, | |
# np.all((1 + self.S[:self.k]) > self.k_inc_cond), | |
# np.abs(self.lnsigma_change) < self.factor_sigma_slope, | |
# np.percentile(np.abs(self.lndiag_change), [1, 50, 99])) | |
flg_k_decrease = (self.k > self.kmin) * ( | |
1 + self.S[:self.k] < self.k_dec_cond) | |
flg_k_decrease *= (self.lnlambda_change[:self.k] < 0.) | |
if (self.itr_after_k_inc > self.k_adapt_wait) and flg_k_increase: | |
# ----- Increasing k ----- | |
self.k_active = k | |
self.k = newk = min( | |
max(int(ceil(self.k * self.k_adapt_factor)), self.k + 1), | |
self.kmax) | |
self.V = np.vstack((self.V, np.zeros((newk - k, self.N)))) | |
self.U = np.empty((self.N, newk + self.mu + 1)) | |
# update constants | |
(self.cone, self.cmu, self.cc) = self._get_learning_rate(self.k) | |
self.itr_after_k_inc = 0 | |
elif self.itr_after_k_inc > k * self.k_adapt_wait and np.any( | |
flg_k_decrease): | |
# ----- Decreasing k ----- | |
flg_keep = np.logical_not(flg_k_decrease) | |
new_k = max(np.count_nonzero(flg_keep), self.kmin) | |
self.V = self.V[flg_keep] | |
self.S[:new_k] = (self.S[:flg_keep.shape[0]])[flg_keep] | |
self.S[new_k:] = 0 | |
self.k = self.k_active = new_k | |
# update constants | |
(self.cone, self.cmu, self.cc) = self._get_learning_rate(self.k) | |
# ============================================================================== | |
# Covariance Normalization by Its Determinant | |
gmean_eig = exp(self._get_log_determinant_of_cov() / self.N / 2.0) | |
self.D /= gmean_eig | |
self.pc /= gmean_eig | |
def _mahalanobis_square_norm(self, dx): | |
"""Square norm of dx w.r.t. C = D*(I + V*S*V^t)*D | |
Parameters | |
---------- | |
dx : numpy.ndarray (1D) | |
Returns | |
------- | |
square of the Mahalanobis distance dx^t * (D*(I + V*S*V^t)*D)^{-1} * dx | |
""" | |
D = self.D | |
V = self.V[:self.k_active] | |
S = self.S[:self.k_active] | |
dy = dx / D | |
vdy = np.dot(V, dy) | |
return np.sum(dy * dy) - np.sum((vdy * vdy) * (S / (S + 1.0))) | |
def _inv_sqrt_ivv(self, vec): | |
"""Return (I + V*V^t)^{-1/2} x""" | |
if self.k_active == 0: | |
return vec | |
else: | |
return vec + np.dot( | |
np.dot(self.V[:self.k_active], vec) * | |
(1.0 / np.sqrt(1.0 + self.S[:self.k_active]) - 1.0 | |
), self.V[:self.k_active]) | |
def _get_learning_rate(self, k): | |
"""Return the learning rate cone, cmu, cc depending on k | |
Parameters | |
---------- | |
k : int | |
the number of vectors for covariance matrix | |
Returns | |
------- | |
cone, cmu, cc : float in [0, 1]. Learning rates for rank-one, rank-mu, | |
and the cumulation factor for rank-one. | |
""" | |
nelem = self.N * (k + 1) | |
cone = 2.0 / (nelem + self.N + 2 * (k + 2) + self.mueff) # PPSN 2016 | |
# cone = 2.0 / (nelem + 2 * (k + 2) + self.mueff) # GECCO 2016 | |
# cc = (4 + self.mueff / self.N) / ( | |
# (self.N + 2 * (k + 1)) / 3 + 4 + 2 * self.mueff / self.N) | |
# New Cc and C1: Best Cc depends on C1, not directory on K. | |
# Observations on Cigar (N = 3, 10, 30, 100, 300, 1000) by Rank-1 VkD. | |
cc = sqrt(cone) | |
cmu = min(1 - cone, 2.0 * (self.mueff - 2 + 1.0 / self.mueff) / | |
(nelem + 4 * (k + 2) + self.mueff)) | |
return cone, cmu, cc | |
def _get_log_determinant_of_cov(self): | |
return 2.0 * np.sum(np.log(self.D)) + np.sum( | |
np.log(1.0 + self.S[:self.k_active])) | |
def _check(self): | |
is_satisfied = False | |
condition = '' | |
self.tolf_checker.update(self.arf) | |
std = self.sigma * exp(self._get_log_determinant_of_cov() / self.N / | |
2.0) | |
if self.arf.min() <= self.ftarget: | |
is_satisfied = True | |
condition = 'ftarget' | |
if not is_satisfied and self.neval >= self.maxeval: | |
is_satisfied = True | |
condition = 'maxeval' | |
if not is_satisfied and self.tolf_checker.check_relative(self.tolfrel): | |
is_satisfied = True | |
condition = 'tolfrel' | |
if not is_satisfied and self.tolf_checker.check_absolute(self.tolf): | |
is_satisfied = True | |
condition = 'tolf' | |
if not is_satisfied and self.tolf_checker.check_flatarea(): | |
is_satisfied = True | |
condition = 'flatarea' | |
if not is_satisfied and std < self.minstd: | |
is_satisfied = True | |
condition = 'minstd' | |
if not is_satisfied and std < self.minstd * np.median( | |
np.abs(self.xmean)): | |
is_satisfied = True | |
condition = 'minstdrel' | |
if not is_satisfied and np.any( | |
1 + self.S[:self.k_active] > self.maxconds): | |
is_satisfied = True | |
condition = 'maxconds' | |
if not is_satisfied and self.D.max() / self.D.min() > self.maxcondd: | |
is_satisfied = True | |
condition = 'maxcondd' | |
return is_satisfied, condition | |
class ExponentialMovingAverage(object): | |
"""Exponential Moving Average, Variance, and SNR (Signal-to-Noise Ratio) | |
See http://www-uxsup.csx.cam.ac.uk/~fanf2/hermes/doc/antiforgery/stats.pdf | |
""" | |
def __init__(self, decay, dim, flg_init_with_data=False): | |
""" | |
The latest N steps occupy approximately 86% of the information when | |
decay = 2 / (N - 1). | |
""" | |
self.decay = decay | |
self.M = np.zeros(dim) # Mean Estimate | |
self.S = np.zeros(dim) # Variance Estimate | |
self.flg_init = -flg_init_with_data | |
def update(self, datum): | |
a = self.decay if self.flg_init else 1. | |
self.S += a * ((1 - a) * (datum - self.M)**2 - self.S) | |
self.M += a * (datum - self.M) | |
class TolfChecker(object): | |
def __init__(self, size=20): | |
""" | |
Parameters | |
---------- | |
size : int | |
number of points for which the value is restored | |
""" | |
self._min_hist = np.empty(size) * np.nan | |
self._l_quartile_hist = np.empty(size) * np.nan | |
# self._median_hist = np.empty(size) * np.nan | |
self._u_quartile_hist = np.empty(size) * np.nan | |
# self._max_hist = np.empty(size) * np.nan | |
# self._pop_hist = np.empty(size) * np.nan | |
self._next_position = 0 | |
def update(self, arf): | |
self._min_hist[self._next_position] = np.nanmin(arf) | |
self._l_quartile_hist[self._next_position] = np.nanpercentile(arf, 25) | |
# self._median_hist[self._next_position] = np.nanmedian(arf) | |
self._u_quartile_hist[self._next_position] = np.nanpercentile(arf, 75) | |
# self._max_hist[self._next_position] = np.nanmax(arf) | |
self._next_position = ( | |
self._next_position + 1) % self._min_hist.shape[0] | |
def check(self, tolfun=1e-9): | |
# alias to check_absolute | |
return self.check_relative(tolfun) | |
def check_relative(self, tolfun=1e-9): | |
iqr = np.nanmedian(self._u_quartile_hist - self._l_quartile_hist) | |
return iqr < tolfun * np.abs(np.nanmedian(self._min_hist)) | |
def check_absolute(self, tolfun=1e-9): | |
iqr = np.nanmedian(self._u_quartile_hist - self._l_quartile_hist) | |
return iqr < tolfun | |
def check_flatarea(self): | |
return np.nanmedian(self._l_quartile_hist - self._min_hist) == 0 | |
if __name__ == '__main__': | |
def diag_cigar(x): | |
return 1e6 * np.inner(x, x) + (1. - 1e6) * np.sum(x)**2 / len(x) | |
def ellipsoid(x): | |
return np.dot( | |
np.logspace(0, 6, num=len(x), base=10, endpoint=True), x * x) | |
N = 20 | |
fobj = diag_cigar | |
xmean0 = 3. + 2. * randn(N) | |
sigma0 = 2. | |
# Optional Parameters | |
esoption = dict() | |
esoption['lam'] = int(4 + 3 * log(N)) | |
esoption['ds'] = 4 - 3 / N # sqrt(N) in PPSN | |
# Termination Condition | |
tcoption = dict() | |
tcoption['ftarget'] = 1e-20 | |
tcoption['maxeval'] = int(5e3 * N * esoption['lam']) | |
tcoption['tolf'] = 1e-12 | |
tcoption['tolfrel'] = 1e-12 | |
tcoption['minstd'] = 1e-12 | |
tcoption['minstdrel'] = 1e-12 | |
tcoption['maxconds'] = 1e12 | |
tcoption['maxcondd'] = 1e6 | |
# k-adaptation | |
kaoption = dict() | |
kaoption['kmin'] = 0 | |
kaoption['kmax'] = N - 1 | |
kaoption['k_init'] = kaoption['kmin'] | |
kaoption['k_inc_cond'] = 30.0 | |
kaoption['k_dec_cond'] = kaoption['k_inc_cond'] | |
kaoption['k_adapt_factor'] = 1.414 | |
kaoption['factor_sigma_slope'] = 0.1 | |
kaoption['factor_diag_slope'] = 1.0 # 0.3 in PPSN | |
opts = dict() | |
opts.update(esoption) | |
opts.update(tcoption) | |
opts.update(kaoption) | |
n_restart = 10 | |
itr = 0 | |
for r in range(n_restart): | |
vkd = VkdCma(fobj, xmean0, sigma0, **opts) | |
satisfied = False | |
while not satisfied: | |
itr += 1 | |
vkd._onestep() | |
satisfied, condition = vkd._check() | |
if itr % 20 == 0: | |
print(itr, vkd.neval, vkd.arf.min(), vkd.sigma, vkd.k) | |
print(condition) | |
if condition == 'ftarget': | |
break | |
else: | |
opts['lam'] = opts['lam'] * 2 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment