wielandbrendel/brendel_bethge.py

## brendel_bethge.py
import logging
from foolbox.utils import onehot_like

from foolbox.attacks.base import Attack
from foolbox.attacks.base import generator_call_decorator

import numpy as np
from numba import jitclass

import line_profiler
import atexit
import os
import functools
import time

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# file handler for log files
filename = np.random.randint(10000000)
path = os.path.realpath(__file__)
path = path.split(".py")[0]
file_handler = logging.FileHandler(f"{path}_{filename}.log")
formatter = logging.Formatter("%(asctime)s : %(levelname)s : %(message)s")
file_handler.setFormatter(formatter)

# add file handler to logger
logger.addHandler(file_handler)

# output to stdout
# create console handler and set level to debug
ch = logging.StreamHandler()
ch.setLevel(logging.WARNING)

_epsilon = np.sqrt(np.finfo(float).eps)

profile = line_profiler.LineProfiler()
atexit.register(profile.print_stats)

EPS = 1e-10

norm = np.linalg.norm


class BrendelBethgeAttack(Attack):
    """ Base class for the Brendel & Bethge adversarial attack described in [1]_.

    References
    ----------
    .. [1] Wieland Brendel, Jonas Rauber, Matthias Kümmerer, Ivan Ustyuzhaninov, Matthias Bethge,
    "Accurate, reliable and fast robustness evaluation",
    https://arxiv.org/abs/1907.01003

    """

    @generator_call_decorator
    def __call__(
        self,
        input_or_adv,
        label=None,
        unpack=True,
        epsilon=None,
        overshoot=1.1,
        max_iterations=100,
        lr=1,
        momentum=0.8,
        starting_point=None,
    ):
        """The Brendel & Bethge adversarial attack.

        Parameters
        ----------
        input_or_adv : `numpy.ndarray` or :class:`Adversarial`
            The original, unperturbed input as a `numpy.ndarray` or
            an :class:`Adversarial` instance.
        label : int
            The reference label of the original input. Must be passed
            if `a` is a `numpy.ndarray`, must not be passed if `a` is
            an :class:`Adversarial` instance.
        unpack : bool
            If true, returns the adversarial input, otherwise returns
            the Adversarial object.
        max_iterations : int
            Number of steps for the optimization.
        epsilon : float or None, optional
            If not None the optimisation stops once it reaches a distance
            epsilon between original and adversarial image.
        lr : float
            Scaling factor for the trust region radius. Smaller values lead
            to smaller steps in each iteration which allows the attack to stay
            in closer vicinity to the decision boundary.
        overshoot: float, optional
            A small overshoot biases the attack to stay in the adversarial region
            (instead of sticking exactly to the decision boundary). Value should be
            slightly above 1.
        momentum: float, optional
            Momentum term that smoothes the normal vector of the decision boundary
            over several steps.
        """
        logger.info('Starting attack')
        a = input_or_adv
        del input_or_adv
        del label
        del unpack

        if not a.has_gradient():
            logger.fatal(
                "Applied gradient-based attack to model that "
                "does not provide gradients."
            )
            return

        min_, max_ = a.bounds()

        original_image = a.original_image
        original_label = a.original_class
        bounds = a.bounds()
        x0 = original_image.flatten()

        # make abortion condition available everywhere
        self.epsilon = epsilon

        # instantiate optimizer
        _optimizer = self.instantiate_optimizer()

        # test initial point
        logits, is_adv = yield from a.predictions(starting_point)

        # get suitable starting point
        x = yield from self._get_starting_point(
            a, bounds, original_image, starting_point=starting_point
        )

        if self.epsilon is not None:
            if a.distance.value <= epsilon:
                return None

        rate_normalization = np.prod(x.shape) * (max_ - min_)

        # stop if last three updates of adv_distance is not decreasing
        adv_distance_history = []
        num_of_nones = 0
        logger.info(f'Batch size is {x.shape[0]}')

        for i in range(max_iterations):
            logger.info(f"Starting iteration number {i}")
            # get logits and local boundary geometry
            try:
                x = np.clip(x, min_ + EPS, max_ - EPS)
                logits, is_adv = yield from a.predictions(x)
            except AssertionError:
                logger.exception("Assertion Error: ", x.min(), x.max(), min_, max_)
            logits_diff, _boundary = yield from self.normal_vector(a, x, logits)

            if _boundary.flatten().dot(_boundary.flatten()) < 1e-13:
                break

            # denoise estimate of boundary using a short history of the boundary
            if i == 0:
                boundary = _boundary
            else:
                boundary = (1 - momentum) * _boundary + momentum * boundary

            if boundary.flatten().dot(boundary.flatten()) < 1e-13:
                break

            logger.debug(
                f"Iteration {i}: logit = {logits[a.original_class]:.2}, logit-diff = {logits_diff:.2}, "
                f"current dist. = {float(a.normalized_distance(x).value):.4}, best distance = {float(a.distance.value):.4}"
            )

            # compute optimal step within trust region depending on metric
            original_shape = x.shape
            x = x.flatten()
            region = lr * rate_normalization

            # we aim to slight overshoot over the boundary to stay within the adversarial region
            corr_logits_diff = (
                overshoot * logits_diff
                if logits_diff < 0
                else (2 - overshoot) * logits_diff
            )

            # employ solver to find optimal step within trust region
            delta = _optimizer.solve(
                x0,
                x,
                boundary.flatten(),
                bounds[0],
                bounds[1],
                corr_logits_diff,
                region,
            )

            try:
                x += delta
            except TypeError:
                if num_of_nones < 5:
                    num_of_nones += 1
                else:
                    break

            # add step to current perturbation
            x = x.reshape(original_shape)
            x = np.clip(x, min_, max_)

            # early stopping if progress stalls
            if logits_diff > 0:
                adv_distance_history.append(a.distance.value)

            if len(adv_distance_history) > 4:
                past_distance = adv_distance_history[-4]
                if np.abs(a.distance.value - past_distance) / past_distance < 0.001:
                    logger.debug(
                        f"Progress is too slow. Finishing optimization after {i} steps."
                    )
                    if np.random.randint(10) == 5:
                        lr /= 2
                        logger.debug("new lr = ", lr)

                # sometimes distance_value is not really updated any more because the logit-diff is < 0
                if logits_diff < 0:
                    if (
                        np.abs(a.normalized_distance(x).value - past_distance)
                        / past_distance
                        < 0.001
                    ):
                        logger.debug(
                            f"Progress is too slow. Finishing optimization after {i} steps."
                        )
                        if np.random.randint(10) == 5:
                            lr /= 2
                            logger.debug("new lr = ", lr)

            if epsilon is not None:
                if a.distance.value <= epsilon:
                    break

            if lr < 1e-6:
                break

    def _initial_binary_search(self, x0, x1, a, max_iter=10, corridor=0.1):
        eps0, eps1 = 0, 1

        for i in range(max_iter):
            mid_point = self.mid_point(x0, x1, (eps0 + eps1) / 2, a.bounds())
            # mid_point = (x0 + x1) / 2
            logits, adv = yield from a.predictions(mid_point)

            # break condition
            l1, l2 = np.argsort(logits)[::-1][:2]
            sec_label = l1 if l1 != a.original_class else l2
            log1, log2 = logits[sec_label], logits[a.original_class]

            if abs(log1 - log2) / abs(log2) < corridor:
                break
            elif self.epsilon is not None:
                if a.distance.value <= self.epsilon:
                    break

            if adv:
                eps1 = (eps0 + eps1) / 2
            else:
                eps0 = (eps0 + eps1) / 2

        return mid_point

    def _get_starting_point(self, a, bounds, x0, corridor=0.1, starting_point=None):
        if starting_point is None:
            logger.debug(
                "No starting point is given. Falling back to random initialization, but other starting points "
                "like FGSM or DeepFool might be more query efficient and better."
            )
            starting_point = "random"

        if type(starting_point) == str:
            if starting_point == "random":
                logger.debug("Starting random search")
                # this could potentially be done better for different metrics
                x = np.random.uniform(bounds[0], bounds[1], size=x0.shape).astype(
                    np.float32
                )
                logits, adv = yield from a.predictions(x)
                while not adv:  # draw noise until image is adversarial
                    logger.debug("f", a.original_class, logits.argmax())
                    x = np.random.uniform(bounds[0], bounds[1], size=x0.shape).astype(
                        np.float32
                    )

                logger.debug("Perform binary search")
                x = yield from self._initial_binary_search(
                    x0, x, a, max_iter=10, corridor=corridor
                )
        else:
            if callable(starting_point):
                x = starting_point(a)
            else:
                x = starting_point
            x = yield from self._initial_binary_search(
                x0, x, a, max_iter=10, corridor=corridor
            )

        return x

    @classmethod
    def normal_vector(cls, a, x, logits):
        """Returns the loss and the gradient of the loss w.r.t. x,
        assuming that logits = model(x)."""

        targeted = a.target_class() is not None
        if targeted:
            c_minimize = cls.best_other_class(logits, a.target_class())
            c_maximize = a.target_class()
        else:
            c_minimize = a.original_class
            c_maximize = cls.best_other_class(logits, a.original_class)

        logits_diff = logits[c_minimize] - logits[c_maximize]

        # calculate the gradient of total_loss w.r.t. x
        logits_diff_grad = np.zeros_like(logits)
        logits_diff_grad[c_minimize] = 1
        logits_diff_grad[c_maximize] = -1
        is_adv_loss_grad = yield from a.backward(logits_diff_grad, x)

        return -logits_diff, is_adv_loss_grad

    @staticmethod
    def best_other_class(logits, exclude):
        """Returns the index of the largest logit, ignoring the class that
        is passed as `exclude`."""
        other_logits = logits - onehot_like(logits, exclude, value=np.inf)
        return np.argmax(other_logits)

    def instantiate_optimizer(self):
        raise NotImplementedError


class BrendelBethgeL2Attack(BrendelBethgeAttack):
    """ Brendel & Bethge adversarial attack described in [1]_ that minimizes the L2 distance.

    References
    ----------
    .. [1] Wieland Brendel, Jonas Rauber, Matthias Kümmerer, Ivan Ustyuzhaninov, Matthias Bethge,
    "Accurate, reliable and fast robustness evaluation",
    https://arxiv.org/abs/1907.01003

    """

    def instantiate_optimizer(self):
        return L2Optimizer()

    def mid_point(self, x0, x1, epsilon, bounds):
        # returns a point between x0 and x1 where
        # epsilon = 0 returns x0 and epsilon = 1
        # returns x1
        return epsilon * x1 + (1 - epsilon) * x0


class BrendelBethgeLinfAttack(BrendelBethgeAttack):
    """ Brendel & Bethge adversarial attack described in [1]_ that minimizes the L-infinity distance.

    References
    ----------
    .. [1] Wieland Brendel, Jonas Rauber, Matthias Kümmerer, Ivan Ustyuzhaninov, Matthias Bethge,
    "Accurate, reliable and fast robustness evaluation",
    https://arxiv.org/abs/1907.01003

    """

    def instantiate_optimizer(self):
        return LinfOptimizer()

    def mid_point(self, x0, x1, epsilon, bounds):
        # returns a point between x0 and x1 where
        # epsilon = 0 returns x0 and epsilon = 1
        delta = x1 - x0
        min_, max_ = bounds
        s = max_ - min_
        clipped_delta = np.clip(delta, -epsilon * s, epsilon * s)
        return x0 + clipped_delta


class BrendelBethgeL1Attack(BrendelBethgeAttack):
    """ Brendel & Bethge adversarial attack described in [1]_ that minimizes the L1 distance.

    References
    ----------
    .. [1] Wieland Brendel, Jonas Rauber, Matthias Kümmerer, Ivan Ustyuzhaninov, Matthias Bethge,
    "Accurate, reliable and fast robustness evaluation",
    https://arxiv.org/abs/1907.01003

    """

    def instantiate_optimizer(self):
        return L1Optimizer()

    def mid_point(self, x0, x1, epsilon, bounds):
        # returns a point between x0 and x1 where
        # epsilon = 0 returns x0 and epsilon = 1
        # returns x1
        threshold = (bounds[1] - bounds[0]) * (1 - epsilon)
        mask = np.abs(x1 - x0) > threshold
        new_x = x0.copy()
        new_x[mask] += (np.sign(x1 - x0) * (np.abs(x1 - x0) - threshold))[mask]
        return new_x


class BrendelBethgeL0Attack(BrendelBethgeAttack):
    """ Brendel & Bethge adversarial attack described in [1]_ that minimizes the L0 distance.

    References
    ----------
    .. [1] Wieland Brendel, Jonas Rauber, Matthias Kümmerer, Ivan Ustyuzhaninov, Matthias Bethge,
    "Accurate, reliable and fast robustness evaluation",
    https://arxiv.org/abs/1907.01003

    """

    def instantiate_optimizer(self):
        return L0Optimizer()

    def mid_point(self, x0, x1, epsilon, bounds):
        # returns a point between x0 and x1 where
        # epsilon = 0 returns x0 and epsilon = 1
        # returns x1
        threshold = (bounds[1] - bounds[0]) * epsilon
        mask = np.abs(x1 - x0) < threshold
        new_x = x0.copy()
        new_x[mask] = x1[mask]
        return new_x


@jitclass(spec=[])
class BFGSB(object):
    def __init__(self):
        pass

    def solve(
        self, fun_and_jac, q0, bounds, args, ftol=1e-10, pgtol=-1e-5, maxiter=None
    ):
        debug = False
        N = q0.shape[0]

        if maxiter is None:
            maxiter = N * 200

        l = bounds[:, 0]  # np.array([b[0] for b in bounds])
        u = bounds[:, 1]  # np.array([b[1] for b in bounds])

        func_calls = 0

        old_fval, gfk = fun_and_jac(q0, *args)
        func_calls += 1

        k = 0
        Hk = np.eye(N)

        # Sets the initial step guess to dx ~ 1
        qk = q0
        old_old_fval = old_fval + np.linalg.norm(gfk) / 2

        # gnorm = np.amax(np.abs(gfk))
        _gfk = gfk

        # Compare with implementation BFGS-B implementation
        # in https://github.com/andrewhooker/PopED/blob/master/R/bfgsb_min.R

        while k < maxiter:
            # check if projected gradient is still large enough
            pg_norm = 0
            for v in range(N):
                if _gfk[v] < 0:
                    gv = max(qk[v] - u[v], _gfk[v])
                else:
                    gv = min(qk[v] - l[v], _gfk[v])

                if pg_norm < np.abs(gv):
                    pg_norm = np.abs(gv)

            if pg_norm < pgtol:
                # logging.info("Stopping due to projected gradient criterium.")
                break

            # get cauchy point
            x_cp = self._cauchy_point(qk, l, u, _gfk.copy(), Hk)
            qk1 = self._subspace_min(qk, l, u, x_cp, _gfk.copy(), Hk)
            pk = qk1 - qk

            alpha_k, fc, gc, old_fval, old_old_fval, gfkp1, fnev = self._line_search_wolfe(
                fun_and_jac, qk, pk, _gfk, old_fval, old_old_fval, l, u, args
            )
            func_calls += fnev

            if alpha_k is None:
                # logging.info("Stopping due to alphak being None.")
                break

            if np.abs(old_fval - old_old_fval) <= (ftol + ftol * np.abs(old_fval)):
                # logging.info("Stopping due to function criterion.")
                break

            qkp1 = self._project(qk + alpha_k * pk, l, u)

            if gfkp1 is None:
                _, gfkp1 = fun_and_jac(qkp1, *args)

            sk = qkp1 - qk
            qk = qkp1

            yk = np.zeros_like(qk)
            for k3 in range(N):
                yk[k3] = gfkp1[k3] - _gfk[k3]

                if np.abs(yk[k3]) < 1e-4:
                    yk[k3] = -1e-4

            _gfk = gfkp1

            k += 1

            # update inverse Hessian matrix
            Hk_sk = Hk.dot(sk)

            sk_yk = 0
            sk_Hk_sk = 0
            for v in range(N):
                sk_yk += sk[v] * yk[v]
                sk_Hk_sk += sk[v] * Hk_sk[v]

            if np.abs(sk_yk) >= 1e-8:
                rhok = 1.0 / sk_yk
            else:
                rhok = 100000.0

            if np.abs(sk_Hk_sk) >= 1e-8:
                rsk_Hk_sk = 1.0 / sk_Hk_sk
            else:
                rsk_Hk_sk = 100000.0

            for v in range(N):
                for w in range(N):
                    # change this to the update formula of the Hessian (not inverse Hessian)
                    Hk[v, w] += yk[v] * yk[w] * rhok - Hk_sk[v] * Hk_sk[w] * rsk_Hk_s

        return qk

    def _cauchy_point(self, x, l, u, g, B):
        # finds the cauchy point for q(x)=x'Gx+x'd s$t. l<=x<=u
        # g=G*x+d #gradient of q(x)
        # converted from r-code: https://github.com/andrewhooker/PopED/blob/master/R/cauchy_point.R
        n = x.shape[0]
        t = np.zeros_like(x)
        d = np.zeros_like(x)

        for i in range(n):
            if g[i] < 0:
                t[i] = (x[i] - u[i]) / g[i]
            elif g[i] > 0:
                t[i] = (x[i] - l[i]) / g[i]
            elif g[i] == 0:
                t[i] = np.inf

            if t[i] == 0:
                d[i] = 0
            else:
                d[i] = -g[i]

        ts = t.copy()
        ts = ts[ts != 0]
        ts = np.sort(ts)

        df = g.dot(d)
        d2f = d.dot(B.dot(d))

        if d2f < 1e-10:
            return x

        dt_min = -df / d2f
        t_old = 0
        i = 0
        z = np.zeros_like(x)

        while i < ts.shape[0] and dt_min >= (ts[i] - t_old):
            ind = ts[i] < t
            d[~ind] = 0
            z = z + (ts[i] - t_old) * d
            df = g.dot(d) + d.dot(B.dot(z))
            d2f = d.dot(B.dot(d))
            dt_min = df / (d2f + 1e-8)
            t_old = ts[i]
            i += 1

        dt_min = max(dt_min, 0)
        t_old = t_old + dt_min
        x_cp = x - t_old * g
        temp = x - t * g
        x_cp[t_old > t] = temp[t_old > t]

        return x_cp

    def _subspace_min(self, x, l, u, x_cp, d, G):
        # converted from r-code: https://github.com/andrewhooker/PopED/blob/master/R/subspace_min.R
        n = x.shape[0]
        Z = np.eye(n)
        fixed = (x_cp <= l + 1e-8) + (x_cp >= u - 1e8)

        if np.all(fixed):
            x = x_cp
            return x

        Z = Z[:, ~fixed]
        rgc = Z.T.dot(d + G.dot(x_cp - x))
        rB = Z.T.dot(G.dot(Z)) + 1e-10 * np.eye(Z.shape[1])
        d[~fixed] = np.linalg.solve(rB, rgc)
        d[~fixed] = -d[~fixed]
        alpha = 1
        temp1 = alpha

        for i in np.arange(n)[~fixed]:
            dk = d[i]
            if dk < 0:
                temp2 = l[i] - x_cp[i]
                if temp2 >= 0:
                    temp1 = 0
                else:
                    if dk * alpha < temp2:
                        temp1 = temp2 / dk
                    else:
                        temp2 = u[i] - x_cp[i]
            else:
                temp2 = u[i] - x_cp[i]
                if temp1 <= 0:
                    temp1 = 0
                else:
                    if dk * alpha > temp2:
                        temp1 = temp2 / dk

            alpha = min(temp1, alpha)

        return x_cp + alpha * Z.dot(d[~fixed])

    def _project(self, q, l, u):
        N = q.shape[0]
        for k in range(N):
            if q[k] < l[k]:
                q[k] = l[k]
            elif q[k] > u[k]:
                q[k] = u[k]

        return q

    def _line_search_armijo(
        self, fun_and_jac, pt, dpt, func_calls, m, gk, l, u, x0, x, b, min_, max_, c, r
    ):
        ls_rho = 0.6
        ls_c = 1e-4
        ls_alpha = 1

        t = m * ls_c

        for k2 in range(100):
            ls_pt = self._project(pt + ls_alpha * dpt, l, u)

            gkp1, dgkp1 = fun_and_jac(ls_pt, x0, x, b, min_, max_, c, r)
            func_calls += 1

            if gk - gkp1 >= ls_alpha * t:
                break
            else:
                ls_alpha *= ls_rho

        return ls_alpha, ls_pt, gkp1, dgkp1, func_calls

    def _line_search_wolfe(
        self, fun_and_jac, xk, pk, gfk, old_fval, old_old_fval, l, u, args
    ):
        """Find alpha that satisfies strong Wolfe conditions.
        Uses the line search algorithm to enforce strong Wolfe conditions
        Wright and Nocedal, 'Numerical Optimization', 1999, pg. 59-60
        For the zoom phase it uses an algorithm by
        Outputs: (alpha0, gc, fc)
        """
        c1 = 1e-4
        c2 = 0.9
        N = xk.shape[0]
        _ls_fc = 0
        _ls_ingfk = None

        alpha0 = 0
        phi0 = old_fval

        derphi0 = 0
        for v in range(N):
            derphi0 += gfk[v] * pk[v]

        if derphi0 == 0:
            derphi0 = 1e-8
        elif np.abs(derphi0) < 1e-8:
            derphi0 = np.sign(derphi0) * 1e-8

        alpha1 = min(1.0, 1.01 * 2 * (phi0 - old_old_fval) / derphi0)

        # print(
        #     "   (ls) initial alpha1: ",
        #     alpha1,
        #     phi0 - old_old_fval,
        #     phi0,
        #     old_old_fval,
        #     derphi0,
        # )

        if alpha1 == 0:
            # This shouldn't happen. Perhaps the increment has slipped below
            # machine precision?  For now, set the return variables skip the
            # useless while loop, and raise warnflag=2 due to possible imprecision.
            # print("Slipped below machine precision.")
            alpha_star = None
            fval_star = old_fval
            old_fval = old_old_fval
            fprime_star = None

        _xkp1 = self._project(xk + alpha1 * pk, l, u)
        phi_a1, _ls_ingfk = fun_and_jac(_xkp1, *args)
        _ls_fc += 1
        # derphi_a1 = phiprime(alpha1)  evaluated below

        phi_a0 = phi0
        derphi_a0 = derphi0

        i = 1
        maxiter = 10
        while 1:  # bracketing phase
            # print("   (ls) in while loop: ", alpha1, alpha0)
            if alpha1 == 0:
                break
            if (phi_a1 > phi0 + c1 * alpha1 * derphi0) or (
                (phi_a1 >= phi_a0) and (i > 1)
            ):
                # inlining zoom for performance reasons
                #                 alpha0, alpha1, phi_a0, phi_a1, derphi_a0, phi0, derphi0, pk, xk
                # zoom signature: (a_lo, a_hi, phi_lo, phi_hi, derphi_lo, phi0, derphi0, pk, xk)
                # INLINE START
                k = 0
                delta1 = 0.2  # cubic interpolant check
                delta2 = 0.1  # quadratic interpolant check
                phi_rec = phi0
                a_rec = 0
                a_hi = alpha1
                a_lo = alpha0
                phi_lo = phi_a0
                phi_hi = phi_a1
                derphi_lo = derphi_a0
                while 1:
                    # interpolate to find a trial step length between a_lo and a_hi
                    # Need to choose interpolation here.  Use cubic interpolation and then if the
                    #  result is within delta * dalpha or outside of the interval bounded by a_lo or a_hi
                    #  then use quadratic interpolation, if the result is still too close, then use bisection

                    dalpha = a_hi - a_lo
                    if dalpha < 0:
                        a, b = a_hi, a_lo
                    else:
                        a, b = a_lo, a_hi

                    # minimizer of cubic interpolant
                    #    (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
                    #      if the result is too close to the end points (or out of the interval)
                    #         then use quadratic interpolation with phi_lo, derphi_lo and phi_hi
                    #      if the result is stil too close to the end points (or out of the interval)
                    #         then use bisection

                    if k > 0:
                        cchk = delta1 * dalpha
                        a_j = self._cubicmin(
                            a_lo, phi_lo, derphi_lo, a_hi, phi_hi, a_rec, phi_rec
                        )
                    if (
                        (k == 0)
                        or (a_j is None)
                        or (a_j > b - cchk)
                        or (a_j < a + cchk)
                    ):
                        qchk = delta2 * dalpha
                        a_j = self._quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
                        if (a_j is None) or (a_j > b - qchk) or (a_j < a + qchk):
                            a_j = a_lo + 0.5 * dalpha
                    #                print "Using bisection."
                    #            else: print "Using quadratic."
                    #        else: print "Using cubic."

                    # Check new value of a_j
                    _xkp1 = self._project(xk + a_j * pk, l, u)
                    # if _xkp1[1] < 0:
                    #     _xkp1[1] = 0
                    phi_aj, _ls_ingfk = fun_and_jac(_xkp1, *args)

                    derphi_aj = 0
                    for v in range(N):
                        derphi_aj += _ls_ingfk[v] * pk[v]

                    if (phi_aj > phi0 + c1 * a_j * derphi0) or (phi_aj >= phi_lo):
                        phi_rec = phi_hi
                        a_rec = a_hi
                        a_hi = a_j
                        phi_hi = phi_aj
                    else:
                        if abs(derphi_aj) <= -c2 * derphi0:
                            a_star = a_j
                            val_star = phi_aj
                            valprime_star = _ls_ingfk
                            break
                        if derphi_aj * (a_hi - a_lo) >= 0:
                            phi_rec = phi_hi
                            a_rec = a_hi
                            a_hi = a_lo
                            phi_hi = phi_lo
                        else:
                            phi_rec = phi_lo
                            a_rec = a_lo
                        a_lo = a_j
                        phi_lo = phi_aj
                        derphi_lo = derphi_aj
                    k += 1
                    if k > maxiter:
                        a_star = a_j
                        val_star = phi_aj
                        valprime_star = None
                        break

                alpha_star = a_star
                fval_star = val_star
                fprime_star = valprime_star
                fnev = k
                ## INLINE END

                _ls_fc += fnev
                break

            i += 1
            if i > maxiter:
                break

            _xkp1 = self._project(xk + alpha1 * pk, l, u)
            _, _ls_ingfk = fun_and_jac(_xkp1, *args)
            derphi_a1 = 0
            for v in range(N):
                derphi_a1 += _ls_ingfk[v] * pk[v]
            _ls_fc += 1
            if abs(derphi_a1) <= -c2 * derphi0:
                alpha_star = alpha1
                fval_star = phi_a1
                fprime_star = _ls_ingfk
                break

            if derphi_a1 >= 0:
                # alpha_star, fval_star, fprime_star, fnev, _ls_ingfk = _zoom(
                #     alpha1, alpha0, phi_a1, phi_a0, derphi_a1, phi0, derphi0, pk, xk
                # )
                #
                # INLINE START
                maxiter = 10
                k = 0
                delta1 = 0.2  # cubic interpolant check
                delta2 = 0.1  # quadratic interpolant check
                phi_rec = phi0
                a_rec = 0
                a_hi = alpha0
                a_lo = alpha1
                phi_lo = phi_a1
                phi_hi = phi_a0
                derphi_lo = derphi_a1
                while 1:
                    # interpolate to find a trial step length between a_lo and a_hi
                    # Need to choose interpolation here.  Use cubic interpolation and then if the
                    #  result is within delta * dalpha or outside of the interval bounded by a_lo or a_hi
                    #  then use quadratic interpolation, if the result is still too close, then use bisection

                    dalpha = a_hi - a_lo
                    if dalpha < 0:
                        a, b = a_hi, a_lo
                    else:
                        a, b = a_lo, a_hi

                    # minimizer of cubic interpolant
                    #    (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
                    #      if the result is too close to the end points (or out of the interval)
                    #         then use quadratic interpolation with phi_lo, derphi_lo and phi_hi
                    #      if the result is stil too close to the end points (or out of the interval)
                    #         then use bisection

                    if k > 0:
                        cchk = delta1 * dalpha
                        a_j = self._cubicmin(
                            a_lo, phi_lo, derphi_lo, a_hi, phi_hi, a_rec, phi_rec
                        )
                    if (
                        (k == 0)
                        or (a_j is None)
                        or (a_j > b - cchk)
                        or (a_j < a + cchk)
                    ):
                        qchk = delta2 * dalpha
                        a_j = self._quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
                        if (a_j is None) or (a_j > b - qchk) or (a_j < a + qchk):
                            a_j = a_lo + 0.5 * dalpha
                    #                print "Using bisection."
                    #            else: print "Using quadratic."
                    #        else: print "Using cubic."

                    # Check new value of a_j
                    _xkp1 = self._project(xk + a_j * pk, l, u)
                    phi_aj, _ls_ingfk = fun_and_jac(_xkp1, *args)
                    # print("call #3: ", phi_aj, _xkp1)
                    derphi_aj = 0
                    for v in range(N):
                        derphi_aj += _ls_ingfk[v] * pk[v]
                    if (phi_aj > phi0 + c1 * a_j * derphi0) or (phi_aj >= phi_lo):
                        phi_rec = phi_hi
                        a_rec = a_hi
                        a_hi = a_j
                        phi_hi = phi_aj
                    else:
                        if abs(derphi_aj) <= -c2 * derphi0:
                            a_star = a_j
                            val_star = phi_aj
                            valprime_star = _ls_ingfk
                            break
                        if derphi_aj * (a_hi - a_lo) >= 0:
                            phi_rec = phi_hi
                            a_rec = a_hi
                            a_hi = a_lo
                            phi_hi = phi_lo
                        else:
                            phi_rec = phi_lo
                            a_rec = a_lo
                        a_lo = a_j
                        phi_lo = phi_aj
                        derphi_lo = derphi_aj
                    k += 1
                    if k > maxiter:
                        a_star = a_j
                        val_star = phi_aj
                        valprime_star = None
                        break

                alpha_star = a_star
                fval_star = val_star
                fprime_star = valprime_star
                fnev = k
                ## INLINE END

                _ls_fc += fnev
                break

            alpha2 = 2 * alpha1  # increase by factor of two on each iteration
            i = i + 1
            alpha0 = alpha1
            alpha1 = alpha2
            phi_a0 = phi_a1
            _xkp1 = self._project(xk + alpha1 * pk, l, u)
            phi_a1, _ls_ingfk = fun_and_jac(_xkp1, *args)
            # print("call #4: ", phi_a1, _xkp1)
            _ls_fc += 1
            derphi_a0 = derphi_a1

            # stopping test if lower function not found
            if i > maxiter:
                alpha_star = alpha1
                fval_star = phi_a1
                fprime_star = None
                break

        return alpha_star, _ls_fc, _ls_fc, fval_star, old_fval, fprime_star, _ls_fc

    def _cubicmin(self, a, fa, fpa, b, fb, c, fc):
        # finds the minimizer for a cubic polynomial that goes through the
        #  points (a,fa), (b,fb), and (c,fc) with derivative at a of fpa.
        #
        # if no minimizer can be found return None
        #
        # f(x) = A *(x-a)^3 + B*(x-a)^2 + C*(x-a) + D

        C = fpa
        D = fa
        db = b - a
        dc = c - a
        if (db == 0) or (dc == 0) or (b == c):
            return None
        denom = (db * dc) ** 2 * (db - dc)
        A = dc ** 2 * (fb - fa - C * db) - db ** 2 * (fc - fa - C * dc)
        B = -dc ** 3 * (fb - fa - C * db) + db ** 3 * (fc - fa - C * dc)

        A /= denom
        B /= denom
        radical = B * B - 3 * A * C
        if radical < 0:
            return None
        if A == 0:
            return None
        xmin = a + (-B + np.sqrt(radical)) / (3 * A)
        return xmin

    def _quadmin(self, a, fa, fpa, b, fb):
        # finds the minimizer for a quadratic polynomial that goes through
        #  the points (a,fa), (b,fb) with derivative at a of fpa
        # f(x) = B*(x-a)^2 + C*(x-a) + D
        D = fa
        C = fpa
        db = b - a * 1.0
        if db == 0:
            return None
        B = (fb - D - C * db) / (db * db)
        if B <= 0:
            return None
        xmin = a - C / (2.0 * B)
        return xmin


spec = [("bfgsb", BFGSB.class_type.instance_type)]


class Optimizer(object):
    """ Base class for the trust-region optimization. If feasible, this optimizer solves the problem

        min_delta distance(x0, x + delta) s.t. ||delta||_2 <= r AND delta^T b = c AND min_ <= x + delta <= max_

        where x0 is the original sample, x is the current optimisation state, r is the trust-region radius,
        b is the current estimate of the normal vector of the decision boundary, c is the estimated distance of x
        to the trust region and [min_, max_] are the value constraints of the input. The function distance(.,.)
        is the distance measure to be optimised (e.g. L2, L1, L0).

    """

    def __init__(self):
        self.bfgsb = BFGSB()  # a box-constrained BFGS solver

    def solve(self, x0, x, b, min_, max_, c, r):
        x0, x, b = x0.astype(np.float64), x.astype(np.float64), b.astype(np.float64)
        cmax, cmaxnorm = self._max_logit_diff(x, b, min_, max_, c)

        if np.abs(cmax) < np.abs(c):
            # problem not solvable (boundary cannot be reached)
            if np.sqrt(cmaxnorm) < r:
                # make largest possible step towards boundary while staying within bounds
                _delta = self.optimize_boundary_s_t_trustregion(
                    x0, x, b, min_, max_, c, r
                )
            else:
                # make largest possible step towards boundary while staying within trust region
                _delta = self.optimize_boundary_s_t_trustregion(
                    x0, x, b, min_, max_, c, r
                )
        else:
            if cmaxnorm < r:
                # problem is solvable
                # proceed with standard optimization
                _delta = self.optimize_distance_s_t_boundary_and_trustregion(
                    x0, x, b, min_, max_, c, r
                )
            else:
                # problem might not be solvable
                bnorm = np.linalg.norm(b)
                minnorm = self._minimum_norm_to_boundary(x, b, min_, max_, c, bnorm)

                if minnorm <= r:
                    # problem is solvable, proceed with standard optimization
                    _delta = self.optimize_distance_s_t_boundary_and_trustregion(
                        x0, x, b, min_, max_, c, r
                    )
                else:
                    # problem not solvable (boundary cannot be reached)
                    # make largest step towards boundary within trust region
                    _delta = self.optimize_boundary_s_t_trustregion(
                        x0, x, b, min_, max_, c, r
                    )

        return _delta

    def _max_logit_diff(self, x, b, _ell, _u, c):
        """ Tests whether the (estimated) boundary can be reached within trust region. """
        N = x.shape[0]
        cmax = 0.0
        norm = 0.0

        if c > 0:
            for n in range(N):
                if b[n] > 0:
                    cmax += b[n] * (_u - x[n])
                    norm += (_u - x[n]) ** 2
                else:
                    cmax += b[n] * (_ell - x[n])
                    norm += (x[n] - _ell) ** 2
        else:
            for n in range(N):
                if b[n] > 0:
                    cmax += b[n] * (_ell - x[n])
                    norm += (x[n] - _ell) ** 2
                else:
                    cmax += b[n] * (_u - x[n])
                    norm += (_u - x[n]) ** 2

        return cmax, np.sqrt(norm)

    def _minimum_norm_to_boundary(self, x, b, _ell, _u, c, bnorm):
        """ Computes the minimum norm necessary to reach the boundary. More precisely, we aim to solve the
            following optimization problem

                min ||delta||_2^2 s.t. lower <= x + delta <= upper AND b.dot(delta) = c

            Lets forget about the box constraints for a second, i.e.

                min ||delta||_2^2 s.t. b.dot(delta) = c

            The dual of this problem is quite straight-forward to solve,

                g(lambda, delta) = ||delta||_2^2 + lambda * (c - b.dot(delta))

            The minimum of this Lagrangian is delta^* = lambda * b / 2, and so

                inf_delta g(lambda, delta) = lambda^2 / 4 ||b||_2^2 + lambda * c

            and so the optimal lambda, which maximizes inf_delta g(lambda, delta), is given by

                lambda^* = 2c / ||b||_2^2

            which in turn yields the optimal delta:

                delta^* = c * b / ||b||_2^2

            To take into account the box-constraints we perform a binary search over lambda and apply the box
            constraint in each step.
        """
        N = x.shape[0]

        lambda_lower = 2 * c / bnorm ** 2
        lambda_upper = (
            np.sign(c) * np.inf
        )  # optimal initial point (if box-constraints are neglected)
        _lambda = lambda_lower
        k = 0

        # perform a binary search over lambda
        while True:
            # compute _c = b.dot([- _lambda * b / 2]_clip)
            k += 1
            _c = 0
            norm = 0

            if c > 0:
                for n in range(N):
                    lam_step = _lambda * b[n] / 2
                    if b[n] > 0:
                        max_step = _u - x[n]
                        delta_step = min(max_step, lam_step)
                        _c += b[n] * delta_step
                        norm += delta_step ** 2
                    else:
                        max_step = _ell - x[n]
                        delta_step = max(max_step, lam_step)
                        _c += b[n] * delta_step
                        norm += delta_step ** 2
            else:
                for n in range(N):
                    lam_step = _lambda * b[n] / 2
                    if b[n] > 0:
                        max_step = _ell - x[n]
                        delta_step = max(max_step, lam_step)
                        _c += b[n] * delta_step
                        norm += delta_step ** 2
                    else:
                        max_step = _u - x[n]
                        delta_step = min(max_step, lam_step)
                        _c += b[n] * delta_step
                        norm += delta_step ** 2

            # adjust lambda
            if np.abs(_c) < np.abs(c):
                # increase absolute value of lambda
                if np.isinf(lambda_upper):
                    _lambda *= 2
                else:
                    lambda_lower = _lambda
                    _lambda = (lambda_upper - lambda_lower) / 2 + lambda_lower
            else:
                # decrease lambda
                lambda_upper = _lambda
                _lambda = (lambda_upper - lambda_lower) / 2 + lambda_lower

            # stopping condition
            if 0.999 * np.abs(c) - EPS < np.abs(_c) < 1.001 * np.abs(c) + EPS:
                break

        return np.sqrt(norm)

    def optimize_distance_s_t_boundary_and_trustregion(
        self, x0, x, b, min_, max_, c, r
    ):
        """ Find the solution to the optimization problem

            min_delta ||dx - delta||_p^p s.t. ||delta||_2^2 <= r^2 AND b^T delta = c AND min_ <= x + delta <= max_
        """
        params0 = np.array([0.0, 0.0])
        bounds = np.array([(-np.inf, np.inf), (0, np.inf)])
        args = (x0, x, b, min_, max_, c, r)

        qk = self.bfgsb.solve(self.fun_and_jac, params0, bounds, args)
        return self._get_final_delta(
            qk[0], qk[1], x0, x, b, min_, max_, c, r, touchup=True
        )

    def optimize_boundary_s_t_trustregion_fun_and_jac(
        self, params, x0, x, b, min_, max_, c, r
    ):
        N = x0.shape[0]
        s = -np.sign(c)
        _mu = params[0]
        t = 1 / (2 * _mu + EPS)

        g = -_mu * r ** 2
        grad_mu = -r ** 2

        for n in range(N):
            d = -s * b[n] * t

            if d < min_ - x[n]:
                d = min_ - x[n]
            elif d > max_ - x[n]:
                d = max_ - x[n]
            else:
                grad_mu += (b[n] + 2 * _mu * d) * (b[n] / (2 * _mu ** 2 + EPS))

            grad_mu += d ** 2
            g += (b[n] + _mu * d) * d

        return -g, -np.array([grad_mu])

    def safe_div(self, nominator, denominator):
        if np.abs(denominator) > EPS:
            return nominator / denominator
        elif denominator >= 0:
            return nominator / EPS
        else:
            return -nominator / EPS

    def optimize_boundary_s_t_trustregion(self, x0, x, b, min_, max_, c, r):
        """ Find the solution to the optimization problem

            min_delta sign(c) b^T delta s.t. ||delta||_2^2 <= r^2 AND min_ <= x + delta <= max_

            Note: this optimization problem is independent of the Lp norm being optimized.

            Lagrangian: g(delta) = sign(c) b^T delta + mu * (||delta||_2^2 - r^2)
            Optimal delta: delta = - sign(c) * b / (2 * mu)
        """
        params0 = np.array([1.0])
        args = (x0, x, b, min_, max_, c, r)
        bounds = np.array([(0, np.inf)])

        qk = self.bfgsb.solve(
            self.optimize_boundary_s_t_trustregion_fun_and_jac, params0, bounds, args
        )

        _delta = self.safe_div(-b, 2 * qk[0])

        for n in range(x0.shape[0]):
            if _delta[n] < min_ - x[n]:
                _delta[n] = min_ - x[n]
            elif _delta[n] > max_ - x[n]:
                _delta[n] = max_ - x[n]

        return _delta


@jitclass(spec=spec)
class L2Optimizer(Optimizer):
    def optimize_distance_s_t_boundary_and_trustregion(
        self, x0, x, b, min_, max_, c, r
    ):
        """ Solves the L2 trust region problem

        min ||x0 - x - delta||_2 s.t. b^top delta = c
                                    & ell <= x + delta <= u
                                    & ||delta||_2 <= r

        This is a specialised solver that does not use the generic BFGS-B solver.
        Instead, this active-set solver computes the active set of indices (those that
        do not hit the bounds) and then computes that optimal step size in the direction
        of the boundary and the direction of the original sample over the active indices.

        Parameters
        ----------
        x0 : `numpy.ndarray`
            The original image against which we minimize the perturbation
            (flattened).
        x : `numpy.ndarray`
            The current perturbation (flattened).
        b : `numpy.ndarray`
            Normal vector of the local decision boundary (flattened).
        min_ : float
            Lower bound on the pixel values.
        max_ : float
            Upper bound on the pixel values.
        c : float
            Logit difference between the ground truth class of x0 and the
            leading class different from the ground truth.
        r : float
            Size of the trust region.
        """
        N = x0.shape[0]
        clamp_c = 0
        clamp_norm = 0
        ck = c
        rk = r
        masked_values = 0

        mask = np.zeros(N, dtype=np.uint8)
        delta = np.empty_like(x0)
        dx = x0 - x

        for k in range(20):
            # inner optimization that solves subproblem
            bnorm = 1e-8
            bdotDx = 0

            for i in range(N):
                if mask[i] == 0:
                    bnorm += b[i] * b[i]
                    bdotDx += b[i] * dx[i]

            bdotDx = bdotDx / bnorm
            ck_bnorm = ck / bnorm
            b_scale = -bdotDx + ck / bnorm
            new_masked_values = 0
            delta_norm = 0
            descent_norm = 0
            boundary_step_norm = 0

            # make optimal step towards boundary AND minimum
            for i in range(N):
                if mask[i] == 0:
                    delta[i] = dx[i] + b[i] * b_scale
                    boundary_step_norm = (
                        boundary_step_norm + b[i] * ck_bnorm * b[i] * ck_bnorm
                    )
                    delta_norm = delta_norm + delta[i] * delta[i]
                    descent_norm = descent_norm + (dx[i] - b[i] * bdotDx) * (
                        dx[i] - b[i] * bdotDx
                    )

            # check of step to boundary is already larger than trust region
            if boundary_step_norm > rk * rk:
                for i in range(N):
                    if mask[i] == 0:
                        delta[i] = b[i] * ck_bnorm
            else:
                # check if combined step to large and correct step to minimum if necessary
                if delta_norm > rk * rk:
                    region_correct = np.sqrt(rk * rk - boundary_step_norm)
                    region_correct = region_correct / (np.sqrt(descent_norm) + 1e-8)
                    b_scale = -region_correct * bdotDx + ck / bnorm

                    for i in range(N):
                        if mask[i] == 0:
                            delta[i] = region_correct * dx[i] + b[i] * b_scale

            for i in range(N):
                if mask[i] == 0:
                    if x[i] + delta[i] <= min_:
                        mask[i] = 1
                        delta[i] = min_ - x[i]
                        new_masked_values = new_masked_values + 1
                        clamp_norm = clamp_norm + delta[i] * delta[i]
                        clamp_c = clamp_c + b[i] * delta[i]

                    if x[i] + delta[i] >= max_:
                        mask[i] = 1
                        delta[i] = max_ - x[i]
                        new_masked_values = new_masked_values + 1
                        clamp_norm = clamp_norm + delta[i] * delta[i]
                        clamp_c = clamp_c + b[i] * delta[i]

            # should no additional variable get out of bounds, stop optimization
            if new_masked_values == 0:
                break

            masked_values = masked_values + new_masked_values

            if clamp_norm < r * r:
                rk = np.sqrt(r * r - clamp_norm)
            else:
                rk = 0

            ck = c - clamp_c

            if masked_values == N:
                break

        return delta

    def fun_and_jac(self, params, x0, x, b, min_, max_, c, r):
        # we need to compute the loss function
        # g = distance + mu * (norm_d - r ** 2) + lam * (b_dot_d - c)
        # and its derivative d g / d lam and d g / d mu
        lam, mu = params

        N = x0.shape[0]

        g = 0
        d_g_d_lam = 0
        d_g_d_mu = 0

        distance = 0
        b_dot_d = 0
        d_norm = 0

        t = 1 / (2 * mu + 2)

        for n in range(N):
            dx = x0[n] - x[n]
            bn = b[n]
            xn = x[n]

            d = (2 * dx - lam * bn) * t

            if d + xn > max_:
                d = max_ - xn
            elif d + xn < min_:
                d = min_ - xn
            else:
                prefac = 2 * (d - dx) + 2 * mu * d + lam * bn
                d_g_d_lam -= prefac * bn * t
                d_g_d_mu -= prefac * 2 * d * t

            distance += (d - dx) ** 2
            b_dot_d += bn * d
            d_norm += d ** 2

            g += (dx - d) ** 2 + mu * d ** 2 + lam * bn * d
            d_g_d_lam += bn * d
            d_g_d_mu += d ** 2

        g += -mu * r ** 2 - lam * c
        d_g_d_lam -= c
        d_g_d_mu -= r ** 2

        return -g, -np.array([d_g_d_lam, d_g_d_mu])

    def _get_final_delta(self, lam, mu, x0, x, b, min_, max_, c, r, touchup=True):
        delta = np.empty_like(x0)
        N = x0.shape[0]

        t = 1 / (2 * mu + 2)

        for n in range(N):
            d = (2 * (x0[n] - x[n]) - lam * b[n]) * t

            if d + x[n] > max_:
                d = max_ - x[n]
            elif d + x[n] < min_:
                d = min_ - x[n]

            delta[n] = d

        return delta

    def _distance(self, x0, x):
        return np.linalg.norm(x0 - x) ** 2


@jitclass(spec=spec)
class L1Optimizer(Optimizer):
    def fun_and_jac(self, params, x0, x, b, min_, max_, c, r):
        lam, mu = params
        # arg min_delta ||delta - dx||_1 + lam * b^T delta + mu * ||delta||_2^2  s.t.  min <= delta + x <= max
        N = x0.shape[0]

        g = 0
        d_g_d_lam = 0
        d_g_d_mu = 0

        if mu > 0:
            for n in range(N):
                dx = x0[n] - x[n]
                bn = b[n]
                t = 1 / (2 * mu)
                u = -lam * bn * t - dx

                if np.abs(u) - t < 0:
                    # value and grad = 0
                    d = dx
                else:
                    d = np.sign(u) * (np.abs(u) - t) + dx

                    if d + x[n] < min_:
                        d = min_ - x[n]
                    elif d + x[n] > max_:
                        d = max_ - x[n]
                    else:
                        prefac = np.sign(d - dx) + 2 * mu * d + lam * bn
                        d_g_d_lam -= prefac * bn * t
                        d_g_d_mu -= prefac * 2 * d * t

                g += np.abs(dx - d) + mu * d ** 2 + lam * bn * d
                d_g_d_lam += bn * d
                d_g_d_mu += d ** 2
        else:  # mu == 0
            for n in range(N):
                dx = x0[n] - x[n]
                bn = b[n]
                if np.abs(lam * bn) < 1:
                    d = dx
                elif np.sign(lam * bn) < 0:
                    d = max_ - x[n]
                else:
                    d = min_ - x[n]

                g += np.abs(dx - d) + mu * d ** 2 + lam * bn * d
                d_g_d_lam += bn * d
                d_g_d_mu += d ** 2

        g += -mu * r ** 2 - lam * c
        d_g_d_lam -= c
        d_g_d_mu -= r ** 2

        return -g, -np.array([d_g_d_lam, d_g_d_mu])

    def _get_final_delta(self, lam, mu, x0, x, b, min_, max_, c, r, touchup=True):
        delta = np.empty_like(x0)
        N = x0.shape[0]

        b_dot_d = 0
        norm_d = 0
        distance = 0

        if mu > 0:
            for n in range(N):
                dx = x0[n] - x[n]
                bn = b[n]
                t = 1 / (2 * mu)
                u = -lam * bn * t - dx

                if np.abs(u) - t < 0:
                    # value and grad = 0
                    d = dx
                else:
                    d = np.sign(u) * (np.abs(u) - t) + dx

                    if d + x[n] < min_:
                        # grad = 0
                        d = min_ - x[n]
                    elif d + x[n] > max_:
                        # grad = 0
                        d = max_ - x[n]

                delta[n] = d
                b_dot_d += b[n] * d
                norm_d += d ** 2
                distance += np.abs(d - dx)
        else:  # mu == 0
            for n in range(N):
                dx = x0[n] - x[n]
                bn = b[n]
                if np.abs(lam * bn) < 1:
                    d = dx
                elif np.sign(lam * bn) < 0:
                    d = max_ - x[n]
                else:
                    d = min_ - x[n]

                delta[n] = d
                b_dot_d += b[n] * d
                norm_d += d ** 2
                distance += np.abs(d - dx)

        if touchup:
            # search for the one index that (a) we can modify to match boundary constraint, (b) stays within our
            # trust region and (c) minimize the distance to the original image
            dc = c - b_dot_d
            k = 0
            min_distance = np.inf
            min_distance_idx = 0
            for n in range(N):
                if np.abs(b[n]) > 0:
                    dx = x0[n] - x[n]
                    old_d = delta[n]
                    new_d = old_d + dc / b[n]

                    if (
                        x[n] + new_d <= max_
                        and x[n] + new_d >= min_
                        and norm_d - old_d ** 2 + new_d ** 2 <= r ** 2
                    ):
                        # conditions (a) and (b) are fulfilled
                        if k == 0:
                            min_distance = (
                                distance - np.abs(old_d - dx) + np.abs(new_d - dx)
                            )
                            min_distance_idx = n
                            k += 1
                        else:
                            new_distance = (
                                distance - np.abs(old_d - dx) + np.abs(new_d - dx)
                            )
                            if min_distance > new_distance:
                                min_distance = new_distance
                                min_distance_idx = n

            if k > 0:
                # touchup successful
                idx = min_distance_idx
                old_d = delta[idx]

                new_d = old_d + dc / b[idx]
                delta[idx] = new_d

        return delta

    def _distance(self, x0, x):
        return np.abs(x0 - x).sum()


@jitclass(spec=spec)
class LinfOptimizer(Optimizer):
    def optimize_distance_s_t_boundary_and_trustregion(
        self, x0, x, b, min_, max_, c, r
    ):
        """ Find the solution to the optimization problem

            min_delta ||dx - delta||_p^p s.t. ||delta||_2^2 <= r^2 AND b^T delta = c AND min_ <= x + delta <= max_
        """
        params0 = np.array([0.0, 0.0])
        bounds = np.array([(-np.inf, np.inf), (0, np.inf)])

        return self.binary_search(params0, bounds, x0, x, b, min_, max_, c, r)

    def binary_search(
        self, q0, bounds, x0, x, b, min_, max_, c, r, etol=1e-6, maxiter=1000
    ):
        # perform binary search over epsilon
        epsilon = (max_ - min_) / 2.0
        eps_low = min_
        eps_high = max_
        func_calls = 0

        bnorm = np.linalg.norm(b)
        lambda0 = 2 * c / bnorm ** 2

        k = 0

        while eps_high - eps_low > etol:
            fun, nfev, _lambda0 = self.fun(
                epsilon, x0, x, b, min_, max_, c, r, lambda0=lambda0
            )
            func_calls += nfev
            if fun > -np.inf:
                # decrease epsilon
                eps_high = epsilon
                lambda0 = _lambda0
            else:
                # increase epsilon
                eps_low = epsilon

            k += 1
            epsilon = (eps_high - eps_low) / 2.0 + eps_low
            # print(k, func_calls, epsilon, eps_high, eps_low, fun)

            if k > 20:
                break

        delta = self._get_final_delta(
            lambda0, eps_high, x0, x, b, min_, max_, c, r, touchup=True
        )
        return delta

    def _Linf_bounds(self, x0, epsilon, ell, u):
        N = x0.shape[0]
        _ell = np.empty_like(x0)
        _u = np.empty_like(x0)
        for i in range(N):
            nx, px = x0[i] - epsilon, x0[i] + epsilon
            if nx > ell:
                _ell[i] = nx
            else:
                _ell[i] = ell

            if px < u:
                _u[i] = px
            else:
                _u[i] = u

        return _ell, _u

    def fun(self, epsilon, x0, x, b, ell, u, c, r, lambda0=None):
        """ Computes the minimum norm necessary to reach the boundary. More precisely, we aim to solve the
            following optimization problem

                min ||delta||_2^2 s.t. lower <= x + delta <= upper AND b.dot(delta) = c

            Lets forget about the box constraints for a second, i.e.

                min ||delta||_2^2 s.t. b.dot(delta) = c

            The dual of this problem is quite straight-forward to solve,

                g(lambda, delta) = ||delta||_2^2 + lambda * (c - b.dot(delta))

            The minimum of this Lagrangian is delta^* = lambda * b / 2, and so

                inf_delta g(lambda, delta) = lambda^2 / 4 ||b||_2^2 + lambda * c

            and so the optimal lambda, which maximizes inf_delta g(lambda, delta), is given by

                lambda^* = 2c / ||b||_2^2

            which in turn yields the optimal delta:

                delta^* = c * b / ||b||_2^2

            To take into account the box-constraints we perform a binary search over lambda and apply the box
            constraint in each step.
        """
        N = x.shape[0]
        # print("")
        # print("Starting fun with ", epsilon, lambda0)

        # new box constraints
        _ell, _u = self._Linf_bounds(x0, epsilon, ell, u)

        # initialize lambda
        _lambda = lambda0

        # compute delta and determine active set
        k = 0

        lambda_max, lambda_min = 1e10, -1e10

        # check whether problem is actually solvable (i.e. check whether boundary constraint can be reached)
        max_c = 0
        min_c = 0

        for n in range(N):
            if b[n] > 0:
                max_c += b[n] * (_u[n] - x[n])
                min_c += b[n] * (_ell[n] - x[n])
            else:
                max_c += b[n] * (_ell[n] - x[n])
                min_c += b[n] * (_u[n] - x[n])

        if c > max_c or c < min_c:
            # print("Problem not solvable (boundary cannot be reached)", c, max_c, min_c)
            return -np.inf, k, _lambda

        while True:
            k += 1
            _c = 0
            norm = 0
            _active_bnorm = 0

            for n in range(N):
                lam_step = _lambda * b[n] / 2
                if lam_step + x[n] < _ell[n]:
                    delta_step = _ell[n] - x[n]
                elif lam_step + x[n] > _u[n]:
                    delta_step = _u[n] - x[n]
                else:
                    delta_step = lam_step
                    _active_bnorm += b[n] ** 2

                _c += b[n] * delta_step
                norm += delta_step ** 2

            if 0.9999 * np.abs(c) - EPS < np.abs(_c) < 1.0001 * np.abs(c) + EPS:
                if norm > r ** 2:
                    # print("Problem cannot be solved (norm outside of trust region).")
                    return -np.inf, k, _lambda
                else:
                    # print("Problem solved with lambda=", _lambda)
                    return -epsilon, k, _lambda
            else:
                # update lambda according to active variables
                if _c > c:
                    lambda_max = _lambda
                else:
                    lambda_min = _lambda
                #
                # print("Update _lambda", _lambda, c, _c, last_c, max_c, min_c)
                if _active_bnorm == 0:
                    # update is stepping out of feasible region, fallback to binary search
                    _lambda = (lambda_max - lambda_min) / 2 + lambda_min
                else:
                    _lambda += 2 * (c - _c) / _active_bnorm

                dlambda = lambda_max - lambda_min
                if (
                    _lambda > lambda_max - 0.1 * dlambda
                    or _lambda < lambda_min + 0.1 * dlambda
                ):
                    # update is stepping out of feasible region, fallback to binary search
                    _lambda = (lambda_max - lambda_min) / 2 + lambda_min

            #if k > 500:
            #    rnd = np.random.randint(100000000)
            #    np.save(f'/mnt/qb/wbrendel/gradient_boundary_attack/backup/Linf_{rnd}.npy', [x0, x, b, ell, u, c, r, lambda0])
            #    return -np.inf, k, _lambda

    def _get_final_delta(self, lam, eps, x0, x, b, min_, max_, c, r, touchup=True):
        N = x.shape[0]
        delta = np.empty_like(x0)

        # new box constraints
        _ell, _u = self._Linf_bounds(x0, eps, min_, max_)

        for n in range(N):
            lam_step = lam * b[n] / 2
            if lam_step + x[n] < _ell[n]:
                delta[n] = _ell[n] - x[n]
            elif lam_step + x[n] > _u[n]:
                delta[n] = _u[n] - x[n]
            else:
                delta[n] = lam_step

        return delta

    def _distance(self, x0, x):
        return np.abs(x0 - x).max()


@jitclass(spec=spec)
class L0Optimizer(Optimizer):
    def optimize_distance_s_t_boundary_and_trustregion(
        self, x0, x, b, min_, max_, c, r
    ):
        """ Find the solution to the optimization problem

            min_delta ||dx - delta||_p^p s.t. ||delta||_2^2 <= r^2 AND b^T delta = c AND min_ <= x + delta <= max_
        """
        params0 = np.array([0.0, 0.0])
        bounds = np.array([(-np.inf, np.inf), (0, np.inf)])

        return self.minimize(params0, bounds, x0, x, b, min_, max_, c, r)

    def minimize(
        self,
        q0,
        bounds,
        x0,
        x,
        b,
        min_,
        max_,
        c,
        r,
        ftol=1e-9,
        xtol=-1e-5,
        maxiter=1000,
    ):
        # First check whether solution can be computed without trust region
        delta, delta_norm = self.minimize_without_trustregion(
            x0, x, b, c, r, min_, max_
        )

        if delta_norm <= r:
            return delta
        else:
            # perform Nelder-Mead optimization
            args = (x0, x, b, min_, max_, c, r)

            results = self._nelder_mead_algorithm(
                q0, bounds, args=args, tol_f=ftol, tol_x=xtol, max_iter=maxiter
            )

            delta = self._get_final_delta(
                results[0], results[1], x0, x, b, min_, max_, c, r, touchup=True
            )

        return delta

    def minimize_without_trustregion(self, x0, x, b, c, r, ell, u):
        # compute maximum direction to b.dot(delta) within box-constraints
        delta = x0 - x
        total = np.empty_like(x0)
        total_b = np.empty_like(x0)
        bdotdelta = b.dot(delta)
        delta_bdotdelta = c - bdotdelta

        for k in range(x0.shape[0]):
            if b[k] > 0 and delta_bdotdelta > 0:
                total_b[k] = (u - x0[k]) * b[k]  # pos
                total[k] = u - x0[k]
            elif b[k] > 0 and delta_bdotdelta < 0:
                total_b[k] = (ell - x0[k]) * b[k]  # neg
                total[k] = ell - x0[k]
            elif b[k] < 0 and delta_bdotdelta > 0:
                total_b[k] = (ell - x0[k]) * b[k]  # pos
                total[k] = ell - x0[k]
            else:
                total_b[k] = (u - x0[k]) * b[k]  # neg
                total[k] = u - x0[k]

        b_argsort = np.argsort(np.abs(total_b))[::-1]

        for idx in b_argsort:
            # print(idx, bdotdelta, b[idx], total_b[idx])
            if np.abs(c - bdotdelta) > np.abs(total_b[idx]):
                delta[idx] += total[idx]
                bdotdelta += total_b[idx]
            else:
                delta[idx] += (c - bdotdelta) / (b[idx] + 1e-20)
                break

        delta_norm = np.linalg.norm(delta)

        return delta, delta_norm

    def _nelder_mead_algorithm(
        self,
        q0,
        bounds,
        args=(),
        ρ=1.0,
        χ=2.0,
        γ=0.5,
        σ=0.5,
        tol_f=1e-8,
        tol_x=1e-8,
        max_iter=1000,
    ):
        """
        Implements the Nelder-Mead algorithm described in Lagarias et al. (1998)
        modified to maximize instead of minimizing.

        Parameters
        ----------
        vertices : ndarray(float, ndim=2)
            Initial simplex with shape (n+1, n) to be modified in-place.

        args : tuple, optional
            Extra arguments passed to the objective function.

        ρ : scalar(float), optional(default=1.)
            Reflection parameter. Must be strictly greater than 0.

        χ : scalar(float), optional(default=2.)
            Expansion parameter. Must be strictly greater than max(1, ρ).

        γ : scalar(float), optional(default=0.5)
            Contraction parameter. Must be stricly between 0 and 1.

        σ : scalar(float), optional(default=0.5)
            Shrinkage parameter. Must be strictly between 0 and 1.

        tol_f : scalar(float), optional(default=1e-10)
            Tolerance to be used for the function value convergence test.

        tol_x : scalar(float), optional(default=1e-10)
            Tolerance to be used for the function domain convergence test.

        max_iter : scalar(float), optional(default=1000)
            The maximum number of allowed iterations.

        Returns
        ----------
        x : Approximate solution

        """
        vertices = self._initialize_simplex(q0)
        n = vertices.shape[1]
        self._check_params(ρ, χ, γ, σ, bounds, n)

        nit = 0

        ργ = ρ * γ
        ρχ = ρ * χ
        σ_n = σ ** n

        f_val = np.empty(n + 1, dtype=np.float64)
        for i in range(n + 1):
            f_val[i] = self._neg_bounded_fun(bounds, vertices[i], args=args)

        # Step 1: Sort
        sort_ind = f_val.argsort()
        LV_ratio = 1

        # Compute centroid
        x_bar = vertices[sort_ind[:n]].sum(axis=0) / n

        while True:
            # print("Iterate ", nit, " with current f_val: ", f_val)
            shrink = False

            # Check termination
            fail = nit >= max_iter

            best_val_idx = sort_ind[0]
            worst_val_idx = sort_ind[n]

            term_f = f_val[worst_val_idx] - f_val[best_val_idx] < tol_f

            # Linearized volume ratio test (see [2])
            term_x = LV_ratio < tol_x

            if term_x or term_f or fail:
                break

            # Step 2: Reflection
            x_r = x_bar + ρ * (x_bar - vertices[worst_val_idx])
            f_r = self._neg_bounded_fun(bounds, x_r, args=args)

            if f_r >= f_val[best_val_idx] and f_r < f_val[sort_ind[n - 1]]:
                # Accept reflection
                vertices[worst_val_idx] = x_r
                LV_ratio *= ρ

            # Step 3: Expansion
            elif f_r < f_val[best_val_idx]:
                x_e = x_bar + χ * (x_r - x_bar)
                f_e = self._neg_bounded_fun(bounds, x_e, args=args)
                if f_e < f_r:  # Greedy minimization
                    vertices[worst_val_idx] = x_e
                    LV_ratio *= ρχ
                else:
                    vertices[worst_val_idx] = x_r
                    LV_ratio *= ρ

            # Step 4 & 5: Contraction and Shrink
            else:
                # Step 4: Contraction
                if f_r < f_val[worst_val_idx]:  # Step 4.a: Outside Contraction
                    x_c = x_bar + γ * (x_r - x_bar)
                    LV_ratio_update = ργ
                else:  # Step 4.b: Inside Contraction
                    x_c = x_bar - γ * (x_r - x_bar)
                    LV_ratio_update = γ

                f_c = self._neg_bounded_fun(bounds, x_c, args=args)
                if f_c < min(f_r, f_val[worst_val_idx]):  # Accept contraction
                    vertices[worst_val_idx] = x_c
                    LV_ratio *= LV_ratio_update

                # Step 5: Shrink
                else:
                    shrink = True
                    for i in sort_ind[1:]:
                        vertices[i] = vertices[best_val_idx] + σ * (
                            vertices[i] - vertices[best_val_idx]
                        )
                        f_val[i] = self._neg_bounded_fun(bounds, vertices[i], args=args)

                    sort_ind[1:] = f_val[sort_ind[1:]].argsort() + 1

                    x_bar = (
                        vertices[best_val_idx]
                        + σ * (x_bar - vertices[best_val_idx])
                        + (vertices[worst_val_idx] - vertices[sort_ind[n]]) / n
                    )

                    LV_ratio *= σ_n

            if not shrink:  # Nonshrink ordering rule
                f_val[worst_val_idx] = self._neg_bounded_fun(
                    bounds, vertices[worst_val_idx], args=args
                )

                for i, j in enumerate(sort_ind):
                    if f_val[worst_val_idx] < f_val[j]:
                        sort_ind[i + 1 :] = sort_ind[i:-1]
                        sort_ind[i] = worst_val_idx
                        break

                x_bar += (vertices[worst_val_idx] - vertices[sort_ind[n]]) / n

            nit += 1

        return vertices[sort_ind[0]]

    def _initialize_simplex(self, x0):
        """
        Generates an initial simplex for the Nelder-Mead method.

        Parameters
        ----------
        x0 : ndarray(float, ndim=1)
            Initial guess. Array of real elements of size (n,), where ‘n’ is the
            number of independent variables.

        bounds: ndarray(float, ndim=2)
            Sequence of (min, max) pairs for each element in x0.

        Returns
        ----------
        vertices : ndarray(float, ndim=2)
            Initial simplex with shape (n+1, n).

        """
        n = x0.size

        vertices = np.empty((n + 1, n), dtype=np.float64)

        # Broadcast x0 on row dimension
        vertices[:] = x0

        nonzdelt = 0.05
        zdelt = 0.00025

        for i in range(n):
            # Generate candidate coordinate
            if vertices[i + 1, i] != 0.0:
                vertices[i + 1, i] *= 1 + nonzdelt
            else:
                vertices[i + 1, i] = zdelt

        return vertices

    def _check_params(self, ρ, χ, γ, σ, bounds, n):
        """
        Checks whether the parameters for the Nelder-Mead algorithm are valid.
        JIT-compiled in `nopython` mode using Numba.

        Parameters
        ----------
        ρ : scalar(float)
            Reflection parameter. Must be strictly greater than 0.

        χ : scalar(float)
            Expansion parameter. Must be strictly greater than max(1, ρ).

        γ : scalar(float)
            Contraction parameter. Must be stricly between 0 and 1.

        σ : scalar(float)
            Shrinkage parameter. Must be strictly between 0 and 1.

        bounds: ndarray(float, ndim=2)
            Sequence of (min, max) pairs for each element in x.

        n : scalar(int)
            Number of independent variables.

        """
        if ρ < 0:
            raise ValueError("ρ must be strictly greater than 0.")
        if χ < 1:
            raise ValueError("χ must be strictly greater than 1.")
        if χ < ρ:
            raise ValueError("χ must be strictly greater than ρ.")
        if γ < 0 or γ > 1:
            raise ValueError("γ must be strictly between 0 and 1.")
        if σ < 0 or σ > 1:
            raise ValueError("σ must be strictly between 0 and 1.")

        if not (bounds.shape == (0, 2) or bounds.shape == (n, 2)):
            raise ValueError("The shape of `bounds` is not valid.")
        if (np.atleast_2d(bounds)[:, 0] > np.atleast_2d(bounds)[:, 1]).any():
            raise ValueError("Lower bounds must be greater than upper bounds.")

    def _check_bounds(self, x, bounds):
        """
        Checks whether `x` is within `bounds`. JIT-compiled in `nopython` mode
        using Numba.

        Parameters
        ----------
        x : ndarray(float, ndim=1)
            1-D array with shape (n,) of independent variables.

        bounds: ndarray(float, ndim=2)
            Sequence of (min, max) pairs for each element in x.

        Returns
        ----------
        bool
            `True` if `x` is within `bounds`, `False` otherwise.

        """
        if bounds.shape == (0, 2):
            return True
        else:
            return (np.atleast_2d(bounds)[:, 0] <= x).all() and (
                x <= np.atleast_2d(bounds)[:, 1]
            ).all()

    def _neg_bounded_fun(self, bounds, x, args=()):
        """
        Wrapper for bounding and taking the negative of `fun` for the
        Nelder-Mead algorithm. JIT-compiled in `nopython` mode using Numba.

        Parameters
        ----------
        bounds: ndarray(float, ndim=2)
            Sequence of (min, max) pairs for each element in x.

        x : ndarray(float, ndim=1)
            1-D array with shape (n,) of independent variables at which `fun` is
            to be evaluated.

        args : tuple, optional
            Extra arguments passed to the objective function.

        Returns
        ----------
        scalar
            `-fun(x, *args)` if x is within `bounds`, `np.inf` otherwise.

        """
        if self._check_bounds(x, bounds):
            return -self.fun(x, *args)
        else:
            return np.inf

    def fun(self, params, x0, x, b, min_, max_, c, r):
        # arg min_delta ||delta - dx||_0 + lam * b^T delta + mu * ||delta||_2^2  s.t.  min <= delta + x <= max
        lam, mu = params
        N = x0.shape[0]

        g = -mu * r ** 2 - lam * c

        if mu > 0:
            t = 1 / (2 * mu)

            for n in range(N):
                dx = x0[n] - x[n]
                bn = b[n]

                case1 = lam * bn * dx + mu * dx ** 2

                optd = -lam * bn * t
                if optd < min_ - x[n]:
                    optd = min_ - x[n]
                elif optd > max_ - x[n]:
                    optd = max_ - x[n]

                case2 = 1 + lam * bn * optd + mu * optd ** 2

                if case1 <= case2:
                    g += mu * dx ** 2 + lam * bn * dx
                else:
                    g += 1 + mu * optd ** 2 + lam * bn * optd
        else:
            # arg min_delta ||delta - dx||_0 + lam * b^T delta
            # case delta[n] = dx[n]: lam * b[n] * dx[n]
            # case delta[n] != dx[n]: lam * b[n] * [min_ - x[n], max_ - x[n]]
            for n in range(N):
                dx = x0[n] - x[n]
                bn = b[n]
                case1 = lam * bn * dx
                case2 = 1 + lam * bn * (min_ - x[n])
                case3 = 1 + lam * bn * (max_ - x[n])
                if case1 <= case2 and case1 <= case3:
                    g += mu * dx ** 2 + lam * bn * dx
                elif case2 < case3:
                    g += 1 + mu * (min_ - x[n]) ** 2 + lam * bn * (min_ - x[n])
                else:
                    g += 1 + mu * (max_ - x[n]) ** 2 + lam * bn * (max_ - x[n])

        return g

    def _get_final_delta(self, lam, mu, x0, x, b, min_, max_, c, r, touchup=True):
        if touchup:
            delta = self.__get_final_delta(lam, mu, x0, x, b, min_, max_, c, r)
            if delta is not None:
                return delta
            else:
                # fallback
                params = [
                    (lam + 1e-5, mu),
                    (lam, mu + 1e-5),
                    (lam - 1e-5, mu),
                    (lam, mu - 1e-5),
                    (lam + 1e-5, mu + 1e-5),
                    (lam - 1e-5, mu - 1e-5),
                    (lam + 1e-5, mu - 1e-5),
                    (lam - 1e-5, mu + 1e-5),
                ]
                for param in params:
                    delta = self.__get_final_delta(
                        param[0], param[1], x0, x, b, min_, max_, c, r
                    )
                    if delta is not None:
                        return delta

                # 2nd fallback
                return self.__get_final_delta(
                    lam, mu, x0, x, b, min_, max_, c, r, False
                )
        else:
            return self.__get_final_delta(lam, mu, x0, x, b, min_, max_, c, r, False)

    def __get_final_delta(self, lam, mu, x0, x, b, min_, max_, c, r, touchup=True):
        delta = np.empty_like(x0)
        N = x0.shape[0]

        b_dot_d = 0
        norm_d = 0
        distance = 0

        if mu > 0:
            for n in range(N):
                dx = x0[n] - x[n]
                bn = b[n]
                t = 1 / (2 * mu)

                case1 = lam * bn * dx + mu * dx ** 2

                optd = -lam * bn * t
                if optd < min_ - x[n]:
                    optd = min_ - x[n]
                elif optd > max_ - x[n]:
                    optd = max_ - x[n]

                case2 = 1 + lam * bn * optd + mu * optd ** 2

                if case1 <= case2:
                    d = dx
                else:
                    d = optd
                    distance += 1

                delta[n] = d
                b_dot_d += bn * d
                norm_d += d ** 2
        else:  # mu == 0
            for n in range(N):
                dx = x0[n] - x[n]
                bn = b[n]
                case1 = lam * bn * dx
                case2 = 1 + lam * bn * (min_ - x[n])
                case3 = 1 + lam * bn * (max_ - x[n])
                if case1 <= case2 and case1 <= case3:
                    d = dx
                elif case2 < case3:
                    d = min_ - x[n]
                    distance += 1
                else:
                    d = max_ - x[n]
                    distance += 1

                delta[n] = d
                norm_d += d ** 2
                b_dot_d += bn * d

        if touchup:
            # search for the one index that
            # (a) we can modify to match boundary constraint
            # (b) stays within our trust region and
            # (c) minimize the distance to the original image.
            dc = c - b_dot_d
            k = 0
            min_distance = np.inf
            min_norm = np.inf
            min_distance_idx = 0
            for n in range(N):
                if np.abs(b[n]) > 0:
                    dx = x0[n] - x[n]
                    old_d = delta[n]
                    new_d = old_d + dc / b[n]

                    if (
                        x[n] + new_d <= max_
                        and x[n] + new_d >= min_
                        and norm_d - old_d ** 2 + new_d ** 2 <= r ** 2
                    ):
                        # conditions (a) and (b) are fulfilled
                        if k == 0:
                            min_distance = (
                                distance
                                - (np.abs(old_d - dx) > 1e-10)
                                + (np.abs(new_d - dx) > 1e-10)
                            )
                            min_distance_idx = n
                            min_norm = norm_d - old_d ** 2 + new_d ** 2
                            k += 1
                        else:
                            new_distance = (
                                distance
                                - (np.abs(old_d - dx) > 1e-10)
                                + (np.abs(new_d - dx) > 1e-10)
                            )
                            if (
                                min_distance > new_distance
                                or min_distance == new_distance
                                and min_norm > norm_d - old_d ** 2 + new_d ** 2
                            ):
                                min_distance = new_distance
                                min_norm = norm_d - old_d ** 2 + new_d ** 2
                                min_distance_idx = n

            if k > 0:
                # touchup successful
                idx = min_distance_idx
                old_d = delta[idx]

                new_d = old_d + dc / b[idx]
                delta[idx] = new_d

                return delta
            else:
                return None

        return delta

    def _distance(self, x0, x):
        return np.sum(np.abs(x - x0) > EPS)