@bveliqi
Created September 26, 2018 07:59
Carlini & Wagner attack
import logging

import numpy as np

from .base import Attack
from .base import call_decorator


class CarliniWagnerAttack(Attack):
    """Implements the Carlini & Wagner attack introduced in [1]_.

    Only the L2 version of the attack is implemented,
    not the L0 or L-infinity versions.

    References
    ----------
    .. [1] Nicholas Carlini, David Wagner,
           "Towards Evaluating the Robustness of Neural Networks",
           https://arxiv.org/abs/1608.04644
    """
    @call_decorator
    def __call__(self, input_or_adv, label=None, unpack=True,
                 confidence=5.0, learning_rate=1e-2, binary_search_steps=25,
                 max_iter=1000, initial_const=1e-3, decay=0.):
        """Runs the L2 version of the Carlini & Wagner attack.

        Parameters
        ----------
        input_or_adv : `numpy.ndarray` or :class:`Adversarial`
            The original, unperturbed input as a `numpy.ndarray` or
            an :class:`Adversarial` instance.
        label : int
            The reference label of the original input. Must be passed
            if `input_or_adv` is a `numpy.ndarray`, must not be passed
            if `input_or_adv` is an :class:`Adversarial` instance.
        unpack : bool
            If True, returns the adversarial input, otherwise returns
            the :class:`Adversarial` object.
        confidence : int or float
            Confidence of adversarial examples: higher values produce
            examples that are farther away from the original input but
            more strongly classified as adversarial.
        learning_rate : float
            The learning rate for the attack algorithm. Smaller values
            produce better results but converge more slowly.
        binary_search_steps : int
            The number of binary search steps used to find the optimal
            trade-off constant between distance and confidence.
        max_iter : int
            The maximum number of optimization iterations per binary
            search step. Larger values are more accurate; setting it
            too small requires a large learning rate and produces poor
            results.
        initial_const : float
            The initial trade-off constant used to tune the relative
            importance of distance and confidence. If
            `binary_search_steps` is large, the initial constant is not
            important.
        decay : float
            Coefficient for learning rate decay.
        """
        a = input_or_adv
        del input_or_adv
        del label
        del unpack

        if not a.has_gradient():
            logging.fatal('Applied a gradient-based attack to a model '
                          'that does not provide gradients.')
            return

        if a.target_class() is None:
            logging.fatal('The Carlini & Wagner attack is a targeted attack '
                          'and requires a target class.')
            return

        clip_min, clip_max = a.bounds()
        tanh_smoother = 1 - np.finfo(float).eps  # keeps arctanh finite at the bounds
        const_upper_bound = 1e10  # abort the search if c exceeds this threshold
        const_lower_bound = 0
        original_img = a.original_image.copy()
        const = initial_const

        # perform the optimization in tanh space
        image_tanh = np.clip(original_img, clip_min, clip_max)
        image_tanh = (image_tanh - clip_min) / (clip_max - clip_min)
        image_tanh = np.arctanh(((image_tanh * 2) - 1) * tanh_smoother)
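
        # Change of variables (comments added for clarity): a value x in
        # [clip_min, clip_max] is rescaled to [0, 1] and mapped to
        # w = arctanh(2x - 1), which is unbounded. The inverse map
        # x = (tanh(w) + 1) / 2 used below guarantees that every candidate
        # adversarial stays inside the valid bounds without explicit
        # clipping; scaling by tanh_smoother keeps arctanh away from
        # +/- infinity at the boundary values.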
        for _ in range(binary_search_steps):
            is_attack = False
            current_perturbation = np.zeros(image_tanh.shape)
            adam_optimizer = AdamOptimizer(current_perturbation.shape)

            for i in range(max_iter):
                # transform the current adversarial from tanh space back
                # to the original input space
                adversarial = image_tanh + current_perturbation
                adversarial = (np.tanh(adversarial) / tanh_smoother + 1) / 2
                adversarial = adversarial * (clip_max - clip_min) + clip_min
                adversarial = np.clip(adversarial, clip_min, clip_max)

                logits, squared_l2_dist, loss = CarliniWagnerAttack.loss(
                    a, a.original_image, adversarial, a.target_class(),
                    const, confidence)

                # the attack succeeded if the margin term of the loss is zero
                last_attack_success = loss - squared_l2_dist <= 0
                is_attack = is_attack or last_attack_success
                if last_attack_success:
                    break

                # gradient of const * f(x') + ||x' - x||^2 with respect to
                # the tanh-space perturbation (chain rule through the
                # tanh transform)
                target_one_hot = np.zeros(a.num_classes())
                target_one_hot[a.target_class()] = 1
                label_add = np.argmax(logits * (1 - target_one_hot))
                gradient = a.gradient(image=adversarial, label=a.target_class())
                gradient -= a.gradient(image=adversarial, label=label_add)
                gradient *= const
                gradient += 2 * (adversarial - a.original_image)
                gradient *= (clip_max - clip_min)
                gradient *= (1 - np.square(np.tanh(image_tanh + current_perturbation))) \
                    / (2 * tanh_smoother)

                learning_rate *= (1. / (1. + decay * i))
                current_perturbation = adam_optimizer(
                    gradient, learning_rate, current_perturbation)

            # find a new const by binary search: decrease it after a
            # successful attack, otherwise double it and raise the lower bound
            if is_attack:
                const = (const_lower_bound + const) / 2
            else:
                const_old = const
                const = 2 * const
                const_lower_bound = const_old
                if const > const_upper_bound:
                    break
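
    # The loss below implements the margin-based objective from the paper:
    # f(x') = max(max_{i != t} Z(x')_i - Z(x')_t + confidence, 0), combined
    # with the squared L2 distance as const * f(x') + ||x' - x||_2^2.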
    @staticmethod
    def loss(a, original_image, adversarial_image, target, const, confidence):
        dist = original_image - adversarial_image
        squared_l2_dist = np.sum(np.square(dist))

        logits, _, _ = a.predictions_and_gradient(adversarial_image)
        logits_target = logits[target]
        target_one_hot = np.zeros(a.num_classes())
        target_one_hot[target] = 1
        logits_other = np.max(logits * (1 - target_one_hot))
        loss = max(logits_other - logits_target + confidence, 0)
        return logits, squared_l2_dist, const * loss + squared_l2_dist

class AdamOptimizer:
    """Uses the Adam optimizer, as it is the most effective at quickly
    finding adversarial examples according to the paper [1]_.
    """

    def __init__(self, shape):
        """
        shape: tuple of int
            shape of the perturbation
        """
        self.m = np.zeros(shape)
        self.v = np.zeros(shape)

    def __call__(self, gradient, learning_rate, current_perturbation,
                 beta1=0.9, beta2=0.999, epsilon=1e-8):
        """
        gradient: the gradient in the current iteration
        learning_rate: the learning rate in the current iteration
        current_perturbation: the perturbation at the current iteration
        beta1: decay rate for the exponentially decaying average of past gradients
        beta2: decay rate for the exponentially decaying average of past squared gradients
        epsilon: small constant to avoid division by zero
        :return: the updated perturbation
        """
        self.m = beta1 * self.m + (1 - beta1) * gradient
        self.v = beta2 * self.v + (1 - beta2) * gradient ** 2
        return current_perturbation - (learning_rate * self.m / (np.sqrt(self.v) + epsilon))
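
# Note (added): this is a simplified Adam update. The original algorithm
# (Kingma & Ba, 2014) additionally applies the bias corrections
# m_hat = m / (1 - beta1**t) and v_hat = v / (1 - beta2**t), which are
# omitted here.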
# pytest-style tests; the bn_* fixtures are assumed to come from foolbox's
# test suite (conftest.py)
import numpy as np

from foolbox.attacks import CarliniWagnerAttack as Attack


def test_targeted_attack(bn_targeted_adversarial):
    adv = bn_targeted_adversarial
    attack = Attack()
    attack(adv)

    assert adv.image is not None
    assert adv.distance.value < np.inf


def test_attack_impossible(bn_impossible):
    adv = bn_impossible
    attack = Attack()
    attack(adv)

    assert adv.image is None
    assert adv.distance.value == np.inf
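

# ---------------------------------------------------------------------------
# Minimal usage sketch (added for illustration; not part of the original
# gist). It assumes the attack has been added to foolbox.attacks, as the
# tests above do, and that a foolbox 1.x-style PyTorchModel wrapper is
# available; the tiny untrained network and the random input are
# hypothetical stand-ins for a real model and image.
# ---------------------------------------------------------------------------
import numpy as np
import torch.nn as nn

import foolbox
from foolbox.attacks import CarliniWagnerAttack


class _TinyNet(nn.Module):
    """Stand-in classifier for 3x32x32 inputs with 10 classes."""

    def __init__(self):
        super(_TinyNet, self).__init__()
        self.fc = nn.Linear(3 * 32 * 32, 10)

    def forward(self, x):
        return self.fc(x.view(x.shape[0], -1))


def run_example():
    net = _TinyNet().eval()
    fmodel = foolbox.models.PyTorchModel(net, bounds=(0, 1), num_classes=10)

    image = np.random.rand(3, 32, 32).astype(np.float32)
    label = 0

    # the attack is targeted, so a TargetClass criterion is required
    attack = CarliniWagnerAttack(fmodel, criterion=foolbox.criteria.TargetClass(7))
    return attack(image, label=label, binary_search_steps=10, max_iter=200)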