Carlini & Wagner attack (L2), implemented as a Foolbox attack, with tests.
import logging

import numpy as np

from .base import Attack
from .base import call_decorator
class CarliniWagnerAttack(Attack):
    """Implements the Carlini & Wagner attack introduced in [1]_.

    Implements only the L2-norm version of the attack,
    not the L0 or L-infinity versions.

    References
    ----------
    .. [1] Nicholas Carlini & David Wagner,
           "Towards Evaluating the Robustness of Neural Networks",
           https://arxiv.org/abs/1608.04644

    """
    @call_decorator
    def __call__(self, input_or_adv, label=None, unpack=True,
                 confidence=5.0, learning_rate=1e-2, binary_search_steps=25,
                 max_iter=1000, initial_const=1e-3, decay=0.):
        """Targeted gradient-based adversarial attack that directly
        minimizes the L2 norm of the perturbation.

        Parameters
        ----------
        input_or_adv : `numpy.ndarray` or :class:`Adversarial`
            The original, unperturbed input as a `numpy.ndarray` or
            an :class:`Adversarial` instance.
        label : int
            The reference label of the original input. Must be passed
            if `input_or_adv` is a `numpy.ndarray`, must not be passed
            if `input_or_adv` is an :class:`Adversarial` instance.
        unpack : bool
            If true, returns the adversarial input, otherwise returns
            the Adversarial object.
        confidence : int or float
            Confidence of adversarial examples: higher values produce
            examples that are farther away from the original input but
            more strongly classified as adversarial.
        learning_rate : float
            The learning rate for the attack algorithm. Smaller values
            produce better results but converge more slowly.
        binary_search_steps : int
            The number of binary search steps used to find the optimal
            tradeoff constant between distance and confidence.
        max_iter : int
            The maximum number of iterations per binary search step.
            Larger values are more accurate; setting it too small will
            require a large learning rate and will produce poor results.
        initial_const : float
            The initial tradeoff constant used to tune the relative
            importance of distance and confidence. If
            `binary_search_steps` is large, the initial constant is not
            important.
        decay : float
            Coefficient for learning rate decay.

        """
        a = input_or_adv
        del input_or_adv
        del label
        del unpack

        if not a.has_gradient():
            logging.fatal('Applied gradient-based attack to model that '
                          'does not provide gradients.')
            return

        if a.target_class() is None:
            logging.fatal('Carlini & Wagner is a targeted adversarial '
                          'attack.')
            return

        clip_min, clip_max = a.bounds()
        # keep arctanh away from its poles at exactly -1 and +1
        tanh_smoother = 1 - np.finfo(float).eps
        const_upper_bound = 1e10  # abort binary search if c exceeds this
        const_lower_bound = 0

        perturbed_img = a.original_image.copy()
        const = initial_const

        # perform the optimization in tanh space, where the box constraint
        # clip_min <= x <= clip_max is satisfied automatically
        image_tanh = np.clip(perturbed_img, clip_min, clip_max)
        image_tanh = (image_tanh - clip_min) / (clip_max - clip_min)
        image_tanh = np.arctanh(((image_tanh * 2) - 1) * tanh_smoother)
        for _ in range(binary_search_steps):
            is_attack = False
            current_perturbation = np.zeros(image_tanh.shape)
            adam_optimizer = AdamOptimizer(current_perturbation.shape)

            for i in range(max_iter):
                # transform the current adversarial from tanh space
                # back to image space
                adversarial = image_tanh + current_perturbation
                adversarial = (np.tanh(adversarial) / tanh_smoother + 1) / 2
                adversarial = adversarial * (clip_max - clip_min) + clip_min
                adversarial = np.clip(adversarial, clip_min, clip_max)

                logits, squared_l2_dist, loss = CarliniWagnerAttack.loss(
                    a, a.original_image, adversarial, a.target_class(),
                    const, confidence)

                # loss equals squared_l2_dist iff the hinge term is zero,
                # i.e. the attack succeeded with the requested confidence
                last_attack_success = loss - squared_l2_dist <= 0
                is_attack = is_attack or last_attack_success
                if last_attack_success:
                    break
                # runner-up class: the non-target class with the highest
                # logit (mask with -inf rather than zeroing, which would
                # be wrong when all other logits are negative)
                masked_logits = logits.copy()
                masked_logits[a.target_class()] = -np.inf
                label_add = np.argmax(masked_logits)

                gradient = a.gradient(image=adversarial,
                                      label=a.target_class())
                gradient -= a.gradient(image=adversarial, label=label_add)
                gradient *= const
                gradient += 2 * (adversarial - a.original_image)

                # chain rule for the tanh reparameterization
                gradient *= (clip_max - clip_min)
                gradient *= (1 - np.square(np.tanh(
                    image_tanh + current_perturbation))) / (2 * tanh_smoother)

                # time-based decay; derive lr from the unchanged parameter
                # so the schedule restarts with every binary search step
                lr = learning_rate / (1. + decay * i)
                current_perturbation = adam_optimizer(
                    gradient, lr, current_perturbation)
            # adjust const by binary search: decrease it after a success,
            # otherwise double it and raise the lower bound
            if is_attack:
                const = (const_lower_bound + const) / 2
            else:
                const_lower_bound = const
                const = 2 * const
                if const > const_upper_bound:
                    break
    @staticmethod
    def loss(a, original_image, adversarial_image, target, const, confidence):
        dist = original_image - adversarial_image
        squared_l2_dist = np.sum(np.square(dist))

        logits, _, _ = a.predictions_and_gradient(adversarial_image)
        logits_target = logits[target]

        # highest non-target logit; mask with -inf rather than zeroing,
        # which would be wrong when all other logits are negative
        masked_logits = logits.copy()
        masked_logits[target] = -np.inf
        logits_other = np.max(masked_logits)

        # hinge loss on the logit margin, weighted by the tradeoff
        # constant c, plus the squared L2 distance
        loss = max(logits_other - logits_target + confidence, 0)
        return logits, squared_l2_dist, const * loss + squared_l2_dist
class AdamOptimizer:
    """Uses the Adam optimizer, as the paper [1]_ found it to be the most
    effective at quickly finding adversarial examples.
    """

    def __init__(self, shape):
        """
        shape : tuple of int
            shape of the image
        """
        self.m = np.zeros(shape)
        self.v = np.zeros(shape)

    def __call__(self, gradient, learning_rate, current_perturbation,
                 beta1=0.9, beta2=0.999, epsilon=1e-8):
        """
        gradient : the gradient in the current iteration
        learning_rate : the learning rate in the current iteration
        current_perturbation : the perturbation at the current iteration
        beta1 : decay rate for the exponentially decaying average of
            past gradients
        beta2 : decay rate for the exponentially decaying average of
            past squared gradients
        epsilon : small constant that avoids division by zero

        :return: the updated perturbation
        """
        # note: this variant omits Adam's bias-correction terms
        self.m = beta1 * self.m + (1 - beta1) * gradient
        self.v = beta2 * self.v + (1 - beta2) * gradient**2
        return current_perturbation - (
            learning_rate * self.m / (np.sqrt(self.v) + epsilon))
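The tanh reparameterization above is what lets the optimizer run unconstrained: any value of the optimization variable maps back into the valid pixel box, so no projection step is needed. A minimal self-contained sketch of this change of variables (the variable names are illustrative, not from the gist):

import numpy as np

clip_min, clip_max = 0.0, 1.0
tanh_smoother = 1 - np.finfo(float).eps

# forward: image space -> tanh space, as in the attack's setup
x = np.array([0.0, 0.25, 0.5, 0.75, 1.0])
w = np.arctanh((x * 2 - 1) * tanh_smoother)

# backward: tanh space -> image space; even large offsets added to w
# still land inside [clip_min, clip_max]
x_back = (np.tanh(w + 3.0) / tanh_smoother + 1) / 2
x_back = x_back * (clip_max - clip_min) + clip_min
assert np.all((x_back >= clip_min) & (x_back <= clip_max))

The gist's second file contains the accompanying pytest tests: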
import numpy as np

from foolbox.attacks import CarliniWagnerAttack as Attack


def test_targeted_attack(bn_targeted_adversarial):
    adv = bn_targeted_adversarial
    attack = Attack()
    attack(adv)
    assert adv.image is not None
    assert adv.distance.value < np.inf


def test_attack_impossible(bn_impossible):
    adv = bn_impossible
    attack = Attack()
    attack(adv)
    assert adv.image is None
    assert adv.distance.value == np.inf
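Outside of the test fixtures, the attack can be driven by constructing an Adversarial instance by hand. A minimal usage sketch, assuming the Foolbox 1.x API that this gist targets; `kmodel`, `image`, `original_class`, and `target_class` are hypothetical placeholders, not part of the gist:

import foolbox
from foolbox.attacks import CarliniWagnerAttack

# wrap a trained Keras classifier (placeholder `kmodel`)
fmodel = foolbox.models.KerasModel(kmodel, bounds=(0, 1))

# the attack is targeted, so a TargetClass criterion is required
criterion = foolbox.criteria.TargetClass(target_class)
adv = foolbox.Adversarial(fmodel, criterion, image, original_class)

CarliniWagnerAttack()(adv)  # runs in place on the Adversarial instance

if adv.image is not None:
    print('distance:', adv.distance.value)

Note that the attack aborts with a fatal log message if the criterion is not targeted or if the model provides no gradients.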