# emuccino/generate_adversaries.py

## generate_adversaries.py
#custom activation function for keeping adversarial pixel values between 0.0 and 1.0
def clip(x):
    """Return ``x`` with every element limited to the closed range [0.0, 1.0]."""
    lowest, highest = 0.0, 1.0
    return K.clip(x, lowest, highest)

#custom loss function for non-targeted misclassification
def negative_categorical_crossentropy(yTrue,yPred):
    """Return the categorical cross-entropy of ``yPred`` against ``yTrue``, negated.

    Minimizing this loss therefore increases the cross-entropy with ``yTrue``.
    """
    crossentropy = K.categorical_crossentropy(yTrue,yPred)
    return 0.0 - crossentropy

#add custom objects to dictionary (a single update call registers both names)
get_custom_objects().update({
    'clip': Activation(clip),
    'negative_categorical_crossentropy': negative_categorical_crossentropy,
})


#function for generating an adversarial example given a base image, adversarial class target, classifier,
#regularization type, and loss function
def generate_adversary(img,target,model,regularization,loss_function):
    """Learn additive noise that changes ``model``'s prediction for ``img`` and return the noisy image.

    A small wrapper network is built around the classifier: a bias-free Dense layer fed
    a constant 1 holds one trainable weight per pixel (the noise), which is added to the
    image, clipped to [0.0, 1.0] and passed to ``model``. The classifier is frozen, so
    training only updates the noise weights.

    Args:
        img: the base image as a batch of one, shaped (1, 28, 28, 1).
        target: index (0-9) of the class set to 1 in the one-hot training label.
        model: trained classifier taking (28, 28, 1) inputs and producing 10 outputs;
            it is frozen (``trainable = False``) as a side effect.
        regularization: kernel regularizer applied to the noise weights.
        loss_function: loss passed to ``compile``, e.g. 'categorical_crossentropy' or
            'negative_categorical_crossentropy'.

    Returns:
        The adversarial image as a (28, 28) array: the noise is rounded to multiples
        of 1/255, added to the image, and the sum is clipped to [0.0, 1.0].

    Side effects:
        Writes './adversarial_weights.h5', shows the image with matplotlib and prints
        the classifier's prediction for it.
    """
    #input for base image
    image = Input(shape=(28,28,1),name='image')
    #constant unit input; with no bias, the Dense kernel below *is* the noise
    one = Input(shape=(1,),name='unity')

    #layer for learning adversarial noise to apply to image
    noise = Dense(28*28,activation=None,use_bias=False,kernel_initializer='random_normal',
                  kernel_regularizer=regularization,name='adversarial_noise')(one)
    #reshape noise in shape of image
    noise = Reshape((28,28,1),name='reshape')(noise)

    #add noise to image, then clip values to be within 0.0 and 1.0
    net = Add(name='add')([noise,image])
    net = Activation('clip',name='clip_values')(net)

    #feed adversarial image to trained MNIST classifier
    outputs = model(net)

    adversarial_model = Model(inputs=[image,one],outputs=outputs)
    #freeze trained MNIST classifier layers so that only the noise weights are updated
    model.trainable = False

    adversarial_model.compile(optimizer='nadam',loss=loss_function,metrics=[categorical_accuracy])

    #one-hot label for the requested class
    target_vector = np.zeros(10)
    target_vector[target] = 1.

    #callback for saving weights with smallest loss
    checkpoint = ModelCheckpoint('./adversarial_weights.h5',monitor='loss',verbose=0,save_best_only=True,
                                 save_weights_only=True,mode='auto',period=1)
    #train adversarial image; the dict keys must match the Input layer names ('image' and 'unity')
    adversarial_model.fit(x={'image':img,'unity':np.ones(shape=(1,1))},y=target_vector.reshape(1,-1),
                          epochs=10000,verbose=0,callbacks=[checkpoint])
    #restore best weights
    adversarial_model.load_weights('./adversarial_weights.h5')

    #quantize adversarial noise to 1/255 steps; look the layer up by name instead of by weight position
    learned_noise = adversarial_model.get_layer('adversarial_noise').get_weights()[0]
    quantized_weights = np.round(learned_noise.reshape((28,28)) * 255.) / 255.

    #add trained weights to original image and clip values to produce adversarial image
    adversarial_img = np.clip(img.reshape((28,28)) + quantized_weights, 0., 1.)

    #display adversarial image
    plt.imshow(adversarial_img,vmin=0.,vmax=1.)
    plt.show()
    #classify adversarial image with the classifier that was passed in, not a module-level global
    adversarial_prediction = model.predict(adversarial_img.reshape((1,28,28,1)))
    print(adversarial_prediction)

    return adversarial_img

#each row is (class index, regularizer, loss): three regularizers with the negated loss on class 5
#(presumably the true label of img -- confirm), then the same three with the plain loss on class 9
experiments = [
    (5, l1(0.01), 'negative_categorical_crossentropy'),
    (5, l2(0.01), 'negative_categorical_crossentropy'),
    (5, l1_l2(l1=0.01,l2=0.01), 'negative_categorical_crossentropy'),
    (9, l1(0.01), 'categorical_crossentropy'),
    (9, l2(0.01), 'categorical_crossentropy'),
    (9, l1_l2(l1=0.01,l2=0.01), 'categorical_crossentropy'),
]
for class_index, regularizer, loss_name in experiments:
    generate_adversary(img,class_index,mnist_model,regularizer,loss_name)
	#custom activation function for keeping adversarial pixel values between 0.0 and 1.0
	def clip(x):
		"""Return ``x`` with every element limited to the closed range [0.0, 1.0]."""
		return K.clip(x, 0.0, 1.0)

	#custom loss function for non-targeted misclassification
	def negative_categorical_crossentropy(yTrue,yPred):
		"""Return the categorical cross-entropy of ``yPred`` against ``yTrue``, negated.

		Minimizing this loss therefore increases the cross-entropy with ``yTrue``.
		"""
		return 0.0 - K.categorical_crossentropy(yTrue,yPred)

	#add custom objects to dictionary (a single update call registers both names)
	get_custom_objects().update({
		'clip': Activation(clip),
		'negative_categorical_crossentropy': negative_categorical_crossentropy,
	})


	#function for generating an adversarial example given a base image, adversarial class target, classifier,
	#regularization type, and loss function
	def generate_adversary(img,target,model,regularization,loss_function):
		"""Learn additive noise that changes ``model``'s prediction for ``img`` and return the noisy image.

		A small wrapper network is built around the classifier: a bias-free Dense layer fed
		a constant 1 holds one trainable weight per pixel (the noise), which is added to the
		image, clipped to [0.0, 1.0] and passed to ``model``. The classifier is frozen, so
		training only updates the noise weights.

		Args:
			img: the base image as a batch of one, shaped (1, 28, 28, 1).
			target: index (0-9) of the class set to 1 in the one-hot training label.
			model: trained classifier taking (28, 28, 1) inputs and producing 10 outputs;
				it is frozen (``trainable = False``) as a side effect.
			regularization: kernel regularizer applied to the noise weights.
			loss_function: loss passed to ``compile``, e.g. 'categorical_crossentropy' or
				'negative_categorical_crossentropy'.

		Returns:
			The adversarial image as a (28, 28) array: the noise is rounded to multiples
			of 1/255, added to the image, and the sum is clipped to [0.0, 1.0].

		Side effects:
			Writes './adversarial_weights.h5', shows the image with matplotlib and prints
			the classifier's prediction for it.
		"""
		#input for base image
		image = Input(shape=(28,28,1),name='image')
		#constant unit input; with no bias, the Dense kernel below *is* the noise
		one = Input(shape=(1,),name='unity')

		#layer for learning adversarial noise to apply to image
		noise = Dense(28*28,activation=None,use_bias=False,kernel_initializer='random_normal',
				kernel_regularizer=regularization,name='adversarial_noise')(one)
		#reshape noise in shape of image
		noise = Reshape((28,28,1),name='reshape')(noise)

		#add noise to image, then clip values to be within 0.0 and 1.0
		net = Add(name='add')([noise,image])
		net = Activation('clip',name='clip_values')(net)

		#feed adversarial image to trained MNIST classifier
		outputs = model(net)

		adversarial_model = Model(inputs=[image,one],outputs=outputs)
		#freeze trained MNIST classifier layers so that only the noise weights are updated
		model.trainable = False

		adversarial_model.compile(optimizer='nadam',loss=loss_function,metrics=[categorical_accuracy])

		#one-hot label for the requested class
		target_vector = np.zeros(10)
		target_vector[target] = 1.

		#callback for saving weights with smallest loss
		checkpoint = ModelCheckpoint('./adversarial_weights.h5',monitor='loss',verbose=0,save_best_only=True,
				save_weights_only=True,mode='auto',period=1)
		#train adversarial image; the dict keys must match the Input layer names ('image' and 'unity')
		adversarial_model.fit(x={'image':img,'unity':np.ones(shape=(1,1))},y=target_vector.reshape(1,-1),
				epochs=10000,verbose=0,callbacks=[checkpoint])
		#restore best weights
		adversarial_model.load_weights('./adversarial_weights.h5')

		#quantize adversarial noise to 1/255 steps; look the layer up by name instead of by weight position
		learned_noise = adversarial_model.get_layer('adversarial_noise').get_weights()[0]
		quantized_weights = np.round(learned_noise.reshape((28,28)) * 255.) / 255.

		#add trained weights to original image and clip values to produce adversarial image
		adversarial_img = np.clip(img.reshape((28,28)) + quantized_weights, 0., 1.)

		#display adversarial image
		plt.imshow(adversarial_img,vmin=0.,vmax=1.)
		plt.show()
		#classify adversarial image with the classifier that was passed in, not a module-level global
		adversarial_prediction = model.predict(adversarial_img.reshape((1,28,28,1)))
		print(adversarial_prediction)

		return adversarial_img

	#each row is (class index, regularizer, loss): three regularizers with the negated loss on class 5
	#(presumably the true label of img -- confirm), then the same three with the plain loss on class 9
	experiments = [
		(5, l1(0.01), 'negative_categorical_crossentropy'),
		(5, l2(0.01), 'negative_categorical_crossentropy'),
		(5, l1_l2(l1=0.01,l2=0.01), 'negative_categorical_crossentropy'),
		(9, l1(0.01), 'categorical_crossentropy'),
		(9, l2(0.01), 'categorical_crossentropy'),
		(9, l1_l2(l1=0.01,l2=0.01), 'categorical_crossentropy'),
	]
	for class_index, regularizer, loss_name in experiments:
		generate_adversary(img,class_index,mnist_model,regularizer,loss_name)