Last active
August 6, 2016 15:20
-
-
Save kumekay/0ccd8b179c69869fbe6f0e0a7220cc02 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''Neural style transfer with Keras. | |
Before running this script, download the weights for the VGG16 model at: | |
https://drive.google.com/file/d/0Bz7KyqmuGsilT0J5dmRCM0ROVHc/view?usp=sharing | |
(source: https://gist.github.com/baraldilorenzo/07d7802847aaad0a35d3) | |
and make sure the variable `weights_path` in this script matches the location of the file. | |
Run the script with:
```
python neural_style_transfer.py path_to_your_base_image.jpg path_to_your_reference.jpg batch_id frame
```
e.g.:
```
python neural_style_transfer.py img/tuebingen.jpg img/starry_night.jpg my_batch 0001
```
Each iteration's result is saved as `<batch_id>/<frame>-<iteration>.png` and uploaded to S3.
It is preferable to run this script on GPU, for speed. | |
If running on CPU, prefer the TensorFlow backend (much faster). | |
Example result: https://twitter.com/fchollet/status/686631033085677568 | |
# Details | |
Style transfer consists in generating an image | |
with the same "content" as a base image, but with the | |
"style" of a different picture (typically artistic). | |
This is achieved through the optimization of a loss function | |
that has 3 components: "style loss", "content loss", | |
and "total variation loss": | |
- The total variation loss imposes local spatial continuity between | |
the pixels of the combination image, giving it visual coherence. | |
- The style loss is where the deep learning kicks in -- that one is defined
using a deep convolutional neural network. Precisely, it consists of a sum of
L2 distances between the Gram matrices of the representations of
the combination image and the style reference image, extracted from
different layers of a convnet (trained on ImageNet). The general idea
is to capture color/texture information at different spatial
scales (fairly large scales -- defined by the depth of the layer considered).
- The content loss is an L2 distance between the features of the base
image (extracted from a deep layer) and the features of the combination image, | |
keeping the generated image close enough to the original one. | |
# References | |
- [A Neural Algorithm of Artistic Style](http://arxiv.org/abs/1508.06576) | |
''' | |
from __future__ import print_function | |
from scipy.misc import imread, imresize, imsave | |
import numpy as np | |
from scipy.optimize import fmin_l_bfgs_b | |
import time | |
import os | |
import argparse | |
import h5py | |
import tinys3 | |
from keras.models import Sequential | |
from keras.layers import Convolution2D, ZeroPadding2D, MaxPooling2D | |
from keras import backend as K | |
# Command-line interface: the two input images plus the identifiers that
# name the output directory (batch_id) and the output files (frame).
parser = argparse.ArgumentParser(
    description='Neural style transfer with Keras.')
parser.add_argument(
    'base_image_path', metavar='base', type=str,
    help='Path to the image to transform.')
parser.add_argument(
    'style_reference_image_path', metavar='ref', type=str,
    help='Path to the style reference image.')
parser.add_argument(
    'batch_id', metavar='batch_id', type=str,
    help='Batch id.')
parser.add_argument(
    'frame', metavar='frame', type=str,
    help='Frame number.')
args = parser.parse_args()

# Unpack the parsed arguments into the module-level names used below.
base_image_path, style_reference_image_path = (
    args.base_image_path, args.style_reference_image_path)
batch_id, frame = args.batch_id, args.frame

# Location of the pre-trained VGG16 weights (see the module docstring).
weights_path = 'vgg16_weights.h5'

# Relative weights of the three loss components.
total_variation_weight = 1.
style_weight = 1.
content_weight = 0.025

# Size of the generated picture; it must be square.
img_width = 400
img_height = 400
assert img_height == img_width, 'Due to the use of the Gram matrix, width and height must match.'
# Helper: load a picture from disk and turn it into a network input tensor.
def preprocess_image(image_path):
    """Read, resize and format an image file as a 4D float64 array.

    The image is resized to (img_width, img_height), its channel axis is
    reversed (RGB -> BGR, assuming `imread` returns RGB), a fixed offset is
    subtracted from each channel, and the result is laid out
    channels-first with a leading batch axis of size 1.

    NOTE(review): assumes the file decodes to a 3-channel image; the
    offsets are presumably the channel means used when VGG16 was trained.
    """
    resized = imresize(imread(image_path), (img_width, img_height))
    bgr = resized[:, :, ::-1].astype('float64')
    # Broadcast the per-channel offsets over the trailing channel axis.
    bgr -= np.array([103.939, 116.779, 123.68])
    channels_first = bgr.transpose((2, 0, 1))
    return np.expand_dims(channels_first, axis=0)
# util function to convert a tensor into a valid image
def deprocess_image(x):
    """Convert a channels-first (3, rows, cols) array into a uint8 image.

    Reverses the steps of `preprocess_image`: moves the channel axis last,
    adds the per-channel offsets back, reverses the channel order and
    clips the values to the 0..255 range before casting to uint8.

    The input array is left untouched. The previous implementation added
    the offsets in place on a transposed *view*, which silently modified
    the caller's array and failed for integer inputs.

    Args:
        x: array-like of shape (3, rows, cols).

    Returns:
        A new uint8 array of shape (rows, cols, 3).
    """
    # np.array(...) makes a private float64 copy, so the in-place add
    # below can never leak back into the caller's data.
    img = np.array(x, dtype='float64').transpose((1, 2, 0))
    img += np.array([103.939, 116.779, 123.68])
    img = img[:, :, ::-1]
    return np.clip(img, 0, 255).astype('uint8')
# Tensor representations of the two fixed input pictures.
base_image = K.variable(preprocess_image(base_image_path))
style_reference_image = K.variable(preprocess_image(style_reference_image_path))

# Placeholder that will be fed with the generated image.
combination_image = K.placeholder((1, 3, img_width, img_height))

# Stack the three images along the batch axis so that one forward pass
# yields the features of all of them (index 0: base, 1: style reference,
# 2: combination).
input_tensor = K.concatenate(
    [base_image, style_reference_image, combination_image], axis=0)

# Build the convolutional part of VGG16 on top of that tensor.
first_layer = ZeroPadding2D((1, 1))
first_layer.set_input(input_tensor, shape=(3, 3, img_width, img_height))

# One entry per VGG16 block: (number of filters, names of its convolution
# layers). None means the layer keeps its automatically assigned name; only
# the layers used by the loss are named explicitly.
vgg_blocks = (
    (64, ('conv1_1', None)),
    (128, ('conv2_1', None)),
    (256, ('conv3_1', None, None)),
    (512, ('conv4_1', 'conv4_2', None)),
    (512, ('conv5_1', None, None)),
)

model = Sequential()
is_first_conv = True
for nb_filter, conv_names in vgg_blocks:
    for conv_name in conv_names:
        # Every convolution is preceded by a 1-pixel zero padding; the very
        # first padding layer is the one already wired to the input tensor.
        model.add(first_layer if is_first_conv else ZeroPadding2D((1, 1)))
        is_first_conv = False
        conv_kwargs = {'activation': 'relu'}
        if conv_name is not None:
            conv_kwargs['name'] = conv_name
        model.add(Convolution2D(nb_filter, 3, 3, **conv_kwargs))
    # Each block ends with a 2x2 max-pooling layer.
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))
# Load the weights of the VGG16 network
# (trained on ImageNet, won the ILSVRC competition in 2014).
# Note: when there is a complete match between your model definition
# and your weight savefile, you can simply call model.load_weights(filename).
assert os.path.exists(weights_path), 'Model weights not found (see "weights_path" variable in script).'
# Open read-only and through a context manager so the HDF5 handle is
# released even if a layer rejects its weights.
with h5py.File(weights_path, 'r') as weights_file:
    for k in range(weights_file.attrs['nb_layers']):
        if k >= len(model.layers):
            # we don't look at the last (fully-connected) layers in the savefile
            break
        g = weights_file['layer_{}'.format(k)]
        # Layers are matched by position: group "layer_k" holds the
        # parameters of model.layers[k].
        weights = [g['param_{}'.format(p)] for p in range(g.attrs['nb_params'])]
        model.layers[k].set_weights(weights)
print('Model loaded.')
# Map every layer name to its symbolic output so the "key" layers
# (the ones that were given explicit names) can be looked up by name.
outputs_dict = {layer.name: layer.output for layer in model.layers}
# Neural style loss: the helper functions defined from here on are the
# building blocks that get combined into the final scalar loss.

# Gram matrix of an image tensor (feature-wise outer product).
def gram_matrix(x):
    """Return the Gram matrix of a 3D feature tensor.

    The tensor is flattened with `K.batch_flatten` and multiplied by its
    own transpose.
    """
    assert K.ndim(x) == 3
    flat = K.batch_flatten(x)
    return K.dot(flat, K.transpose(flat))
# Style loss: keeps the style of the reference image in the generated
# image by comparing the Gram matrices of their feature maps.
def style_loss(style, combination):
    """Return the scaled squared distance between two Gram matrices.

    Both arguments must be 3D feature tensors: `style` from the style
    reference image and `combination` from the generated image.
    """
    assert K.ndim(style) == 3
    assert K.ndim(combination) == 3
    gram_difference = gram_matrix(style) - gram_matrix(combination)
    channels = 3
    size = img_width * img_height
    normalizer = 4. * (channels ** 2) * (size ** 2)
    return K.sum(K.square(gram_difference)) / normalizer
# Content loss: keeps the "content" of the base image in the generated
# image.
def content_loss(base, combination):
    """Return the sum of squared differences between two feature tensors."""
    difference = combination - base
    return K.sum(K.square(difference))
# Total variation loss: keeps the generated image locally coherent.
def total_variation_loss(x):
    """Penalise differences between neighbouring pixels of a 4D tensor.

    Squared differences to the next element along axis 2 and along
    axis 3 are added, raised to the power 1.25 and summed.
    """
    assert K.ndim(x) == 4
    last_row, last_col = img_width - 1, img_height - 1
    # Region in which every pixel has a neighbour along both axes.
    interior = x[:, :, :last_row, :last_col]
    row_diff = K.square(interior - x[:, :, 1:, :last_col])
    col_diff = K.square(interior - x[:, :, :last_row, 1:])
    return K.sum(K.pow(row_diff + col_diff, 1.25))
# Combine the loss functions into a single scalar.
loss = K.variable(0.)

# Content term: compare base image (batch index 0) and combination image
# (batch index 2) at layer conv4_2.
layer_features = outputs_dict['conv4_2']
base_image_features = layer_features[0, :, :, :]
combination_features = layer_features[2, :, :, :]
loss += content_weight * content_loss(base_image_features,
                                      combination_features)

# Style term: compare style reference (batch index 1) and combination image
# at several layers, each contributing an equal share of style_weight.
feature_layers = ['conv1_1', 'conv2_1', 'conv3_1', 'conv4_1', 'conv5_1']
for layer_name in feature_layers:
    layer_features = outputs_dict[layer_name]
    style_reference_features = layer_features[1, :, :, :]
    combination_features = layer_features[2, :, :, :]
    sl = style_loss(style_reference_features, combination_features)
    loss += (style_weight / len(feature_layers)) * sl

# Smoothness term on the generated image itself.
loss += total_variation_weight * total_variation_loss(combination_image)

# Gradients of the loss with respect to the generated image.
grads = K.gradients(loss, combination_image)

# The compiled function returns the loss followed by the gradient(s).
# K.gradients may hand back a single tensor or a sequence of tensors;
# isinstance also accepts list/tuple subclasses, unlike an exact type test.
outputs = [loss]
if isinstance(grads, (list, tuple)):
    outputs.extend(grads)
else:
    outputs.append(grads)
f_outputs = K.function([combination_image], outputs)
def eval_loss_and_grads(x):
    """Evaluate the loss and its gradient for a flat image vector.

    `x` is reshaped to (1, 3, img_width, img_height) and fed to
    `f_outputs`. Returns the loss value together with the gradient as a
    flat float64 array.
    """
    image = x.reshape((1, 3, img_width, img_height))
    results = f_outputs([image])
    loss_value, grad_outputs = results[0], results[1:]
    if len(grad_outputs) == 1:
        grad_values = grad_outputs[0].flatten().astype('float64')
    else:
        # Several gradient outputs: stack them before flattening.
        grad_values = np.array(grad_outputs).flatten().astype('float64')
    return loss_value, grad_values
# This Evaluator class makes it possible to compute loss and gradients
# in one pass while retrieving them via two separate functions,
# "loss" and "grads". This is done because scipy.optimize requires
# separate functions for loss and gradients, but computing them
# separately would be inefficient.
class Evaluator(object):
    """Cache one loss/gradient evaluation between `loss` and `grads` calls.

    `loss(x)` runs `eval_loss_and_grads` once and stores both results;
    the following `grads(x)` call returns the stored gradient and clears
    the cache. The two methods must therefore be called alternately,
    starting with `loss`.
    """

    def __init__(self):
        self.loss_value = None
        # Was misspelled "grads_values", leaving `grad_values` undefined
        # until the first call to `loss`.
        self.grad_values = None

    def loss(self, x):
        """Evaluate loss and gradient at `x`; return the loss value."""
        # A pending result means `grads` was not called after the last `loss`.
        assert self.loss_value is None
        self.loss_value, self.grad_values = eval_loss_and_grads(x)
        return self.loss_value

    def grads(self, x):
        """Return a copy of the gradient cached by the preceding `loss` call.

        `x` is not used; the argument exists because the optimizer passes
        the current point to the gradient callback.
        """
        assert self.loss_value is not None
        grad_values = np.copy(self.grad_values)
        # Reset so the next `loss` call starts from a clean state.
        self.loss_value = None
        self.grad_values = None
        return grad_values
evaluator = Evaluator()

# Run scipy-based optimization (L-BFGS) over the pixels of the generated
# image so as to minimize the neural style loss.
# Start from uniform noise, shifted by the same per-channel offsets that
# preprocess_image subtracts.
x = np.random.uniform(0, 255, (1, 3, img_width, img_height))
x[0, 0, :, :] -= 103.939
x[0, 1, :, :] -= 116.779
x[0, 2, :, :] -= 123.68

# Create workfolder
if not os.path.exists(batch_id):
    os.makedirs(batch_id)

# The S3 connection does not depend on the iteration, so create it once.
# NOTE(security): credentials are hard-coded here; consider loading them
# from the environment or a configuration file instead.
conn = tinys3.Connection("******", "******", tls=True)

for i in range(10):
    print('Start of iteration', i)
    start_time = time.time()
    x, min_val, info = fmin_l_bfgs_b(evaluator.loss, x.flatten(),
                                     fprime=evaluator.grads, maxfun=20)
    print('Current loss value:', min_val)

    # save current generated image
    img = deprocess_image(x.copy().reshape((3, img_width, img_height)))
    fname = batch_id + '/' + frame + '-%d.png' % i
    imsave(fname, img)
    end_time = time.time()
    print('Image saved as', fname)

    # Upload to S3; the context manager closes the file after the upload
    # (the handle used to be leaked on every iteration).
    with open(fname, 'rb') as f:
        conn.upload(fname, f, 'kubernets-artist')
    print('Image uploaded to s3')
    print('Iteration %d completed in %ds' % (i, end_time - start_time))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment