-
-
Save ProGamerGov/f735c1360207b420c4f920d69853e157 to your computer and use it in GitHub Desktop.
# Work in progress: translating https://github.com/jcjohnson/neural-style/blob/master/neural_style.lua to PyTorch.
from __future__ import print_function | |
import torch | |
import torch.legacy.nn as nn | |
from torch.autograd import Variable | |
import torch.legacy.optim as optim | |
from PIL import Image | |
#from skimage import io,transform,img_as_float | |
#from skimage.io import imread,imsave | |
import torchvision | |
import torchvision.transforms as transforms | |
import torchvision.models as models | |
from torchvision.utils import save_image | |
import copy | |
import argparse | |
# Command-line options. Every numeric option declares a `type=` so that a
# value given on the command line is parsed as a number instead of staying a
# string (the numeric comparisons later in the script need real numbers).
parser = argparse.ArgumentParser()
# Basic options
parser.add_argument("-style_image", help="Style target image", default='examples/inputs/seated-nude.jpg')
parser.add_argument("-content_image", help="Content target image", default='examples/inputs/tubingen.jpg')
parser.add_argument("-image_size", help="Maximum height / width of generated image", type=int, default=512)
# Optimization options
# Weights are floats so fractional values are accepted; integer input still works.
parser.add_argument("-content_weight", help="content weight", type=float, default=5)
parser.add_argument("-style_weight", help="style weight", type=float, default=10)
parser.add_argument("-num_iterations", help="iterations", type=int, default=1000)
parser.add_argument("-normalize_gradients", action='store_true')
parser.add_argument("-init", help="initialisation type", default="random", choices=["random", "image"])
parser.add_argument("-init_image", help="initial image", default="")
parser.add_argument("-optimizer", help="optimiser", default="lbfgs", choices=["lbfgs", "adam"])
parser.add_argument("-learning_rate", help="learning rate", type=float, default=1)
parser.add_argument("-lbfgs_num_correction", help="lbfgs num correction", type=int, default=0)
# Output options
parser.add_argument("-output_image", default='out.png')
# Other options
parser.add_argument("-style_scale", help="style scale", type=float, default=1.0)
#parser.add_argument("-proto_file", default='models/VGG_ILSVRC_19_layers_deploy.prototxt')
#parser.add_argument("-model_file", default='models/VGG_ILSVRC_19_layers.caffemodel')
parser.add_argument("-backend", choices=["nn", "cudnn", "clnn"], default='cudnn')
# A negative seed means "do not seed the random number generator".
parser.add_argument("-seed", help="random number seed", type=int, default=-1)
params = parser.parse_args()
# Choose the tensor type once so images and network can share a device.
use_cuda = torch.cuda.is_available()
dtype = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
#cnn = loadcaffe.load(params.proto_file, params.model_file, params.backend) #.type(dtype)
# Only the `features` part of the pretrained VGG-19 is kept.
cnn = models.vgg19(pretrained=True).features
# `transforms.Scale` is the old name of `transforms.Resize`; use the new name
# when this torchvision provides it and fall back to the old one otherwise.
_resize = getattr(transforms, "Resize", None) or transforms.Scale
loader = transforms.Compose([
    _resize(params.image_size),  # scale imported image
    transforms.ToTensor()])  # transform it into a torch tensor
def image_loader(image_name):
    """Load an image file and return it as a Variable with a batch dimension.

    The file is opened with PIL, forced to three-channel RGB, passed through
    the module-level `loader` transform and wrapped in a `Variable`. A leading
    dimension of size 1 is then added so the result can be fed to the network.

    Args:
        image_name: path of the image file to open.

    Returns:
        The transformed image with an extra leading dimension.
    """
    # Grayscale or RGBA files would otherwise produce 1 or 4 channels, which
    # does not match the 3-channel input of the VGG feature extractor.
    image = Image.open(image_name).convert('RGB')
    image = Variable(loader(image))
    # fake batch dimension required to fit network's input dimensions
    image = image.unsqueeze(0)
    return image
# Load the content image first, then the style image, casting each to the
# tensor type selected for this run.
content_image_caffe, style_image_caffe = (
    image_loader(path).type(dtype)
    for path in (params.content_image, params.style_image)
)
# move it to the GPU if possible:
cnn = cnn.cuda() if use_cuda else cnn
#print(cnn)
# Names of the ReLU layers after which loss modules are inserted.
content_layers_default = ['relu_4']
style_layers_default = [
    'relu_1',
    'relu_2',
    'relu_3',
    'relu_4',
    'relu_5',
]
def create_model(cnn, style_image_caffe, content_image_caffe,
                 style_weight=params.style_weight,
                 content_weight=params.content_weight,
                 content_layers=content_layers_default,
                 style_layers=style_layers_default):
    """Build a new network from `cnn` with loss modules inserted after ReLUs.

    Every layer of `cnn` is copied, in order, into a fresh `nn.Sequential`.
    The i-th ReLU is called ``relu_<i>``; when that name appears in
    `content_layers` / `style_layers`, a `ContentLoss` / `StyleLoss` module is
    appended right after it, initialised with the features that the network
    built so far produces for the content / style image.

    Args:
        cnn: the pretrained feature network whose layers are reused.
        style_image_caffe: style image fed through the partial network.
        content_image_caffe: content image fed through the partial network.
        style_weight: strength passed to every `StyleLoss`.
        content_weight: strength passed to every `ContentLoss`.
        content_layers: ReLU names that receive a content loss.
        style_layers: ReLU names that receive a style loss.

    Returns:
        A tuple ``(model, style_losses, content_losses)``.
    """
    cnn = copy.deepcopy(cnn)
    content_losses = []
    style_losses = []
    model = nn.Sequential()  # the new Sequential module network
    # GramMatrix does not use its constructor argument; passing None keeps
    # this call valid whether or not that parameter has a default.
    gram = GramMatrix(None)
    # move these modules to the GPU if possible:
    if use_cuda:
        model = model.cuda()
        gram = gram.cuda()

    # NOTE(review): legacy containers appear to keep their children in a
    # `modules` list, while torch.nn containers are iterable and expose
    # `modules` as a method -- confirm which kind `cnn` is at this point.
    # The layers must already be torch.legacy.nn layers for the isinstance
    # check below and for `model.add` to work.
    children = getattr(cnn, "modules", None)
    layers = children if isinstance(children, list) else list(cnn)

    relu_index = 1
    for layer in layers:
        # Keep every layer (conv, pool, ReLU, ...), not only the ReLUs;
        # otherwise the new network contains no feature extractor at all.
        model.add(layer)
        if not isinstance(layer, nn.ReLU):
            continue
        name = "relu_" + str(relu_index)
        if name in content_layers:
            # add content loss:
            target = model.forward(content_image_caffe).clone()
            content_loss = ContentLoss(target, content_weight, params.normalize_gradients)
            model.add(content_loss)
            content_losses.append(content_loss)
        if name in style_layers:
            # add style loss:
            target_feature = model.forward(style_image_caffe).clone()
            target_feature_gram = gram.updateOutput(target_feature).clone()
            # Same normalisation StyleLoss applies to its own Gram matrix.
            target_feature_gram.div_(target_feature.nelement())
            style_loss = StyleLoss(target_feature_gram, style_weight, params.normalize_gradients)
            model.add(style_loss)
            style_losses.append(style_loss)
        relu_index += 1
    return model, style_losses, content_losses
# Define an nn Module to compute content loss in-place
class ContentLoss(nn.Module):
    """Pass-through module that records a content (MSE) loss.

    The module returns its input unchanged. What it does on the side depends
    on `self.mode`:

    * ``'capture'``: the input is copied into `self.target`.
    * ``'loss'``: `self.loss` is set to ``strength * MSE(input, target)`` and
      the backward pass adds the scaled loss gradient to `gradOutput`.
    * anything else: forward and backward are plain pass-throughs.

    Args:
        target: initial target features; a detached copy is stored.
        strength: multiplier applied to the loss and to its gradient.
        normalize: when true, the loss gradient is divided by its L1 norm
            before being scaled by `strength`.
    """

    def __init__(self, target, strength, normalize=False):
        super(ContentLoss, self).__init__()
        self.strength = strength
        # The target is stored unscaled: `strength` is applied to the loss in
        # updateOutput, so scaling the target as well would compare the input
        # against the wrong values. The clone keeps the caller's tensor from
        # being overwritten when a new target is captured in place.
        self.target = target.detach().clone()
        self.normalize = normalize
        self.loss = 0
        self.crit = nn.MSECriterion()
        self.mode = None

    def updateOutput(self, input):
        """Record the loss or capture the target, then return `input`."""
        if self.mode == 'loss':
            self.loss = self.crit.updateOutput(input, self.target) * self.strength  # Forward
        elif self.mode == 'capture':
            self.target.resize_as_(input).copy_(input)
        self.output = input
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Return `gradOutput` plus, in loss mode, the scaled loss gradient."""
        if self.mode == 'loss':
            if input.nelement() == self.target.nelement():
                self.gradInput = self.crit.updateGradInput(input, self.target)  # Backward
            # The in-place variants are required: the plain `div`/`mul`/`add`
            # return new tensors and would leave `self.gradInput` untouched.
            if self.normalize:
                self.gradInput.div_(torch.norm(self.gradInput, 1) + 1e-8)  # Normalize Gradients
            self.gradInput.mul_(self.strength)
            self.gradInput.add_(gradOutput)
        else:
            # Pass-through: copy the incoming gradient into gradInput (not
            # into the target, which must survive the backward pass).
            self.gradInput.resize_as_(gradOutput).copy_(gradOutput)
        return self.gradInput
class GramMatrix(nn.Module):
    """Compute the Gram matrix of a feature map and its gradient.

    The input is a 3-D tensor whose first dimension indexes the feature
    channels, or a 4-D tensor with a leading dimension of size 1 in front of
    such a tensor. It is flattened to ``(C, rest)`` and the output is the
    ``(C, C)`` product of that matrix with its transpose.

    Args:
        input: unused; accepted (and optional) only so existing calls with
            or without an argument both work.
    """

    def __init__(self, input=None):
        super(GramMatrix, self).__init__()

    @staticmethod
    def _flatten(input):
        """Return `input` viewed as a (channels, everything-else) matrix."""
        # Drop a leading batch dimension of size 1 so batched single images
        # are handled like plain 3-D feature maps.
        if input.dim() == 4 and input.size(0) == 1:
            input = input[0]
        assert input.dim() == 3
        # Dimensions are 0-based here: size(0) is the channel count.
        return input.contiguous().view(input.size(0), -1)

    def updateOutput(self, input):
        """Return the (C, C) Gram matrix of `input`."""
        x_flat = self._flatten(input)
        self.output = torch.mm(x_flat, x_flat.t())
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Return the gradient w.r.t. `input`, shaped like `input`.

        `gradOutput` is the (C, C) gradient w.r.t. the Gram matrix. The
        result is ``gradOutput @ X + gradOutput.T @ X`` where ``X`` is the
        flattened input.
        """
        x_flat = self._flatten(input)
        grad_flat = torch.mm(gradOutput, x_flat)
        grad_flat.addmm_(gradOutput.t(), x_flat)
        self.gradInput = grad_flat.view_as(input)
        return self.gradInput
# Define an nn Module to compute style loss in-place
class StyleLoss(nn.Module):
    """Pass-through module that records a style (Gram-matrix MSE) loss.

    On every forward pass the Gram matrix of the input is computed and
    divided by the number of input elements; the input itself is returned
    unchanged. What else happens depends on `self.mode`:

    * ``'capture'``: the Gram matrix becomes (or is blended into) the target.
    * ``'loss'``: `self.loss` is set to ``strength * MSE(G, target)`` and the
      backward pass adds the scaled loss gradient to `gradOutput`.
    * anything else: forward and backward are plain pass-throughs.

    Args:
        target: initial target Gram matrix; a detached copy is stored.
        strength: multiplier applied to the loss and to its gradient.
        normalize: when true, the loss gradient is divided by its L1 norm
            before being scaled by `strength`.
    """

    def __init__(self, target, strength, normalize=False):
        super(StyleLoss, self).__init__()
        self.normalize = normalize
        self.strength = strength
        # Stored unscaled: `strength` multiplies the loss in updateOutput.
        # The clone keeps the caller's tensor from being modified in place.
        self.target = target.detach().clone()
        self.mode = None
        self.loss = 0
        # GramMatrix does not use its constructor argument; passing None keeps
        # this call valid whether or not that parameter has a default.
        self.gram = GramMatrix(None)
        self.blend_weight = None
        self.G = None
        self.crit = nn.MSECriterion()

    def updateOutput(self, input):
        """Update the Gram matrix, then capture or score it; return `input`."""
        self.G = self.gram.updateOutput(input)  # Forward Gram
        self.G.div_(input.nelement())
        if self.mode == 'capture':
            if self.blend_weight is None:
                self.target.resize_as_(self.G).copy_(self.G)
            elif self.target.nelement() == 0:
                self.target.resize_as_(self.G).copy_(self.G).mul_(self.blend_weight)
            else:
                self.target.add_(self.G * self.blend_weight)
        elif self.mode == 'loss':
            # The loss compares Gram matrices, not the raw features.
            self.loss = self.strength * self.crit.updateOutput(self.G, self.target)  # Forward
        self.output = input
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Return `gradOutput` plus, in loss mode, the scaled loss gradient."""
        if self.mode == 'loss':
            dG = self.crit.updateGradInput(self.G, self.target)  # Backward
            dG.div_(input.nelement())
            self.gradInput = self.gram.updateGradInput(input, dG)  # Gram Backward
            # The in-place variants are required: the plain `div`/`mul`/`add`
            # return new tensors and would leave `self.gradInput` untouched.
            if self.normalize:
                self.gradInput.div_(torch.norm(self.gradInput, 1) + 1e-8)  # Normalize Gradients
            self.gradInput.mul_(self.strength)
            self.gradInput.add_(gradOutput)
        else:
            self.gradInput = gradOutput
        return self.gradInput
model, style_losses, content_losses = create_model(cnn, style_image_caffe, content_image_caffe, params.style_weight, params.content_weight, content_layers_default, style_layers_default)
# NOTE(review): the -init / -init_image options are parsed but not applied
# here; the optimised image always starts as a copy of the content image.
img = content_image_caffe.clone()
# Run it through the network once to get the proper size for the gradient
# All the gradients will come from the extra loss modules, so we just pass
# zeros into the top of the net on the backward pass.
y = model.updateOutput(img)
# The zero gradient must have the shape of the network OUTPUT, not of the image.
dy = y.clone().zero_()
# Declaring this here lets us access it in maybe_print
optim_state = None
if params.optimizer == 'lbfgs':
    optim_state = {
        "maxIter": params.num_iterations,
        "verbose": True,
        "tolX": -1,
        "tolFun": -1,
    }
    # int() guards against the option arriving as a string from the CLI.
    num_correction = int(params.lbfgs_num_correction)
    if num_correction > 0:
        # optim_state is a dict, so the entry is set by key, not by attribute.
        optim_state["nCorrection"] = num_correction
elif params.optimizer == 'adam':
    optim_state = {
        "learningRate": float(params.learning_rate),
    }
# Function to evaluate loss and gradient. We run the net forward and
# backward to get the gradient, and sum up losses from the loss modules.
# optim.lbfgs internally handles iteration and calls this function many
# times, so we manually count the number of iterations to handle printing
# and saving intermediate results.
# A one-element list so feval can update the counter without `global`.
num_calls = [0]
def feval(x):
    """Evaluate the total loss and its gradient for image `x`.

    Runs the module-level `model` forward and backward (feeding the
    module-level zero gradient `dy` at the top), sums the `loss` attribute of
    every content and style loss module, and increments `num_calls[0]`.

    Args:
        x: the image currently being optimised.

    Returns:
        A tuple ``(loss, grad)`` where `grad` is the gradient with respect
        to `x`, flattened to one dimension.
    """
    num_calls[0] += 1
    model.updateOutput(x)
    grad = model.updateGradInput(x, dy)
    loss = 0
    # The loss lists hold the modules themselves, not (index, module) pairs.
    for mod in content_losses:
        loss = loss + mod.loss
    for mod in style_losses:
        loss = loss + mod.loss
    # optim.lbfgs expects a vector for gradients
    return loss, grad.view(grad.nelement())
print("Model Loaded")
# Capture content targets
for mod in content_losses:
    mod.mode = 'capture'
print("Capturing content targets")
content_image_caffe = content_image_caffe.type(dtype)
model.updateOutput(content_image_caffe)
# Capture style targets
for mod in content_losses:
    mod.mode = None
print("Capturing style target")
for mod in style_losses:
    mod.mode = 'capture'
    # Only one style image is used, so there is nothing to blend: with no
    # blend weight the captured Gram matrix simply replaces the target.
    mod.blend_weight = None
model.updateOutput(style_image_caffe)
# Set all loss modules to loss mode
for mod in content_losses:
    mod.mode = 'loss'
for mod in style_losses:
    mod.mode = 'loss'
# Initialize the image
# int() guards against the option arriving as a string from the CLI;
# a negative seed leaves the random number generator unseeded.
seed = int(params.seed)
if seed >= 0:
    torch.manual_seed(seed)
# Run optimization.
# Fall back to the starting image if the optimiser runs zero iterations.
x = img
if params.optimizer == 'lbfgs':
    print("Running optimization with L-BFGS")
    x, losses = optim.lbfgs(feval, img, optim_state)
elif params.optimizer == 'adam':
    print("Running optimization with ADAM")
    for t in range(params.num_iterations):
        x, losses = optim.adam(feval, img, optim_state)
print("Test CNN")
#print(model)
# Save the optimised image; the remaining save_image options keep their defaults.
torchvision.utils.save_image(x, params.output_image)
The `torch.legacy.nn` package in PyTorch doesn't support the `Conv2d` and `MaxPool2d` layers that the pretrained VGG models all seem to use.
So I have been trying to figure out how to either use those layers with `torch.legacy.nn`, load the model and replace those layers, or convert a model's layers to the applicable legacy layers.
I went with `torch.legacy.nn` instead of `torch.nn` because it let me use the same functions in the ContentLoss, StyleLoss, and GramMatrix modules as Neural-Style uses.
There are also numerous other issues, like how the images are processed and how the input images need to be exactly the same size, but I am trying to solve the issue of setting up the model first before I address those.
I made a script which can change the model layers: https://gist.github.com/ProGamerGov/318b5f53e5b9da1e6779c7c2baf60a29
Looking at your code, the main issue with the model setup loop is that it should (like Justin's lua/torch code also does) take the layers from the pretrained model (cnn) and build a new model, adding layers one by one and also loss modules at the correct places. Your code does not do this, it makes a new model with just ReLUs and loss modules, but no conv layers etc at all.
Other things:
You seem to use two methods of setting loss targets. First in the model building loop (like Justin's neural-style did prior to Dec 2016) and then afterwards in the capture mode (like Justin's code does now).
You are trying to handle the gradients in the loss modules by yourself, like in Torch. I am not sure how this works in PyTorch with its autograd. I see you are using a legacy optimizer, so that might take care of it, but you are on your own there.
One of the most essential things to remember when working on images with Python is that pixels can be represented either by floats between 0 and 1, or unsigned integers between 0 and 255. It depends on the packages/libraries one is using, or it could even be that a package looks at the data type and assumes the pixel value range accordingly.
You don't seem to subtract any mean from the images. Remember in lua, using VGG caffemodel, we subtracted a mean somewhere around 150, as the caffemodel has been trained using values between 0 and 255. Here with Pytorch, using pixel values between 0 and 1, the mean is probably close to 0.5.
PS. I saw now that the PyTorch tutorial version does not subtract the mean either, so maybe it is not required with this model (it depends on how the model has been trained). However, according to http://pytorch.org/docs/master/torchvision/models.html#id2, the pretrained torchvision models expect input images normalized with a per-channel mean and standard deviation, so it seems the tutorial code is not correct here.