Code that triggers the problem with deconvolution layers in Python Caffe (see deconv_bn_scale_relu below for the offending call).
#!/usr/bin/env python
from __future__ import division

import argparse
import numpy as np
import os
import tempfile
import time

parser = argparse.ArgumentParser(
    description='Train and evaluate a net on the MIT mini-places dataset.')
parser.add_argument('--image_root', default='../../images/',
                    help='Directory where images are stored')
parser.add_argument('--crop', type=int, default=128,
                    help=('The edge length of the random image crops '
                          '(defaults to 128 for 128x128 crops)'))
parser.add_argument('--disp', type=int, default=10,
                    help='Print loss/accuracy every --disp training iterations')
parser.add_argument('--snapshot_dir', default='./snapshot',
                    help='Path to directory where snapshots are saved')
parser.add_argument('--snapshot_prefix', default='place_net',
                    help='Snapshot filename prefix')
parser.add_argument('--iters', type=int, default=50*1000,
                    help='Total number of iterations to train the network')
parser.add_argument('--batch', type=int, default=86,
                    help='The batch size to use for training')
parser.add_argument('--iter_size', type=int, default=3,
                    help=('The number of iterations (batches) over which to average the '
                          'gradient computation. Effectively increases the batch size '
                          '(--batch) by this factor, but without increasing memory use'))
parser.add_argument('--lr', type=float, default=0.01,
                    help='The initial learning rate')
parser.add_argument('--gamma', type=float, default=0.63,
                    help='Factor by which to drop the learning rate')
parser.add_argument('--stepsize', type=int, default=2000,
                    help='Drop the learning rate every N iters -- this specifies N')
parser.add_argument('--momentum', type=float, default=0.9,
                    help='The momentum hyperparameter to use for momentum SGD')
parser.add_argument('--decay', type=float, default=5e-4,
                    help='The L2 weight decay coefficient')
parser.add_argument('--seed', type=int, default=1,
                    help='Seed for the random number generator')
parser.add_argument('--cudnn', action='store_true',
                    help='Use CuDNN at training time -- usually faster, but non-deterministic')
parser.add_argument('--gpu', type=int, default=0,
                    help='GPU ID to use for training and inference (-1 for CPU)')
parser.add_argument('--eval_only', action='store_true',
                    help='Only run evaluation')
parser.add_argument('--generate_proto_only', action='store_true',
                    help='Only generate the network prototxt file')
args = parser.parse_args()

# Disable most Caffe logging (unless env var $GLOG_minloglevel is already set).
key = 'GLOG_minloglevel'
if not os.environ.get(key, ''):
    os.environ[key] = '3'

import caffe
from caffe.proto import caffe_pb2
from caffe import layers as L
from caffe import params as P

if args.gpu >= 0:
    caffe.set_mode_gpu()
    caffe.set_device(args.gpu)
else:
    caffe.set_mode_cpu()

def to_tempfile(file_content):
    """Dump the given string (e.g. the str() of a protobuf message) to a
    temporary file and return its filename."""
    with tempfile.NamedTemporaryFile(delete=False) as f:
        f.write(file_content)
    return f.name

weight_param = dict(lr_mult=1, decay_mult=1)
bias_param = dict(lr_mult=2, decay_mult=0)
learned_param = [weight_param, bias_param]
frozen_param = [dict(lr_mult=0)] * 2

zero_filler = dict(type='constant', value=0)
msra_filler = dict(type='msra')
uniform_filler = dict(type='uniform', min=-0.1, max=0.1)
fc_filler = dict(type='gaussian', std=0.005)
# The original AlexNet used the Gaussian initialization commented out below;
# we'll use the "MSRA" one instead, which scales the standard deviation of a
# convolutional filter's Gaussian initialization based on its fan-in
# (kernel area times input channels).
# conv_filler = dict(type='gaussian', std=0.01)
conv_filler = dict(type='msra')
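
# A minimal sketch of what the 'msra' filler computes (an assumption based on
# Caffe's MSRAFiller with its default FAN_IN normalization, not part of the
# original script): weights are drawn from a zero-mean Gaussian with
#
#   std = sqrt(2 / fan_in),   fan_in = input_channels * k * k
#
# e.g. for the first 3x3 conv over a 3-channel input, std = sqrt(2/27) ~= 0.272.
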
def conv_relu(bottom, ks, nout, stride=1, pad=0, group=1,
              param=learned_param,
              weight_filler=conv_filler, bias_filler=zero_filler,
              train=False):
    # Set the CAFFE engine to avoid CuDNN convolution, whose results are
    # non-deterministic.
    engine = {}
    if train and not args.cudnn:
        engine.update(engine=P.Convolution.CAFFE)
    conv = L.Convolution(bottom, kernel_size=ks, stride=stride,
                         num_output=nout, pad=pad, group=group, param=param,
                         weight_filler=weight_filler, bias_filler=bias_filler,
                         **engine)
    return conv, L.ReLU(conv, in_place=True)

def fc_relu(bottom, nout, param=learned_param,
            weight_filler=fc_filler, bias_filler=zero_filler):
    fc = L.InnerProduct(bottom, num_output=nout, param=param,
                        weight_filler=weight_filler, bias_filler=bias_filler)
    return fc, L.ReLU(fc, in_place=True)

def max_pool(bottom, ks, stride=1, train=False):
    # Set the CAFFE engine to avoid CuDNN pooling, whose results are
    # non-deterministic.
    engine = {}
    if train and not args.cudnn:
        engine.update(engine=P.Pooling.CAFFE)
    return L.Pooling(bottom, pool=P.Pooling.MAX, kernel_size=ks, stride=stride,
                     **engine)

def conv_bn_scale_relu(bottom, ks, nout, stride=1, pad=0, group=1,
                       param=learned_param,
                       weight_filler=conv_filler, bias_filler=zero_filler,
                       bias_term=False, train=False):
    engine = {}  # left empty: Caffe's default engine choice is used here
    if bias_term:
        conv = L.Convolution(bottom, kernel_size=ks, stride=stride,
                             num_output=nout, pad=pad, group=group, param=param,
                             weight_filler=weight_filler, bias_filler=bias_filler,
                             bias_term=bias_term, **engine)
    else:
        # Without a bias term, only the weight blob needs a param spec.
        conv = L.Convolution(bottom, kernel_size=ks, stride=stride,
                             num_output=nout, pad=pad, group=group,
                             param=[weight_param],
                             weight_filler=weight_filler, bias_filler=bias_filler,
                             bias_term=bias_term, **engine)
    bn = L.BatchNorm(conv, in_place=True)
    scale = L.Scale(bn, bias_term=True, in_place=True)
    relu = L.ReLU(scale, in_place=True)
    return conv, bn, scale, relu

def deconv_bn_scale_relu(bottom, ks, nout, stride=1, pad=0, group=1,
                         param=learned_param,
                         weight_filler=conv_filler, bias_filler=zero_filler,
                         bias_term=False, train=False):
    engine = {}
    # This call is what triggers the pycaffe Deconvolution problem: the
    # convolution-style keyword arguments (kernel_size, num_output, ...) are
    # passed directly to L.Deconvolution.
    deconv = L.Deconvolution(bottom, kernel_size=ks, stride=stride,
                             num_output=nout, pad=pad, param=[weight_param],
                             weight_filler=weight_filler, bias_filler=bias_filler,
                             bias_term=bias_term, **engine)
    # deconv = L.Deconvolution(bottom)
    bn = L.BatchNorm(deconv, in_place=True)
    scale = L.Scale(bn, bias_term=True, in_place=True)
    relu = L.ReLU(scale, in_place=True)
    return deconv, bn, scale, relu
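
# A commonly cited workaround for the problem above (a sketch, not the
# original code): pycaffe's NetSpec has no 'deconvolution_param' mapping for
# the Deconvolution layer type, so convolution-style keywords cannot be passed
# directly and must be wrapped explicitly in convolution_param:
#
#   deconv = L.Deconvolution(bottom,
#       convolution_param=dict(num_output=nout, kernel_size=ks, stride=stride,
#                              pad=pad, bias_term=bias_term,
#                              weight_filler=weight_filler,
#                              bias_filler=bias_filler),
#       param=[weight_param])
#
# param= stays a direct keyword because it is a field of LayerParameter itself.
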
def minialexnet(data, train=False, param=learned_param, with_labels=True):
    """
    Returns a protobuf text file specifying a fully convolutional
    encoder-decoder variant of the mini-places AlexNet skeleton
    (<caffe>/models/bvlc_alexnet/train_val.prototxt was the starting point).
    The encoder stacks 3x3 conv-BN-scale-ReLU blocks with 2x2 max pooling
    down to 8x8; the decoder mirrors it with deconv-BN-scale-ReLU blocks back
    up to the 128x128 input resolution, and the net is trained to reconstruct
    its input with a Euclidean loss. The LRN and fully connected layers of
    the original AlexNet are not included.
    """
    n = caffe.NetSpec()
    if data is not None:
        n.data = data
    conv_kwargs = dict(param=param, train=train)
    # Encoder. Output size 128.
    n.conv1_1, n.bn1_1, n.scale1_1, n.relu1_1 = conv_bn_scale_relu(n.data, 3, 24, stride=1, pad=1, **conv_kwargs)
    # Output size 128.
    n.conv1_2, n.bn1_2, n.scale1_2, n.relu1_2 = conv_bn_scale_relu(n.relu1_1, 3, 24, stride=1, pad=1, **conv_kwargs)
    # Output size 64.
    n.pool1 = max_pool(n.relu1_2, ks=2, stride=2)
    # Output size 64.
    n.conv2_1, n.bn2_1, n.scale2_1, n.relu2_1 = conv_bn_scale_relu(n.pool1, 3, 32, pad=1, **conv_kwargs)
    # Output size 64.
    n.conv2_2, n.bn2_2, n.scale2_2, n.relu2_2 = conv_bn_scale_relu(n.relu2_1, 3, 32, pad=1, **conv_kwargs)
    # Output size 32.
    n.pool2 = max_pool(n.relu2_2, ks=2, stride=2)
    # Output size 32.
    n.conv3_1, n.bn3_1, n.scale3_1, n.relu3_1 = conv_bn_scale_relu(n.pool2, 3, 48, pad=1, **conv_kwargs)
    # Output size 32.
    n.conv3_2, n.bn3_2, n.scale3_2, n.relu3_2 = conv_bn_scale_relu(n.relu3_1, 3, 48, pad=1, **conv_kwargs)
    # Output size 16.
    n.pool3 = max_pool(n.relu3_2, ks=2, stride=2)
    # Output size 16.
    n.conv4_1, n.bn4_1, n.scale4_1, n.relu4_1 = conv_bn_scale_relu(n.pool3, 3, 64, pad=1, **conv_kwargs)
    # Output size 16.
    n.conv4_2, n.bn4_2, n.scale4_2, n.relu4_2 = conv_bn_scale_relu(n.relu4_1, 3, 64, pad=1, **conv_kwargs)
    # Output size 8.
    n.pool4 = max_pool(n.relu4_2, ks=2, stride=2)
    # Bottleneck. Output size 8.
    n.conv_bn, n.bn_bn, n.scale_bn, n.relu_bn = conv_bn_scale_relu(n.pool4, 3, 3, pad=1, **conv_kwargs)
    # Decoder. Output size 16.
    n.dc5_1, n.bn5_1, n.scale5_1, n.relu5_1 = deconv_bn_scale_relu(n.relu_bn, 2, 4, pad=0, stride=2, **conv_kwargs)
    n.dc5_2, n.bn5_2, n.scale5_2, n.relu5_2 = deconv_bn_scale_relu(n.relu5_1, 3, 4, pad=1, **conv_kwargs)
    n.dc5_3, n.bn5_3, n.scale5_3, n.relu5_3 = deconv_bn_scale_relu(n.relu5_2, 3, 4, pad=1, **conv_kwargs)
    # Output size 32.
    n.dc6_1, n.bn6_1, n.scale6_1, n.relu6_1 = deconv_bn_scale_relu(n.relu5_3, 2, 4, pad=0, stride=2, **conv_kwargs)
    n.dc6_2, n.bn6_2, n.scale6_2, n.relu6_2 = deconv_bn_scale_relu(n.relu6_1, 3, 4, pad=1, **conv_kwargs)
    n.dc6_3, n.bn6_3, n.scale6_3, n.relu6_3 = deconv_bn_scale_relu(n.relu6_2, 3, 4, pad=1, **conv_kwargs)
    # Output size 64.
    n.dc7_1, n.bn7_1, n.scale7_1, n.relu7_1 = deconv_bn_scale_relu(n.relu6_3, 2, 4, pad=0, stride=2, **conv_kwargs)
    n.dc7_2, n.bn7_2, n.scale7_2, n.relu7_2 = deconv_bn_scale_relu(n.relu7_1, 3, 4, pad=1, **conv_kwargs)
    n.dc7_3, n.bn7_3, n.scale7_3, n.relu7_3 = deconv_bn_scale_relu(n.relu7_2, 3, 4, pad=1, **conv_kwargs)
    # Output size 128.
    n.dc8_1, n.bn8_1, n.scale8_1, n.relu8_1 = deconv_bn_scale_relu(n.relu7_3, 2, 4, pad=0, stride=2, **conv_kwargs)
    n.dc8_2, n.bn8_2, n.scale8_2, n.relu8_2 = deconv_bn_scale_relu(n.relu8_1, 3, 4, pad=1, **conv_kwargs)
    n.dc8_3, n.bn8_3, n.scale8_3, _ = deconv_bn_scale_relu(n.relu8_2, 3, 3, pad=1, **conv_kwargs)
    if with_labels:
        # Autoencoder-style target: regress the reconstruction onto the input.
        n.label = data
        n.loss = L.EuclideanLoss(n.scale8_3, n.label)
    else:
        n.ignored_label = data
        n.silence_label = L.Silence(n.ignored_label, ntop=0)
    with open('network.prototxt', 'w') as fout:
        fout.write(str(n.to_proto()))
    return to_tempfile(str(n.to_proto()))

def get_split(split):
    filename = '../../images/labels/%s.txt' % split
    if not os.path.exists(filename):
        raise IOError('Split data file not found: %s' % filename)
    return filename

def miniplaces_net(source, train=False, with_labels=True):
    # mean = [104, 117, 123]  # per-channel mean of the BGR image pixels
    mean = [127, 127, 127]
    transform_param = dict(mirror=False, crop_size=args.crop, mean_value=mean)
    # batch_size = args.batch if train else 100
    batch_size = args.batch
    # ntop=2: ImageData always produces a label top; it goes unused here since
    # the net reconstructs its input rather than predicting the label.
    places_data, places_labels = L.ImageData(transform_param=transform_param,
        source=source, root_folder=args.image_root, shuffle=train,
        batch_size=batch_size, ntop=2)
    return minialexnet(data=places_data, train=train, with_labels=with_labels)

def snapshot_prefix():
    return os.path.join(args.snapshot_dir, args.snapshot_prefix)

def snapshot_at_iteration(iteration):
    return '%s_iter_%d.caffemodel' % (snapshot_prefix(), iteration)

def miniplaces_solver(train_net_path, test_net_path=None):
    s = caffe_pb2.SolverParameter()

    # Specify locations of the train and (maybe) test networks.
    s.train_net = train_net_path
    if test_net_path is not None:
        s.test_net.append(test_net_path)
        # Test after every 1000 training iterations.
        s.test_interval = 1000
        # Set `test_iter` to test on 100 batches each time we test.
        # With test batch size 100, this covers the entire validation set of
        # 10K images (100 * 100 = 10K).
        s.test_iter.append(100)
    else:
        s.test_interval = args.iters + 1  # don't test during training

    # The number of batches over which to average the gradient.
    # Effectively boosts the training batch size by the given factor, without
    # affecting memory utilization.
    s.iter_size = args.iter_size
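    # e.g. with the defaults --batch 86 and --iter_size 3, each weight update
    # effectively averages gradients over 86 * 3 = 258 images.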
    # Solve using the stochastic gradient descent (SGD) algorithm.
    # Other choices include 'Adam' and 'RMSProp'.
    s.type = 'SGD'

    # The following settings (base_lr, lr_policy, gamma, stepsize, and
    # max_iter) define the learning rate schedule: at iteration k the rate is
    # base_lr * gamma ** (k // stepsize). With the defaults, the rate starts
    # at 0.01 and is multiplied by gamma = 0.63 every 2000 iterations.

    # Set the initial learning rate for SGD.
    s.base_lr = args.lr
    # Set `lr_policy` to define how the learning rate changes during training.
    # Here, we 'step' the learning rate by multiplying it by a factor `gamma`
    # every `stepsize` iterations.
    s.lr_policy = 'step'
    s.gamma = args.gamma
    s.stepsize = args.stepsize
    # `max_iter` is the number of times to update the net (training iterations).
    s.max_iter = args.iters

    # Set other SGD hyperparameters. Setting a non-zero `momentum` takes a
    # weighted average of the current gradient and previous gradients to make
    # learning more stable. L2 weight decay regularizes learning, to help
    # prevent the model from overfitting.
    s.momentum = args.momentum
    s.weight_decay = args.decay

    # Display the current training loss and accuracy every `display`
    # iterations. This has no effect here, as Caffe logging is disabled by
    # this script (see the GLOG_minloglevel setting above).
    s.display = args.disp
    # Number of training iterations over which to smooth the displayed loss.
    # The summed loss value (Iteration N, loss = X) will be averaged,
    # but individual loss values (Train net output #K: my_loss = X) won't be.
    s.average_loss = 10

    # Seed the RNG for deterministic results.
    # (May not be so deterministic if using CuDNN.)
    s.random_seed = args.seed

    # Snapshots are files used to store networks we've trained. Here, we'll
    # snapshot twice per learning rate step to the location specified by the
    # --snapshot_dir and --snapshot_prefix args.
    s.snapshot = args.stepsize // 2
    s.snapshot_prefix = snapshot_prefix()
    # Create the snapshot dir if it doesn't already exist.
    if not os.path.exists(args.snapshot_dir):
        os.makedirs(args.snapshot_dir)

    with open('solver.prototxt', 'w') as fout:
        fout.write(str(s))
    return to_tempfile(str(s))
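
# For reference, the learning rate under the 'step' policy configured above
# can be computed offline as follows (a sketch mirroring Caffe's 'step'
# policy; this helper is not part of the original script and is unused by it):
def step_lr(iteration, base_lr=0.01, gamma=0.63, stepsize=2000):
    """Return base_lr * gamma ** (iteration // stepsize)."""
    return base_lr * gamma ** (iteration // stepsize)
# e.g. step_lr(0) == 0.01, step_lr(2000) == 0.0063, step_lr(4000) ~= 0.00397.
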
def train_net(with_val_net=False):
    train_net_file = miniplaces_net(get_split('train'), train=True)
    # Set with_val_net=True to test during training.
    # Environment variable GLOG_minloglevel should be set to 0 to display
    # Caffe output in this case; otherwise, the test result will not be
    # displayed.
    if with_val_net:
        val_net_file = miniplaces_net(get_split('val'), train=False)
    else:
        val_net_file = None
    solver_file = miniplaces_solver(train_net_file, val_net_file)
    solver = caffe.get_solver(solver_file)
    outputs = sorted(solver.net.outputs)
    def str_output(output):
        value = solver.net.blobs[output].data
        if output.startswith('accuracy'):
            valstr = '%5.2f%%' % (100 * value, )
        else:
            valstr = '%6f' % value
        return '%s = %s' % (output, valstr)
    def disp_outputs(iteration, iter_pad_len=len(str(args.iters))):
        metrics = '; '.join(str_output(o) for o in outputs)
        return 'Iteration %*d: %s' % (iter_pad_len, iteration, metrics)
    # We could just call `solver.solve()` rather than `step()`ing in a loop.
    # (If we hadn't set GLOG_minloglevel = 3 at the top of this file, Caffe
    # would display loss/accuracy information during training.)
    previous_time = None
    for iteration in xrange(args.iters):
        solver.step(1)
        if (args.disp > 0) and (iteration % args.disp == 0):
            current_time = time.clock()
            if previous_time is None:
                benchmark = ''
            else:
                time_per_iter = (current_time - previous_time) / args.disp
                benchmark = ' (%5f s/it)' % time_per_iter
            previous_time = current_time
            print disp_outputs(iteration), benchmark
        # if (iteration > 0) and (iteration % (args.stepsize // 2) == 0):
        #     eval_net("train", iters=iteration)
        #     eval_net("val", iters=iteration)
    # Print the outputs for the last iteration.
    solver.net.forward()
    print disp_outputs(args.iters)
    solver.net.save(snapshot_at_iteration(args.iters))

def eval_net(split, K=5):
    # NOTE: this evaluation routine is left over from the classification
    # version of this script; the autoencoder net defined above produces no
    # 'probs' blob, so net.forward()['probs'] below would fail if this were
    # called. It is kept (and left uncalled) for reference.
    print 'Running evaluation for split:', split
    filenames = []
    labels = []
    split_file = get_split(split)
    with open(split_file, 'r') as f:
        for line in f.readlines():
            parts = line.split()
            assert 1 <= len(parts) <= 2, 'malformed line: %s' % line
            filenames.append(parts[0])
            if len(parts) > 1:
                labels.append(int(parts[1]))
    known_labels = (len(labels) > 0)
    if known_labels:
        assert len(labels) == len(filenames)
    else:
        # Create a file with 'dummy' labels (all 0s).
        split_file = to_tempfile(''.join('%s 0\n' % name for name in filenames))
    test_net_file = miniplaces_net(split_file, train=False, with_labels=False)
    weights_file = snapshot_at_iteration(args.iters)
    net = caffe.Net(test_net_file, weights_file, caffe.TEST)
    top_k_predictions = np.zeros((len(filenames), K), dtype=np.int32)
    if known_labels:
        correct_label_probs = np.zeros(len(filenames))
    offset = 0
    all_probs = np.zeros((len(filenames), 10))
    while offset < len(filenames):
        probs = net.forward()['probs']
        for prob in probs:
            all_probs[offset] = prob
            top_k_predictions[offset] = (-prob).argsort()[:K]
            if known_labels:
                correct_label_probs[offset] = prob[labels[offset]]
            offset += 1
            if offset >= len(filenames):
                break
    if known_labels:
        def accuracy_at_k(preds, labels, k):
            assert len(preds) == len(labels)
            num_correct = sum(l in p[:k] for p, l in zip(preds, labels))
            return num_correct / len(preds)
        for k in [1, K]:
            accuracy = 100 * accuracy_at_k(top_k_predictions, labels, k)
            print '\tAccuracy at %d = %4.2f%%' % (k, accuracy)
        cross_ent_error = -np.log(correct_label_probs).mean()
        print '\tSoftmax cross-entropy error = %.4f' % (cross_ent_error, )
    else:
        print 'Not computing accuracy; ground truth unknown for split:', split
    # all_probs = np.vstack(all_probs)
    if split != 'train':
        np.savetxt("prob.%s.csv" % split, all_probs, delimiter=",")
    filename = 'top_%d_predictions.%s.csv' % (K, split)
    with open(filename, 'w') as f:
        f.write(','.join(['image'] + ['label%d' % i for i in range(1, K + 1)]))
        f.write('\n')
        f.write(''.join('%s,%s\n' % (image, ','.join(str(p) for p in preds))
                        for image, preds in zip(filenames, top_k_predictions)))
    print 'Predictions for split %s dumped to: %s' % (split, filename)

if __name__ == '__main__':
    if args.generate_proto_only:
        miniplaces_net(source="", train=True, with_labels=False)
        exit(0)
    if not args.eval_only:
        print 'Training net...\n'
        train_net()
        print '\nTraining complete. Evaluating...\n'
    # for split in ('train', 'val', 'test'):
    #     eval_net(split)
    # print 'Evaluation complete.'