Skip to content

Instantly share code, notes, and snippets.

Created April 27, 2016 10:07
Show Gist options
  • Save young-geng/96118be89636cbd5cd72ec58f6d3725f to your computer and use it in GitHub Desktop.
Save young-geng/96118be89636cbd5cd72ec58f6d3725f to your computer and use it in GitHub Desktop.
Code that would trigger the problem of deconv layers in python caffe
#!/usr/bin/env python
from __future__ import division
import argparse
import numpy as np
import os
import tempfile
import time
parser = argparse.ArgumentParser(
description='Train and evaluate a net on the MIT mini-places dataset.')
parser.add_argument('--image_root', default='../../images/',
help='Directory where images are stored')
parser.add_argument('--crop', type=int, default=128,
help=('The edge length of the random image crops'
'(defaults to 96 for 96x96 crops)'))
parser.add_argument('--disp', type=int, default=10,
help='Print loss/accuracy every --disp training iterations')
parser.add_argument('--snapshot_dir', default='./snapshot',
help='Path to directory where snapshots are saved')
parser.add_argument('--snapshot_prefix', default='place_net',
help='Snapshot filename prefix')
parser.add_argument('--iters', type=int, default=50*1000,
help='Total number of iterations to train the network')
parser.add_argument('--batch', type=int, default=86,
help='The batch size to use for training')
parser.add_argument('--iter_size', type=int, default=3,
help=('The number of iterations (batches) over which to average the '
'gradient computation. Effectively increases the batch size '
'(--batch) by this factor, but without increasing memory use '))
parser.add_argument('--lr', type=float, default=0.01,
help='The initial learning rate')
parser.add_argument('--gamma', type=float, default=0.63,
help='Factor by which to drop the learning rate')
parser.add_argument('--stepsize', type=int, default=2000,
help='Drop the learning rate every N iters -- this specifies N')
parser.add_argument('--momentum', type=float, default=0.9,
help='The momentum hyperparameter to use for momentum SGD')
parser.add_argument('--decay', type=float, default=5e-4,
help='The L2 weight decay coefficient')
parser.add_argument('--seed', type=int, default=1,
help='Seed for the random number generator')
parser.add_argument('--cudnn', action='store_true',
help='Use CuDNN at training time -- usually faster, but non-deterministic')
parser.add_argument('--gpu', type=int, default=0,
help='GPU ID to use for training and inference (-1 for CPU)')
parser.add_argument('--eval_only', action='store_true', help='only run evaluation')
parser.add_argument('--generate_proto_only', action='store_true', help='only run evaluation')
args = parser.parse_args()
# disable most Caffe logging (unless env var $GLOG_minloglevel is already set)
key = 'GLOG_minloglevel'
if not os.environ.get(key, ''):
os.environ[key] = '3'
import caffe
from caffe.proto import caffe_pb2
from caffe import layers as L
from caffe import params as P
if args.gpu >= 0:
def to_tempfile(file_content):
"""Serialize a Python protobuf object str(proto), dump to a temporary file,
and return its filename."""
with tempfile.NamedTemporaryFile(delete=False) as f:
weight_param = dict(lr_mult=1, decay_mult=1)
bias_param = dict(lr_mult=2, decay_mult=0)
learned_param = [weight_param, bias_param]
frozen_param = [dict(lr_mult=0)] * 2
zero_filler = dict(type='constant', value=0)
msra_filler = dict(type='msra')
uniform_filler = dict(type='uniform', min=-0.1, max=0.1)
fc_filler = dict(type='gaussian', std=0.005)
# Original AlexNet used the following commented out Gaussian initialization;
# we'll use the "MSRA" one instead, which scales the Gaussian initialization
# of a convolutional filter based on its receptive field size.
# conv_filler = dict(type='gaussian', std=0.01)
conv_filler = dict(type='msra')
def conv_relu(bottom, ks, nout, stride=1, pad=0, group=1,
weight_filler=conv_filler, bias_filler=zero_filler,
# set CAFFE engine to avoid CuDNN convolution -- non-deterministic results
engine = {}
if train and not args.cudnn:
conv = L.Convolution(bottom, kernel_size=ks, stride=stride,
num_output=nout, pad=pad, group=group, param=param,
weight_filler=weight_filler, bias_filler=bias_filler,
return conv, L.ReLU(conv, in_place=True)
def fc_relu(bottom, nout, param=learned_param,
weight_filler=fc_filler, bias_filler=zero_filler):
fc = L.InnerProduct(bottom, num_output=nout, param=param,
weight_filler=weight_filler, bias_filler=bias_filler)
return fc, L.ReLU(fc, in_place=True)
def max_pool(bottom, ks, stride=1, train=False):
# set CAFFE engine to avoid CuDNN pooling -- non-deterministic results
engine = {}
if train and not args.cudnn:
return L.Pooling(bottom, pool=P.Pooling.MAX, kernel_size=ks, stride=stride,
def conv_bn_scale_relu(bottom, ks, nout, stride=1, pad=0, group=1,
weight_filler=conv_filler, bias_filler=zero_filler,
bias_term=False, train=False):
engine = {}
if bias_term:
conv = L.Convolution(bottom, kernel_size=ks, stride=stride,
num_output=nout, pad=pad, group=group, param=param,
weight_filler=weight_filler, bias_filler=bias_filler,
bias_term=bias_term, **engine)
conv = L.Convolution(bottom, kernel_size=ks, stride=stride,
num_output=nout, pad=pad, group=group, param=[weight_param],
weight_filler=weight_filler, bias_filler=bias_filler,
bias_term=bias_term, **engine)
bn = L.BatchNorm(conv, in_place=True)
scale = L.Scale(bn, bias_term=True, in_place=True)
relu = L.ReLU(scale, in_place=True)
return conv, bn, scale, relu
def deconv_bn_scale_relu(bottom, ks, nout, stride=1, pad=0, group=1,
weight_filler=conv_filler, bias_filler=zero_filler,
bias_term=False, train=False):
engine = {}
# deconv = L.Deconvolution(bottom, kernel_size=ks, stride=stride,
# num_output=nout, pad=pad, param=[weight_param],
# weight_filler=weight_filler, bias_filler=bias_filler,
# bias_term=bias_term, **engine)
deconv = L.Deconvolution(bottom, kernel_size=ks, stride=stride,
num_output=nout, pad=pad, param=[weight_param],
weight_filler=weight_filler, bias_filler=bias_filler,
bias_term=bias_term, **engine)
# deconv = L.Deconvolution(bottom)
bn = L.BatchNorm(deconv, in_place=True)
scale = L.Scale(bn, bias_term=True, in_place=True)
relu = L.ReLU(scale, in_place=True)
return deconv, bn, scale, relu
def minialexnet(data, train=False, param=learned_param, with_labels=True):
Returns a protobuf text file specifying a variant of AlexNet, following the
original specification (<caffe>/models/bvlc_alexnet/train_val.prototxt).
The changes with respect to the original AlexNet are:
- LRN (local response normalization) layers are not included
- The Fully Connected (FC) layers (fc6 and fc7) have smaller dimensions
due to the lower resolution of mini-places images (128x128) compared
with ImageNet images (usually resized to 256x256)
n = caffe.NetSpec()
if data is not None: = data
conv_kwargs = dict(param=param, train=train)
# Output_size 128
n.conv1_1, n.bn1_1, n.scale1_1, n.relu1_1 = conv_bn_scale_relu(, 3, 24, stride=1, pad=1, **conv_kwargs)
# Output_size 128
n.conv1_2, n.bn1_2, n.scale1_2, n.relu1_2 = conv_bn_scale_relu(n.relu1_1, 3, 24, stride=1, pad=1, **conv_kwargs)
# Output_size 64
n.pool1 = max_pool(n.relu1_2, ks=2, stride=2)
# Output_size 64
n.conv2_1, n.bn2_1, n.scale2_1, n.relu2_1 = conv_bn_scale_relu(n.pool1, 3, 32, pad=1, **conv_kwargs)
# Output_size 64
n.conv2_2, n.bn2_2, n.scale2_2, n.relu2_2 = conv_bn_scale_relu(n.relu2_1, 3, 32, pad=1, **conv_kwargs)
# Output_size 32
n.pool2 = max_pool(n.relu2_2, ks=2, stride=2)
# Output_size 32
n.conv3_1, n.bn3_1, n.scale3_1, n.relu3_1 = conv_bn_scale_relu(n.pool2, 3, 48, pad=1, **conv_kwargs)
# Output_size 32
n.conv3_2, n.bn3_2, n.scale3_2, n.relu3_2 = conv_bn_scale_relu(n.relu3_1, 3, 48, pad=1, **conv_kwargs)
# Output_size 16
n.pool3 = max_pool(n.relu3_2, ks=2, stride=2)
# Output_size 16
n.conv4_1, n.bn4_1, n.scale4_1, n.relu4_1 = conv_bn_scale_relu(n.pool3, 3, 64, pad=1, **conv_kwargs)
# Output_size 16
n.conv4_2, n.bn4_2, n.scale4_2, n.relu4_2 = conv_bn_scale_relu(n.relu4_1, 3, 64, pad=1, **conv_kwargs)
# Output_size 8
n.pool4 = max_pool(n.relu4_2, ks=2, stride=2)
# Output_size 8
n.conv_bn, n.bn_bn, n.scale_bn, n.relu_bn = conv_bn_scale_relu(n.pool4, 3, 3, pad=1, **conv_kwargs)
# Output_size 16
n.dc5_1, n.bn5_1, n.scale5_1, n.relu5_1 = deconv_bn_scale_relu(n.relu_bn, 2, 4, pad=0, stride=2, **conv_kwargs)
n.dc5_2, n.bn5_2, n.scale5_2, n.relu5_2 = deconv_bn_scale_relu(n.relu5_1, 3, 4, pad=1, **conv_kwargs)
n.dc5_3, n.bn5_3, n.scale5_3, n.relu5_3 = deconv_bn_scale_relu(n.relu5_2, 3, 4, pad=1, **conv_kwargs)
# Output_size 32
n.dc6_1, n.bn6_1, n.scale6_1, n.relu6_1 = deconv_bn_scale_relu(n.relu5_3, 2, 4, pad=0, stride=2, **conv_kwargs)
n.dc6_2, n.bn6_2, n.scale6_2, n.relu6_2 = deconv_bn_scale_relu(n.relu6_1, 3, 4, pad=1, **conv_kwargs)
n.dc6_3, n.bn6_3, n.scale6_3, n.relu6_3 = deconv_bn_scale_relu(n.relu6_2, 3, 4, pad=1, **conv_kwargs)
# Output_size 64
n.dc7_1, n.bn7_1, n.scale7_1, n.relu7_1 = deconv_bn_scale_relu(n.relu6_3, 2, 4, pad=0, stride=2, **conv_kwargs)
n.dc7_2, n.bn7_2, n.scale7_2, n.relu7_2 = deconv_bn_scale_relu(n.relu7_1, 3, 4, pad=1, **conv_kwargs)
n.dc7_3, n.bn7_3, n.scale7_3, n.relu7_3 = deconv_bn_scale_relu(n.relu7_2, 3, 4, pad=1, **conv_kwargs)
# Output_size 128
n.dc8_1, n.bn8_1, n.scale8_1, n.relu8_1 = deconv_bn_scale_relu(n.relu7_3, 2, 4, pad=0, stride=2, **conv_kwargs)
n.dc8_2, n.bn8_2, n.scale8_2, n.relu8_2 = deconv_bn_scale_relu(n.relu8_1, 3, 4, pad=1, **conv_kwargs)
n.dc8_3, n.bn8_3, n.scale8_3, _ = deconv_bn_scale_relu(n.relu8_2, 3, 3, pad=1, **conv_kwargs)
if with_labels:
n.label = data
n.loss = L.EuclideanLoss(n.scale8_3, n.label)
n.ignored_label = data
n.silence_label = L.Silence(n.ignored_label, ntop=0)
with open('network.prototxt', 'w') as fout:
return to_tempfile(str(n.to_proto()))
def get_split(split):
filename = '../../images/labels/%s.txt' % split
if not os.path.exists(filename):
raise IOError('Split data file not found: %s' % split)
return filename
def miniplaces_net(source, train=False, with_labels=True):
#mean = [104, 117, 123] # per-channel mean of the BGR image pixels
mean = [127, 127, 127]
transform_param = dict(mirror=False, crop_size=args.crop, mean_value=mean)
# batch_size = args.batch if train else 100
batch_size = args.batch
places_data, places_labels = L.ImageData(transform_param=transform_param,
source=source, root_folder=args.image_root, shuffle=train,
batch_size=batch_size, ntop=2)
return minialexnet(data=places_data, train=train,
def snapshot_prefix():
return os.path.join(args.snapshot_dir, args.snapshot_prefix)
def snapshot_at_iteration(iteration):
return '%s_iter_%d.caffemodel' % (snapshot_prefix(), iteration)
def miniplaces_solver(train_net_path, test_net_path=None):
s = caffe_pb2.SolverParameter()
# Specify locations of the train and (maybe) test networks.
s.train_net = train_net_path
if test_net_path is not None:
# Test after every 1000 training iterations.
s.test_interval = 1000
# Set `test_iter` to test on 100 batches each time we test.
# With test batch size 100, this covers the entire validation set of
# 10K images (100 * 100 = 10K).
s.test_interval = args.iters + 1 # don't test during training
# The number of batches over which to average the gradient.
# Effectively boosts the training batch size by the given factor, without
# affecting memory utilization.
s.iter_size = args.iter_size
# Solve using the stochastic gradient descent (SGD) algorithm.
# Other choices include 'Adam' and 'RMSProp'.
s.type = 'SGD'
# The following settings (base_lr, lr_policy, gamma, stepsize, and max_iter),
# define the following learning rate schedule:
# Iterations [ 0, 20K) -> learning rate 0.01 = base_lr
# Iterations [20K, 40K) -> learning rate 0.001 = base_lr * gamma
# Iterations [40K, 50K) -> learning rate 0.0001 = base_lr * gamma^2
# Set the initial learning rate for SGD.
s.base_lr =
# Set `lr_policy` to define how the learning rate changes during training.
# Here, we 'step' the learning rate by multiplying it by a factor `gamma`
# every `stepsize` iterations.
s.lr_policy = 'step'
s.gamma = args.gamma
s.stepsize = args.stepsize
# `max_iter` is the number of times to update the net (training iterations).
s.max_iter = args.iters
# Set other SGD hyperparameters. Setting a non-zero `momentum` takes a
# weighted average of the current gradient and previous gradients to make
# learning more stable. L2 weight decay regularizes learning, to help
# prevent the model from overfitting.
s.momentum = args.momentum
s.weight_decay = args.decay
# Display the current training loss and accuracy every `display` iterations.
# This doesn't have an effect for Python training here as logging is
# disabled by this script (see the GLOG_minloglevel setting).
s.display = args.disp
# Number of training iterations over which to smooth the displayed loss.
# The summed loss value (Iteration N, loss = X) will be averaged,
# but individual loss values (Train net output #K: my_loss = X) won't be.
s.average_loss = 10
# Seed the RNG for deterministic results.
# (May not be so deterministic if using CuDNN.)
s.random_seed = args.seed
# Snapshots are files used to store networks we've trained. Here, we'll
# snapshot twice per learning rate step to the location specified by the
# --snapshot_dir and --snapshot_prefix args.
s.snapshot = args.stepsize // 2
s.snapshot_prefix = snapshot_prefix()
# Create snapshot dir if it doesn't already exist.
if not os.path.exists(args.snapshot_dir):
with open('solver.prototxt', 'w') as fout:
return to_tempfile(str(s))
def train_net(with_val_net=False):
train_net_file = miniplaces_net(get_split('train'), train=True)
# Set with_val_net=True to test during training.
# Environment variable GLOG_minloglevel should be set to 0 to display
# Caffe output in this case; otherwise, the test result will not be
# displayed.
if with_val_net:
val_net_file = miniplaces_net(get_split('val'), train=False)
val_net_file = None
solver_file = miniplaces_solver(train_net_file, val_net_file)
solver = caffe.get_solver(solver_file)
outputs = sorted(
def str_output(output):
value =[output].data
if output.startswith('accuracy'):
valstr = '%5.2f%%' % (100 * value, )
valstr = '%6f' % value
return '%s = %s' % (output, valstr)
def disp_outputs(iteration, iter_pad_len=len(str(args.iters))):
metrics = '; '.join(str_output(o) for o in outputs)
return 'Iteration %*d: %s' % (iter_pad_len, iteration, metrics)
# We could just call `solver.solve()` rather than `step()`ing in a loop.
# (If we hadn't set GLOG_minloglevel = 3 at the top of this file, Caffe
# would display loss/accuracy information during training.)
previous_time = None
for iteration in xrange(args.iters):
if (args.disp > 0) and (iteration % args.disp == 0):
current_time = time.clock()
if previous_time is None:
benchmark = ''
time_per_iter = (current_time - previous_time) / args.disp
benchmark = ' (%5f s/it)' % time_per_iter
previous_time = current_time
print disp_outputs(iteration), benchmark
# if (iteration > 0) and (iteration % (args.stepsize // 2) == 0):
# eval_net("train", iters=iteration)
# print
# eval_net("val", iters=iteration)
# print
# Print accuracy for last iteration.
def eval_net(split, K=5):
print 'Running evaluation for split:', split
filenames = []
labels = []
split_file = get_split(split)
with open(split_file, 'r') as f:
for line in f.readlines():
parts = line.split()
assert 1 <= len(parts) <= 2, 'malformed line'
if len(parts) > 1:
known_labels = (len(labels) > 0)
if known_labels:
assert len(labels) == len(filenames)
# create file with 'dummy' labels (all 0s)
split_file = to_tempfile(''.join('%s 0\n' % name for name in filenames))
test_net_file = miniplaces_net(split_file, train=False, with_labels=False)
weights_file = snapshot_at_iteration(args.iters)
net = caffe.Net(test_net_file, weights_file, caffe.TEST)
top_k_predictions = np.zeros((len(filenames), K), dtype=np.int32)
if known_labels:
correct_label_probs = np.zeros(len(filenames))
offset = 0
all_probs = np.zeros((len(filenames), 10))
while offset < len(filenames):
probs = net.forward()['probs']
for prob in probs:
all_probs[offset] = prob
top_k_predictions[offset] = (-prob).argsort()[:K]
if known_labels:
correct_label_probs[offset] = prob[labels[offset]]
offset += 1
if offset >= len(filenames):
if known_labels:
def accuracy_at_k(preds, labels, k):
assert len(preds) == len(labels)
num_correct = sum(l in p[:k] for p, l in zip(preds, labels))
return num_correct / len(preds)
for k in [1, K]:
accuracy = 100 * accuracy_at_k(top_k_predictions, labels, k)
print '\tAccuracy at %d = %4.2f%%' % (k, accuracy)
cross_ent_error = -np.log(correct_label_probs).mean()
print '\tSoftmax cross-entropy error = %.4f' % (cross_ent_error, )
print 'Not computing accuracy; ground truth unknown for split:', split
#all_probs = np.vstack(all_probs)
if split != 'train':
np.savetxt("prob.%s.csv" % split, all_probs, delimiter=",")
filename = 'top_%d_predictions.%s.csv' % (K, split)
with open(filename, 'w') as f:
f.write(','.join(['image'] + ['label%d' % i for i in range(1, K+1)]))
f.write(''.join('%s,%s\n' % (image, ','.join(str(p) for p in preds))
for image, preds in zip(filenames, top_k_predictions)))
print 'Predictions for split %s dumped to: %s' % (split, filename)
if __name__ == '__main__':
if args.generate_proto_only:
miniplaces_net(source="", train=True, with_labels=False)
if not args.eval_only:
print 'Training net...\n'
print '\nTraining complete. Evaluating...\n'
# for split in ('train', 'val', 'test'):
# eval_net(split)
# print
# print 'Evaluation complete.'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment