Code that triggers the problem with deconvolution layers in Python Caffe (see deconv_bn_scale_relu below for the offending call).
#!/usr/bin/env python
from __future__ import division

import argparse
import numpy as np
import os
import tempfile
import time

parser = argparse.ArgumentParser(
    description='Train and evaluate a net on the MIT mini-places dataset.')
parser.add_argument('--image_root', default='../../images/',
                    help='Directory where images are stored')
parser.add_argument('--crop', type=int, default=128,
                    help=('The edge length of the random image crops '
                          '(defaults to 128 for 128x128 crops)'))
parser.add_argument('--disp', type=int, default=10,
                    help='Print loss/accuracy every --disp training iterations')
parser.add_argument('--snapshot_dir', default='./snapshot',
                    help='Path to directory where snapshots are saved')
parser.add_argument('--snapshot_prefix', default='place_net',
                    help='Snapshot filename prefix')
parser.add_argument('--iters', type=int, default=50*1000,
                    help='Total number of iterations to train the network')
parser.add_argument('--batch', type=int, default=86,
                    help='The batch size to use for training')
parser.add_argument('--iter_size', type=int, default=3,
                    help=('The number of iterations (batches) over which to average the '
                          'gradient computation. Effectively increases the batch size '
                          '(--batch) by this factor, but without increasing memory use'))
parser.add_argument('--lr', type=float, default=0.01,
                    help='The initial learning rate')
parser.add_argument('--gamma', type=float, default=0.63,
                    help='Factor by which to drop the learning rate')
parser.add_argument('--stepsize', type=int, default=2000,
                    help='Drop the learning rate every N iters -- this specifies N')
parser.add_argument('--momentum', type=float, default=0.9,
                    help='The momentum hyperparameter to use for momentum SGD')
parser.add_argument('--decay', type=float, default=5e-4,
                    help='The L2 weight decay coefficient')
parser.add_argument('--seed', type=int, default=1,
                    help='Seed for the random number generator')
parser.add_argument('--cudnn', action='store_true',
                    help='Use CuDNN at training time -- usually faster, but non-deterministic')
parser.add_argument('--gpu', type=int, default=0,
                    help='GPU ID to use for training and inference (-1 for CPU)')
parser.add_argument('--eval_only', action='store_true',
                    help='Only run evaluation')
parser.add_argument('--generate_proto_only', action='store_true',
                    help='Only generate the network prototxt file')
args = parser.parse_args()

# Disable most Caffe logging (unless env var $GLOG_minloglevel is already set).
key = 'GLOG_minloglevel'
if not os.environ.get(key, ''):
    os.environ[key] = '3'

import caffe
from caffe.proto import caffe_pb2
from caffe import layers as L
from caffe import params as P

if args.gpu >= 0:
    caffe.set_mode_gpu()
    caffe.set_device(args.gpu)
else:
    caffe.set_mode_cpu()

def to_tempfile(file_content):
    """Dump the given string (e.g. the str() of a protobuf message) to a
    temporary file and return its filename."""
    with tempfile.NamedTemporaryFile(delete=False) as f:
        f.write(file_content)
    return f.name

weight_param = dict(lr_mult=1, decay_mult=1)
bias_param = dict(lr_mult=2, decay_mult=0)
learned_param = [weight_param, bias_param]
frozen_param = [dict(lr_mult=0)] * 2

zero_filler = dict(type='constant', value=0)
msra_filler = dict(type='msra')
uniform_filler = dict(type='uniform', min=-0.1, max=0.1)
fc_filler = dict(type='gaussian', std=0.005)
# The original AlexNet used the Gaussian initialization commented out below;
# we'll use the "MSRA" one instead, which scales the standard deviation of a
# convolutional filter's Gaussian initialization based on its fan-in
# (kernel area times input channels).
# conv_filler = dict(type='gaussian', std=0.01)
conv_filler = dict(type='msra')
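
# A minimal sketch of what the 'msra' filler computes (an assumption based on
# Caffe's MSRAFiller with its default FAN_IN normalization, not part of the
# original script): weights are drawn from a zero-mean Gaussian with
#
#   std = sqrt(2 / fan_in),   fan_in = input_channels * k * k
#
# e.g. for the first 3x3 conv over a 3-channel input, std = sqrt(2/27) ~= 0.272.
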
def conv_relu(bottom, ks, nout, stride=1, pad=0, group=1,
              param=learned_param,
              weight_filler=conv_filler, bias_filler=zero_filler,
              train=False):
    # Set the CAFFE engine to avoid CuDNN convolution, whose results are
    # non-deterministic.
    engine = {}
    if train and not args.cudnn:
        engine.update(engine=P.Convolution.CAFFE)
    conv = L.Convolution(bottom, kernel_size=ks, stride=stride,
                         num_output=nout, pad=pad, group=group, param=param,
                         weight_filler=weight_filler, bias_filler=bias_filler,
                         **engine)
    return conv, L.ReLU(conv, in_place=True)

def fc_relu(bottom, nout, param=learned_param,
            weight_filler=fc_filler, bias_filler=zero_filler):
    fc = L.InnerProduct(bottom, num_output=nout, param=param,
                        weight_filler=weight_filler, bias_filler=bias_filler)
    return fc, L.ReLU(fc, in_place=True)

def max_pool(bottom, ks, stride=1, train=False):
    # Set the CAFFE engine to avoid CuDNN pooling, whose results are
    # non-deterministic.
    engine = {}
    if train and not args.cudnn:
        engine.update(engine=P.Pooling.CAFFE)
    return L.Pooling(bottom, pool=P.Pooling.MAX, kernel_size=ks, stride=stride,
                     **engine)

def conv_bn_scale_relu(bottom, ks, nout, stride=1, pad=0, group=1,
                       param=learned_param,
                       weight_filler=conv_filler, bias_filler=zero_filler,
                       bias_term=False, train=False):
    engine = {}  # left empty: Caffe's default engine choice is used here
    if bias_term:
        conv = L.Convolution(bottom, kernel_size=ks, stride=stride,
                             num_output=nout, pad=pad, group=group, param=param,
                             weight_filler=weight_filler, bias_filler=bias_filler,
                             bias_term=bias_term, **engine)
    else:
        # Without a bias term, only the weight blob needs a param spec.
        conv = L.Convolution(bottom, kernel_size=ks, stride=stride,
                             num_output=nout, pad=pad, group=group,
                             param=[weight_param],
                             weight_filler=weight_filler, bias_filler=bias_filler,
                             bias_term=bias_term, **engine)
    bn = L.BatchNorm(conv, in_place=True)
    scale = L.Scale(bn, bias_term=True, in_place=True)
    relu = L.ReLU(scale, in_place=True)
    return conv, bn, scale, relu

def deconv_bn_scale_relu(bottom, ks, nout, stride=1, pad=0, group=1,
                         param=learned_param,
                         weight_filler=conv_filler, bias_filler=zero_filler,
                         bias_term=False, train=False):
    engine = {}
    # This call is what triggers the pycaffe Deconvolution problem: the
    # convolution-style keyword arguments (kernel_size, num_output, ...) are
    # passed directly to L.Deconvolution.
    deconv = L.Deconvolution(bottom, kernel_size=ks, stride=stride,
                             num_output=nout, pad=pad, param=[weight_param],
                             weight_filler=weight_filler, bias_filler=bias_filler,
                             bias_term=bias_term, **engine)
    # deconv = L.Deconvolution(bottom)
    bn = L.BatchNorm(deconv, in_place=True)
    scale = L.Scale(bn, bias_term=True, in_place=True)
    relu = L.ReLU(scale, in_place=True)
    return deconv, bn, scale, relu
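
# A commonly cited workaround for the problem above (a sketch, not the
# original code): pycaffe's NetSpec has no 'deconvolution_param' mapping for
# the Deconvolution layer type, so convolution-style keywords cannot be passed
# directly and must be wrapped explicitly in convolution_param:
#
#   deconv = L.Deconvolution(bottom,
#       convolution_param=dict(num_output=nout, kernel_size=ks, stride=stride,
#                              pad=pad, bias_term=bias_term,
#                              weight_filler=weight_filler,
#                              bias_filler=bias_filler),
#       param=[weight_param])
#
# param= stays a direct keyword because it is a field of LayerParameter itself.
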
def minialexnet(data, train=False, param=learned_param, with_labels=True):
    """
    Returns a protobuf text file specifying a fully convolutional
    encoder-decoder variant of the mini-places AlexNet skeleton
    (<caffe>/models/bvlc_alexnet/train_val.prototxt was the starting point).
    The encoder stacks 3x3 conv-BN-scale-ReLU blocks with 2x2 max pooling
    down to 8x8; the decoder mirrors it with deconv-BN-scale-ReLU blocks back
    up to the 128x128 input resolution, and the net is trained to reconstruct
    its input with a Euclidean loss. The LRN and fully connected layers of
    the original AlexNet are not included.
    """
    n = caffe.NetSpec()
    if data is not None:
        n.data = data
    conv_kwargs = dict(param=param, train=train)
    # Encoder. Output size 128.
    n.conv1_1, n.bn1_1, n.scale1_1, n.relu1_1 = conv_bn_scale_relu(n.data, 3, 24, stride=1, pad=1, **conv_kwargs)
    # Output size 128.
    n.conv1_2, n.bn1_2, n.scale1_2, n.relu1_2 = conv_bn_scale_relu(n.relu1_1, 3, 24, stride=1, pad=1, **conv_kwargs)
    # Output size 64.
    n.pool1 = max_pool(n.relu1_2, ks=2, stride=2)
    # Output size 64.
    n.conv2_1, n.bn2_1, n.scale2_1, n.relu2_1 = conv_bn_scale_relu(n.pool1, 3, 32, pad=1, **conv_kwargs)
    # Output size 64.
    n.conv2_2, n.bn2_2, n.scale2_2, n.relu2_2 = conv_bn_scale_relu(n.relu2_1, 3, 32, pad=1, **conv_kwargs)
    # Output size 32.
    n.pool2 = max_pool(n.relu2_2, ks=2, stride=2)
    # Output size 32.
    n.conv3_1, n.bn3_1, n.scale3_1, n.relu3_1 = conv_bn_scale_relu(n.pool2, 3, 48, pad=1, **conv_kwargs)
    # Output size 32.
    n.conv3_2, n.bn3_2, n.scale3_2, n.relu3_2 = conv_bn_scale_relu(n.relu3_1, 3, 48, pad=1, **conv_kwargs)
    # Output size 16.
    n.pool3 = max_pool(n.relu3_2, ks=2, stride=2)
    # Output size 16.
    n.conv4_1, n.bn4_1, n.scale4_1, n.relu4_1 = conv_bn_scale_relu(n.pool3, 3, 64, pad=1, **conv_kwargs)
    # Output size 16.
    n.conv4_2, n.bn4_2, n.scale4_2, n.relu4_2 = conv_bn_scale_relu(n.relu4_1, 3, 64, pad=1, **conv_kwargs)
    # Output size 8.
    n.pool4 = max_pool(n.relu4_2, ks=2, stride=2)
    # Bottleneck. Output size 8.
    n.conv_bn, n.bn_bn, n.scale_bn, n.relu_bn = conv_bn_scale_relu(n.pool4, 3, 3, pad=1, **conv_kwargs)
    # Decoder. Output size 16.
    n.dc5_1, n.bn5_1, n.scale5_1, n.relu5_1 = deconv_bn_scale_relu(n.relu_bn, 2, 4, pad=0, stride=2, **conv_kwargs)
    n.dc5_2, n.bn5_2, n.scale5_2, n.relu5_2 = deconv_bn_scale_relu(n.relu5_1, 3, 4, pad=1, **conv_kwargs)
    n.dc5_3, n.bn5_3, n.scale5_3, n.relu5_3 = deconv_bn_scale_relu(n.relu5_2, 3, 4, pad=1, **conv_kwargs)
    # Output size 32.
    n.dc6_1, n.bn6_1, n.scale6_1, n.relu6_1 = deconv_bn_scale_relu(n.relu5_3, 2, 4, pad=0, stride=2, **conv_kwargs)
    n.dc6_2, n.bn6_2, n.scale6_2, n.relu6_2 = deconv_bn_scale_relu(n.relu6_1, 3, 4, pad=1, **conv_kwargs)
    n.dc6_3, n.bn6_3, n.scale6_3, n.relu6_3 = deconv_bn_scale_relu(n.relu6_2, 3, 4, pad=1, **conv_kwargs)
    # Output size 64.
    n.dc7_1, n.bn7_1, n.scale7_1, n.relu7_1 = deconv_bn_scale_relu(n.relu6_3, 2, 4, pad=0, stride=2, **conv_kwargs)
    n.dc7_2, n.bn7_2, n.scale7_2, n.relu7_2 = deconv_bn_scale_relu(n.relu7_1, 3, 4, pad=1, **conv_kwargs)
    n.dc7_3, n.bn7_3, n.scale7_3, n.relu7_3 = deconv_bn_scale_relu(n.relu7_2, 3, 4, pad=1, **conv_kwargs)
    # Output size 128.
    n.dc8_1, n.bn8_1, n.scale8_1, n.relu8_1 = deconv_bn_scale_relu(n.relu7_3, 2, 4, pad=0, stride=2, **conv_kwargs)
    n.dc8_2, n.bn8_2, n.scale8_2, n.relu8_2 = deconv_bn_scale_relu(n.relu8_1, 3, 4, pad=1, **conv_kwargs)
    n.dc8_3, n.bn8_3, n.scale8_3, _ = deconv_bn_scale_relu(n.relu8_2, 3, 3, pad=1, **conv_kwargs)
    if with_labels:
        # Autoencoder-style target: regress the reconstruction onto the input.
        n.label = data
        n.loss = L.EuclideanLoss(n.scale8_3, n.label)
    else:
        n.ignored_label = data
        n.silence_label = L.Silence(n.ignored_label, ntop=0)
    with open('network.prototxt', 'w') as fout:
        fout.write(str(n.to_proto()))
    return to_tempfile(str(n.to_proto()))

def get_split(split):
    filename = '../../images/labels/%s.txt' % split
    if not os.path.exists(filename):
        raise IOError('Split data file not found: %s' % filename)
    return filename

def miniplaces_net(source, train=False, with_labels=True):
    # mean = [104, 117, 123]  # per-channel mean of the BGR image pixels
    mean = [127, 127, 127]
    transform_param = dict(mirror=False, crop_size=args.crop, mean_value=mean)
    # batch_size = args.batch if train else 100
    batch_size = args.batch
    # ntop=2: ImageData always produces a label top; it goes unused here since
    # the net reconstructs its input rather than predicting the label.
    places_data, places_labels = L.ImageData(transform_param=transform_param,
        source=source, root_folder=args.image_root, shuffle=train,
        batch_size=batch_size, ntop=2)
    return minialexnet(data=places_data, train=train, with_labels=with_labels)

def snapshot_prefix():
    return os.path.join(args.snapshot_dir, args.snapshot_prefix)

def snapshot_at_iteration(iteration):
    return '%s_iter_%d.caffemodel' % (snapshot_prefix(), iteration)

def miniplaces_solver(train_net_path, test_net_path=None):
    s = caffe_pb2.SolverParameter()

    # Specify locations of the train and (maybe) test networks.
    s.train_net = train_net_path
    if test_net_path is not None:
        s.test_net.append(test_net_path)
        # Test after every 1000 training iterations.
        s.test_interval = 1000
        # Set `test_iter` to test on 100 batches each time we test.
        # With test batch size 100, this covers the entire validation set of
        # 10K images (100 * 100 = 10K).
        s.test_iter.append(100)
    else:
        s.test_interval = args.iters + 1  # don't test during training

    # The number of batches over which to average the gradient.
    # Effectively boosts the training batch size by the given factor, without
    # affecting memory utilization.
    s.iter_size = args.iter_size
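    # e.g. with the defaults --batch 86 and --iter_size 3, each weight update
    # effectively averages gradients over 86 * 3 = 258 images.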
    # Solve using the stochastic gradient descent (SGD) algorithm.
    # Other choices include 'Adam' and 'RMSProp'.
    s.type = 'SGD'

    # The following settings (base_lr, lr_policy, gamma, stepsize, and
    # max_iter) define the learning rate schedule: at iteration k the rate is
    # base_lr * gamma ** (k // stepsize). With the defaults, the rate starts
    # at 0.01 and is multiplied by gamma = 0.63 every 2000 iterations.

    # Set the initial learning rate for SGD.
    s.base_lr = args.lr
    # Set `lr_policy` to define how the learning rate changes during training.
    # Here, we 'step' the learning rate by multiplying it by a factor `gamma`
    # every `stepsize` iterations.
    s.lr_policy = 'step'
    s.gamma = args.gamma
    s.stepsize = args.stepsize
    # `max_iter` is the number of times to update the net (training iterations).
    s.max_iter = args.iters

    # Set other SGD hyperparameters. Setting a non-zero `momentum` takes a
    # weighted average of the current gradient and previous gradients to make
    # learning more stable. L2 weight decay regularizes learning, to help
    # prevent the model from overfitting.
    s.momentum = args.momentum
    s.weight_decay = args.decay

    # Display the current training loss and accuracy every `display`
    # iterations. This has no effect here, as Caffe logging is disabled by
    # this script (see the GLOG_minloglevel setting above).
    s.display = args.disp
    # Number of training iterations over which to smooth the displayed loss.
    # The summed loss value (Iteration N, loss = X) will be averaged,
    # but individual loss values (Train net output #K: my_loss = X) won't be.
    s.average_loss = 10

    # Seed the RNG for deterministic results.
    # (May not be so deterministic if using CuDNN.)
    s.random_seed = args.seed

    # Snapshots are files used to store networks we've trained. Here, we'll
    # snapshot twice per learning rate step to the location specified by the
    # --snapshot_dir and --snapshot_prefix args.
    s.snapshot = args.stepsize // 2
    s.snapshot_prefix = snapshot_prefix()
    # Create the snapshot dir if it doesn't already exist.
    if not os.path.exists(args.snapshot_dir):
        os.makedirs(args.snapshot_dir)

    with open('solver.prototxt', 'w') as fout:
        fout.write(str(s))
    return to_tempfile(str(s))
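
# For reference, the learning rate under the 'step' policy configured above
# can be computed offline as follows (a sketch mirroring Caffe's 'step'
# policy; this helper is not part of the original script and is unused by it):
def step_lr(iteration, base_lr=0.01, gamma=0.63, stepsize=2000):
    """Return base_lr * gamma ** (iteration // stepsize)."""
    return base_lr * gamma ** (iteration // stepsize)
# e.g. step_lr(0) == 0.01, step_lr(2000) == 0.0063, step_lr(4000) ~= 0.00397.
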
def train_net(with_val_net=False):
    train_net_file = miniplaces_net(get_split('train'), train=True)
    # Set with_val_net=True to test during training.
    # Environment variable GLOG_minloglevel should be set to 0 to display
    # Caffe output in this case; otherwise, the test result will not be
    # displayed.
    if with_val_net:
        val_net_file = miniplaces_net(get_split('val'), train=False)
    else:
        val_net_file = None
    solver_file = miniplaces_solver(train_net_file, val_net_file)
    solver = caffe.get_solver(solver_file)
    outputs = sorted(solver.net.outputs)
    def str_output(output):
        value = solver.net.blobs[output].data
        if output.startswith('accuracy'):
            valstr = '%5.2f%%' % (100 * value, )
        else:
            valstr = '%6f' % value
        return '%s = %s' % (output, valstr)
    def disp_outputs(iteration, iter_pad_len=len(str(args.iters))):
        metrics = '; '.join(str_output(o) for o in outputs)
        return 'Iteration %*d: %s' % (iter_pad_len, iteration, metrics)
    # We could just call `solver.solve()` rather than `step()`ing in a loop.
    # (If we hadn't set GLOG_minloglevel = 3 at the top of this file, Caffe
    # would display loss/accuracy information during training.)
    previous_time = None
    for iteration in xrange(args.iters):
        solver.step(1)
        if (args.disp > 0) and (iteration % args.disp == 0):
            current_time = time.clock()
            if previous_time is None:
                benchmark = ''
            else:
                time_per_iter = (current_time - previous_time) / args.disp
                benchmark = ' (%5f s/it)' % time_per_iter
            previous_time = current_time
            print disp_outputs(iteration), benchmark
        # if (iteration > 0) and (iteration % (args.stepsize // 2) == 0):
        #     eval_net("train", iters=iteration)
        #     eval_net("val", iters=iteration)
    # Print the outputs for the last iteration.
    solver.net.forward()
    print disp_outputs(args.iters)
    solver.net.save(snapshot_at_iteration(args.iters))

def eval_net(split, K=5):
    # NOTE: this evaluation routine is left over from the classification
    # version of this script; the autoencoder net defined above produces no
    # 'probs' blob, so net.forward()['probs'] below would fail if this were
    # called. It is kept (and left uncalled) for reference.
    print 'Running evaluation for split:', split
    filenames = []
    labels = []
    split_file = get_split(split)
    with open(split_file, 'r') as f:
        for line in f.readlines():
            parts = line.split()
            assert 1 <= len(parts) <= 2, 'malformed line: %s' % line
            filenames.append(parts[0])
            if len(parts) > 1:
                labels.append(int(parts[1]))
    known_labels = (len(labels) > 0)
    if known_labels:
        assert len(labels) == len(filenames)
    else:
        # Create a file with 'dummy' labels (all 0s).
        split_file = to_tempfile(''.join('%s 0\n' % name for name in filenames))
    test_net_file = miniplaces_net(split_file, train=False, with_labels=False)
    weights_file = snapshot_at_iteration(args.iters)
    net = caffe.Net(test_net_file, weights_file, caffe.TEST)
    top_k_predictions = np.zeros((len(filenames), K), dtype=np.int32)
    if known_labels:
        correct_label_probs = np.zeros(len(filenames))
    offset = 0
    all_probs = np.zeros((len(filenames), 10))
    while offset < len(filenames):
        probs = net.forward()['probs']
        for prob in probs:
            all_probs[offset] = prob
            top_k_predictions[offset] = (-prob).argsort()[:K]
            if known_labels:
                correct_label_probs[offset] = prob[labels[offset]]
            offset += 1
            if offset >= len(filenames):
                break
    if known_labels:
        def accuracy_at_k(preds, labels, k):
            assert len(preds) == len(labels)
            num_correct = sum(l in p[:k] for p, l in zip(preds, labels))
            return num_correct / len(preds)
        for k in [1, K]:
            accuracy = 100 * accuracy_at_k(top_k_predictions, labels, k)
            print '\tAccuracy at %d = %4.2f%%' % (k, accuracy)
        cross_ent_error = -np.log(correct_label_probs).mean()
        print '\tSoftmax cross-entropy error = %.4f' % (cross_ent_error, )
    else:
        print 'Not computing accuracy; ground truth unknown for split:', split
    # all_probs = np.vstack(all_probs)
    if split != 'train':
        np.savetxt("prob.%s.csv" % split, all_probs, delimiter=",")
    filename = 'top_%d_predictions.%s.csv' % (K, split)
    with open(filename, 'w') as f:
        f.write(','.join(['image'] + ['label%d' % i for i in range(1, K + 1)]))
        f.write('\n')
        f.write(''.join('%s,%s\n' % (image, ','.join(str(p) for p in preds))
                        for image, preds in zip(filenames, top_k_predictions)))
    print 'Predictions for split %s dumped to: %s' % (split, filename)

if __name__ == '__main__':
    if args.generate_proto_only:
        miniplaces_net(source="", train=True, with_labels=False)
        exit(0)
    if not args.eval_only:
        print 'Training net...\n'
        train_net()
        print '\nTraining complete. Evaluating...\n'
    # for split in ('train', 'val', 'test'):
    #     eval_net(split)
    # print 'Evaluation complete.'