Hyperparameter optimization scripts
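Three cluster job-launcher scripts for hyperparameter search: a MATLAB driver that samples hyperparameters and submits Torch jobs (submitjobs), a Python driver that generates Caffe solver/net prototxts and submits training jobs, and a second MATLAB driver for the KITTI 227x227 Caffe experiments (kitti227_submitjobs). All three target HTCondor or SLURM and record each job's sampled hyperparameters (hyperparams.xls / hyperparams.csv).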
function submitjobs(nosubmit, numjobs, main)
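% SUBMITJOBS Sample hyperparameters and submit one Torch training job per
% draw to HTCondor or SLURM. Usage sketch (see the nargin defaults below):
%   submitjobs(true, 5)   % prepare 5 jobs, pausing in the debugger before each submit
%   submitjobs(false, 5)  % prepare and submit 5 jobs
% Each job's sampled values are also logged to hyperparams.xls.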
try
xlwrite_path='../data_utils/xlwrite/';
addpath(xlwrite_path);
javaaddpath([xlwrite_path 'poi_library/poi-3.8-20120326.jar']);
javaaddpath([xlwrite_path 'poi_library/poi-ooxml-3.8-20120326.jar']);
javaaddpath([xlwrite_path 'poi_library/poi-ooxml-schemas-3.8-20120326.jar']);
javaaddpath([xlwrite_path 'poi_library/xmlbeans-2.3.0.jar']);
javaaddpath([xlwrite_path 'poi_library/dom4j-1.6.1.jar']);
javaaddpath([xlwrite_path 'poi_library/stax-api-1.0.1.jar']);
if nargin<1, nosubmit =true; end
if nargin<2, numjobs=1; end
if nargin<3, main='./lookahead_attention.lua'; end
% --- establishing system setup
system('hostname > tmp.txt');
if ~isempty(regexp(fileread('tmp.txt'), 'vision', 'once')) || ~isempty(regexp(fileread('tmp.txt'), 'adriana', 'once')) || ~isempty(regexp(fileread('tmp.txt'), 'eldar', 'once'))
hpcsystem='condor';
else
hpcsystem='slurm';
end
if ~isempty(regexp(fileread('tmp.txt'), 'maverick', 'once'))
clustername='maverick';
elseif ~isempty(regexp(fileread('tmp.txt'), 'stampede', 'once'))
clustername='stampede';
else
clustername='condor';
end
if strcmp(hpcsystem, 'condor')
% get clusterno through condor_q or condor_history commands
clusterno=-1;
system('condor_q -format "%d\n" ClusterId > tmp.txt 2> /dev/null');
allclust=load('tmp.txt');
if ~isempty(allclust)
clusterno=max(max(allclust), clusterno);
end
system('condor_history -format "%d\n" ClusterId 2> /dev/null | head -50 > tmp.txt');
allclust=load('tmp.txt');
if ~isempty(allclust)
clusterno=max(max(allclust), clusterno);
end
else % do the next best thing - assign another unique ID
origclusterno=load('.lastclusterno');
clusterno=origclusterno;
end
% --- hpcjob template construction
hpcjob_template.main=main;
hpcjob_template.standard_args=' --cuda --no_debug --no_loadInit --full_data';
%hpcjob_template.standard_args=' --cuda --no_debug --no_loadInit';
hpcjob_template.useGPU=true;
% --- declaring hyperparameter ranges
hypno=0;
autoreport_flag=false;
autoreport_oldjobs=0;
%hypno=hypno+1; hyp(hypno)=HyperParam('learningRate', 'log_uniform', [-2.3,-1.7]);
%hypno=hypno+1; hyp(hypno)=HyperParam('learningRate', 'log_uniform', [-2 -1]);
%hypno=hypno+1; hyp(hypno)=HyperParam('learningRate', 'log_uniform', [-3,0]);
hypno=hypno+1; hyp(hypno)=HyperParam('initModel', 'fix', '/home/01932/dineshj/save/2002450.dat');
%hypno=hypno+1; hyp(hypno)=HyperParam('initModel', 'fix', '/home/01932/dineshj/save/2002365.dat');
%hypno=hypno+1; hyp(hypno)=HyperParam('initModel', 'fix', '/home/01932/dineshj/save/2002329.dat');
%hypno=hypno+1; hyp(hypno)=HyperParam('learningRate', 'log_uniform', [-4,-2.5]);
%hypno=hypno+1; hyp(hypno)=HyperParam('learningRate', 'sequence', logspace(-2,-1,numjobs));
hypno=hypno+1; hyp(hypno)=HyperParam('learningRate', 'fix', 0.00316);
hypno=hypno+1; hyp(hypno)=HyperParam('finetune_lrMult', 'fix', -1);
hypno=hypno+1; hyp(hypno)=HyperParam('finetuneTopFlag', 'fix', +1);
hypno=hypno+1; hyp(hypno)=HyperParam('actOnTime', 'fix', +1);
hypno=hypno+1; hyp(hypno)=HyperParam('actOnElev', 'fix', +1);
%hypno=hypno+1; hyp(hypno)=HyperParam('learningRate', 'fix', 0.04);
%hypno=hypno+1; hyp(hypno)=HyperParam('learningRate', 'fix', 0.0224);
%hypno=hypno+1; hyp(hypno)=HyperParam('learningRate', 'fix', 0.01);
%hypno=hypno+1; hyp(hypno)=HyperParam('lookahead_loss_wt', 'log_uniform', [1.5,2.5]);
hypno=hypno+1; hyp(hypno)=HyperParam('weightDecay', 'fix', 0.005);
hypno=hypno+1; hyp(hypno)=HyperParam('featDropout', 'fix', 2e-5);
%hypno=hypno+1; hyp(hypno)=HyperParam('featDropout', 'fix', 0);
hypno=hypno+1; hyp(hypno)=HyperParam('combineDropout', 'fix', 0.5);
hypno=hypno+1; hyp(hypno)=HyperParam('batchNormFlag', 'fix', +1);
%hypno=hypno+1; hyp(hypno)=HyperParam('combineDropout', 'fix', 0);
hypno=hypno+1; hyp(hypno)=HyperParam('lookahead_loss_wt', 'fix', 100);
%hypno=hypno+1; hyp(hypno)=HyperParam('equiv_reg_wt', 'sequence', logspace(-4,-0,numjobs));
hypno=hypno+1; hyp(hypno)=HyperParam('equiv_reg_wt', 'fix', 42);
%hypno=hypno+1; hyp(hypno)=HyperParam('equiv_reg_wt', 'fix', 0);
%hypno=hypno+1; hyp(hypno)=HyperParam('num_canonical_acts', 'pmf', [1:9; 1./(1:9).^2]);
hypno=hypno+1; hyp(hypno)=HyperParam('lookAheadActorFlag', 'fix', -1);
tmp=getHyperParam(hyp, 'lookAheadActorFlag');
if ~(strcmp(tmp.dist, 'fix') && tmp.params<=0) % NOTE: both branches below are currently identical; the split is kept for when lookAheadActor is enabled
hypno=hypno+1; hyp(hypno)=HyperParam('num_canonical_acts', 'fix', 0); % relevant only if lookAheadActor
hypno=hypno+1; hyp(hypno)=HyperParam('lookAheadClassifyFlag', 'fix', -1); % relevant only if lookAheadActor
else
hypno=hypno+1; hyp(hypno)=HyperParam('num_canonical_acts', 'fix', 0);
hypno=hypno+1; hyp(hypno)=HyperParam('lookAheadClassifyFlag', 'fix', -1); % relevant only if lookAheadActor
end
hypno=hypno+1; hyp(hypno)=HyperParam('rho', 'fix', 3);
hypno=hypno+1; hyp(hypno)=HyperParam('greedyLossFlag', 'fix', +1);
hypno=hypno+1; hyp(hypno)=HyperParam('shareClassifiersFlag', 'fix', -1);
hypno=hypno+1; hyp(hypno)=HyperParam('identFeedbackFlag', 'fix', -1);
hypno=hypno+1; hyp(hypno)=HyperParam('initFeedbackFlag', 'fix', +1);
hypno=hypno+1; hyp(hypno)=HyperParam('relativeActionFlag', 'fix', +1);
hypno=hypno+1; hyp(hypno)=HyperParam('simplePatchSensorFlag', 'fix', +1);
hypno=hypno+1; hyp(hypno)=HyperParam('maxTries', 'fix', 50);
hypno=hypno+1; hyp(hypno)=HyperParam('pretrainModeEpochs', 'fix', 0);
%hypno=hypno+1; hyp(hypno)=HyperParam('pretrainModeEpochs', 'sequence', [0, 10, 0, 10]);
hypno=hypno+1; hyp(hypno)=HyperParam('lookahead_bottleneck', 'fix', 100);
hypno=hypno+1; hyp(hypno)=HyperParam('lookahead_distance', 'fix', 'cosine');
%hypno=hypno+1; hyp(hypno)=HyperParam('maxEpoch', 'fix', 2000);
%hypno=hypno+1; hyp(hypno)=HyperParam('manual_seed', 'sequence', [79836, 94301, 68372, 13209, 72273, 11036, 11750, 64072, 32882, 65382]);
%hypno=hypno+1; hyp(hypno)=HyperParam('manual_seed', 'sequence', [ceil(rand(numjobs,1)*1e5)]);
hypno=hypno+1; hyp(hypno)=HyperParam('manual_seed', 'sequence', [1000 2000 6000 4000 5000]);
%hypno=hypno+1; hyp(hypno)=HyperParam('manual_seed', 'fix', 38828);
%hypno=hypno+1; hyp(hypno)=HyperParam('manual_seed', 'fix', 6000);
hypno=hypno+1; hyp(hypno)=HyperParam('report_res_iter', 'fix', 20);
hypno=hypno+1; hyp(hypno)=HyperParam('randomActionsFlag', 'fix', 0); % to implement random actions baseline
%hypno=hypno+1; hyp(hypno)=HyperParam('randomActionsFlag', 'sequence', 0); % to implement random actions baseline
time=200; % job wall-time in minutes (passed to the SLURM -t flag)
xlscells={};
xlscells{1,1}='jobno';
xlscells{1,2}='commit';
[~,curr_commit]=system('git log --format="%h" -n 1');
curr_commit=curr_commit(9:15); % keep the 7-char short hash (the offset presumably skips shell startup output prepended by system())
for j=1:numel(hyp)
xlscells{1,j+2}=hyp(j).name;
end
for i=1:numjobs
fprintf(['\n' repmat('-',1,20) '\n']);
clusterno = clusterno+1;
if i==1
start_clusterno=clusterno;
stop_clusterno=clusterno+numjobs-1;
end
tmp=fieldnames(hpcjob_template);
for fieldno=1:numel(tmp)
hpcjob(i).(tmp{fieldno})=hpcjob_template.(tmp{fieldno});
end
hpcjob(i).clusterno=clusterno;
xlscells{i+1,1}=clusterno;
xlscells{i+1,2}=curr_commit;
% ----- sampling random hyperparameters
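% Each case below maps hyp(j).params to a sampled value: e.g. 'log_uniform'
% with params [-3,0] draws 10^u for u ~ Uniform(-3,0), 'pmf' draws from the
% given discrete distribution, and 'sequence' deterministically assigns
% params(i) to the i-th job.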
for j=1:numel(hyp)
switch hyp(j).dist
case 'uniform'
% params: [lb, ub]
val=rand(1)*(hyp(j).params(2)-hyp(j).params(1))+hyp(j).params(1);
case 'log_uniform'
% params: [lb, ub] in log space
val=10^(rand(1)*(hyp(j).params(2)-hyp(j).params(1))+hyp(j).params(1));
case 'disc_uniform'
% params: [values]
val=randsample(hyp(j).params, 1);
case 'pmf'
% params: [values; weights]
pmf=hyp(j).params(2,:);
pmf=pmf/sum(pmf);
val=hyp(j).params(1,mnrnd(1,pmf));
%cmf=cumsum(pmf);
%val=hyp(j).params(find(rand(1)<cmf, 1));
case 'gaussian'
% params: [mean, std]
val=randn(1)*hyp(j).params(2)+hyp(j).params(1);
case 'sequence'
% deterministic progression
val=hyp(j).params(i);
case 'fix'
% fixed to single value
val=hyp(j).params;
otherwise
error('unknown distribution %s', hyp(j).dist);
end
if isfield(hpcjob(i), 'args')
hpcjob(i).args(end+1)=struct('name', hyp(j).name, 'val', val);
else
hpcjob(i).args(1)=struct('name', hyp(j).name, 'val', val);
end
xlscells{i+1,j+2}=val;
end
if strcmp(hpcsystem,'slurm')
hpcjob(i).args(end+1)=struct('name', 'sys_cmd', 'val', sprintf('"rsync -avrz -e ssh ../clust_runs/*%d* dineshj@adriana.cs.utexas.edu:/scratch/vision/dineshj/active/clust_runs/; rsync -avrz -e ssh ../clust_runs/*%d* dineshj@dineshj.csres.utexas.edu:/home/dineshj/Documents/clust_runs/;"', clusterno, clusterno));
hpcjob(i).args(end+1)=struct('name', 'sys_cmd_iter', 'val', 10);
if autoreport_flag && i==numjobs
hpcjob(i).args(end+1)= struct('name', 'sys_cmd2', 'val', sprintf('th collate_results.lua --start %d --stop %d;"', start_clusterno-autoreport_oldjobs, stop_clusterno));
end
end
%-- setting jobno and logger file
hpcjob(i).args(end+1)=struct('name', 'loggerfile', 'val', sprintf('../clust_runs/%d.rec', clusterno));
hpcjob(i).args(end+1)=struct('name', 'jobno', 'val', clusterno);
fprintf('Sampled args(%d):\n', clusterno);
fprintf('\t---\n')
arg_disp(hpcjob(i));
fprintf('\t---\n')
delete('hyperparams.xls');
xlwrite('hyperparams.xls', xlscells);
%-- creating submit file and submitting
switch hpcsystem
case 'condor'
condor_submitFile=sprintf('../clust_runs/%d.submit.%s', clusterno, hpcsystem);
fprintf('Submit file %s\n', condor_submitFile);
struct2condorsubmit(hpcjob(i), condor_submitFile);
if nosubmit, keyboard; end % drop into the debugger before submitting (dbquit to cancel)
status=system(sprintf('condor_submit %s 2> /dev/null', condor_submitFile));
assert(status==0);
case 'slurm'
slurm_submitFile=sprintf('../clust_runs/%d.submit.%s', clusterno, hpcsystem);
fprintf('Submit file %s\n', slurm_submitFile);
if ~exist('time', 'var')
time=20;
end
struct2slurmsubmit(hpcjob(i), slurm_submitFile, clusterno, clustername, time);
if nosubmit, keyboard; end
status=system(sprintf('sbatch %s', slurm_submitFile));
assert(status==0);
% updating lastclusterno file
origclusterno=origclusterno+1;
f=fopen('.lastclusterno', 'w');
fprintf(f, '%d', origclusterno);
fclose(f);
otherwise
error('Unknown hpc system');
end
end
catch err
getReport(err)
keyboard
end
end
function struct2slurmsubmit(jobstruct, filename, clusterno, cluster, num_mins)
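% STRUCT2SLURMSUBMIT Write a SLURM batch script for one job. The emitted
% file looks roughly like this (a sketch; flags vary with useGPU/cluster):
%   #!/bin/bash
%   #SBATCH -J stark
%   #SBATCH -o ../clust_runs/<clusterno>.out
%   #SBATCH -n 1
%   #SBATCH -p gpu
%   #SBATCH -t 00:<num_mins>:00
%   th <main> <standard_args> --<name1> <val1> --<name2> <val2> ...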
FILE=fopen(filename,'w');
fprintf(FILE, '#!/bin/bash\n');
fprintf(FILE, '#SBATCH -J stark\n');
fprintf(FILE, '#SBATCH -o ../clust_runs/%d.out\n', clusterno);
fprintf(FILE, '#SBATCH -n 1\n');
if jobstruct.useGPU
switch cluster
case 'stampede'
fprintf(FILE, '#SBATCH -A Visual-Recognition\n');
%fprintf(FILE, '#SBATCH -A Fine-Tuning-CNNs\n');
fprintf(FILE, '#SBATCH -p gpu\n');
%fprintf(FILE, '#SBATCH -p vis\n');
case 'maverick'
fprintf(FILE, '#SBATCH -A Visual-Recognition\n');
%fprintf(FILE, '#SBATCH -A Fine-Tuning-CNNs\n');
fprintf(FILE, '#SBATCH -p gpu\n');
end
else
fprintf(FILE, '#SBATCH -p normal\n');
end
fprintf(FILE, '#SBATCH -t 00:%02d:00\n', num_mins); % num_mins can exceed 59 (e.g. 200); SLURM is assumed to accept/normalize such HH:MM:SS values
switch cluster
case 'stampede'
fprintf(FILE, 'th %s %s', jobstruct.main, jobstruct.standard_args);
case 'maverick'
fprintf(FILE, 'th %s %s', jobstruct.main, jobstruct.standard_args);
otherwise
error('Unknown cluster name');
end
arg_string='';
for argno=1:numel(jobstruct.args)
arg_string=[arg_string, ' --', jobstruct.args(argno).name, ' ', arg2str(jobstruct.args(argno).val)];
end
fprintf(FILE, '%s', arg_string); % print verbatim so stray '%' in argument values are not treated as format specs
fclose(FILE);
end
function struct2condorsubmit(jobstruct, filename)
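% STRUCT2CONDORSUBMIT Write an HTCondor submit description for one job,
% requesting a GPU slot when jobstruct.useGPU is set and routing logs to
% ../clust_runs/<clusterno>.{log,out,err}.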
FILE=fopen(filename,'w');
fprintf(FILE, '+Group="GRAD"\n');
fprintf(FILE, '+Project="AI_ROBOTICS"\n');
fprintf(FILE, '+ProjectDescription=""\n');
if jobstruct.useGPU
fprintf(FILE, '+GPUJob=true\n');
fprintf(FILE, 'Requirements=TARGET.GPUSlot\n');
end
fprintf(FILE, 'Universe = vanilla\n');
fprintf(FILE, 'Getenv = True\n');
fprintf(FILE, 'Log = ../clust_runs/%d.log\n', jobstruct.clusterno);
fprintf(FILE, 'Output = ../clust_runs/%d.out\n', jobstruct.clusterno);
fprintf(FILE, 'Error = ../clust_runs/%d.err\n', jobstruct.clusterno);
fprintf(FILE, 'Executable=/vision/vision_users/dineshj/torch_cuda/bin/th \n');
fprintf(FILE, 'Arguments= %s %s', jobstruct.main, jobstruct.standard_args);
arg_string='';
for argno=1:numel(jobstruct.args)
arg_string=[arg_string, ' --', jobstruct.args(argno).name, ' ', arg2str(jobstruct.args(argno).val)];
end
fprintf(FILE, '%s', arg_string); % print verbatim so stray '%' in argument values are not treated as format specs
fprintf(FILE, '\nQueue %d', 1);
fclose(FILE);
end
function str=arg2str(arg)
if ischar(arg)
str=arg;
return
end
if isnumeric(arg)
str=num2str(arg);
return;
end
error('arg2str: unsupported argument type %s', class(arg)); % fail loudly rather than returning an undefined str
end
function HyperParamObject = HyperParam(name, dist, params)
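% HYPERPARAM Construct a hyperparameter spec. dist is one of 'fix',
% 'uniform', 'log_uniform', 'disc_uniform', 'pmf', 'gaussian', or
% 'sequence'; the shape of params depends on dist (see the sampling
% switch in submitjobs).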
HyperParamObject = struct('name', name, 'dist', dist, 'params', params);
end
function param = getHyperParam(hyp_array, param_name)
hyp_names=arrayfun(@(x) x.name, hyp_array, 'UniformOutput', false);
tmp=find(strcmp(hyp_names, param_name));
assert(~isempty(tmp), sprintf('No parameter %s\n', param_name));
assert(numel(tmp)<2, sprintf('More than one parameter %s\n', param_name));
param=hyp_array(tmp); return;
end
function arg_disp(jobstruct)
if ~isfield(jobstruct, 'args')
return
end
for i=1:numel(jobstruct.args)
fprintf('%s\t= %s\n', jobstruct.args(i).name, arg2str(jobstruct.args(i).val));
end
end
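# ----------------------------------------------------------------------
# Python/Caffe variant of the job launcher: generates solver/train/test/
# deploy prototxts per job, logs hyperparameters to hyperparams.csv, and
# submits to HTCondor or SLURM. Usage sketch (the script name below is a
# placeholder for whatever this file is saved as):
#   python submit_caffe_jobs.py -n 5 --config ved_drlim --nosubmit
# ----------------------------------------------------------------------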
import sys
import csv
sys.path.insert(0, "../kitti_codes/")
import socket
import time
import re
import argparse
import numpy as np
from subprocess import call
from IPython.core.debugger import Tracer; debug_here = Tracer()
machine_name=socket.gethostname();
if (re.search('vision',machine_name) or
re.search('eldar', machine_name) or
re.search('adriana', machine_name) or
re.search('jaechul', machine_name)):
hpcsystem='condor';
#error('Not implemented');
sys.path.insert(0, "/scratch/vision/dineshj/caffe2/distribute_CPU/python")
else:
hpcsystem='slurm';
sys.path.insert(0, "/work/01932/dineshj/caffe2/python")
import caffe
from caffe import layers as L
from caffe import params as P
import layer_stack as LS
if re.search('maverick', machine_name):
clustername='maverick';
elif re.search('stampede', machine_name):
clustername='stampede';
else:
clustername='condor';
#hpcsystem='slurm'
#clustername='maverick'
#read clusterno
lastclusterno=int(open('.lastclusterno').read());
print("Last clusterno: %d" % lastclusterno);
clusterno=lastclusterno;
resume_flag=False
snapshot=''
finetune_flag=False
weights=''
def main():
global clusterno
#config='drlim';
base_solver=LS.CaffeSolver(debug=args.debug)
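# base_solver.sp is assumed to be a dict of solver.prototxt fields keyed
# by name, with values stored as strings (hence the str(...) assignments
# below).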
if args.config == 'cls':
num_cls=397;
cls_batchsize=64;
cls_loss_weights=10*np.ones(args.numjobs);
base_solver.sp["max_iter"]=str(30000);
base_solver.sp["snapshot"]=str(10000);
base_solver.sp["stepsize"]=str(5000); base_solver.sp["gamma"]=str(0.5);
base_solver.sp["weight_decay"]=str(0.0005);
runtime=720;
learning_rates=np.logspace(-4,-4,args.numjobs);
elif args.config == 'drlim' or args.config == "ved_drlim":
num_cls=397;
nonDiscrete_flag=False
dynamicCrop_flag=False
pair_batchsize=64; cls_batchsize=64;
#drlim_loss_weights=np.ones(args.numjobs);
drlim_loss_weights=np.zeros(args.numjobs);
#trans_loss_weights=1*np.ones(args.numjobs)*(1 if args.config=='ved_drlim' else 0);
#trans_loss_weights=np.logspace(-2,0,args.numjobs)*(1 if args.config=='ved_drlim' else 0);
#trans_loss_weights=np.linspace(0.1,0.9,args.numjobs)*(1 if args.config=='ved_drlim' else 0);
trans_loss_weights=np.ones(args.numjobs)*(1 if args.config=='ved_drlim' else 0);
drlim_loss_margins=100*np.ones(args.numjobs);
trans_loss_margins=100*np.ones(args.numjobs);
cls_loss_weights=10*np.ones(args.numjobs);
base_solver.sp["max_iter"]=str(70000);
base_solver.sp["snapshot"]=str(10000);
base_solver.sp["stepsize"]=str(5000); base_solver.sp["gamma"]=str(0.5);
base_solver.sp["weight_decay"]=str(0.005);
runtime=720;
learning_rates=np.logspace(-4.5,-2.5,args.numjobs);
csvfile = open('hyperparams.csv', 'w');
fieldnames=[
"jobno" ,
"config" ,
"base_lr" ,
"nonDiscrete_flag" ,
"dynamicCrop_flag" ,
"pair_batchsize" ,
"cls_batchsize" ,
"cls_loss_weight" ,
"drlim_loss_weight" ,
"trans_loss_weight" ,
"drlim_loss_margin" ,
"trans_loss_margin" ,
"weight_decay" ,
"max_iter" ,
"snapshot" ,
"stepsize" ,
"gamma" ,
"type" ,
"momentum" ,
"momentum2" ,
"random_seed"
];
csvwriter= csv.DictWriter(csvfile, fieldnames=fieldnames);
csvwriter.writeheader()
for jobno in range(args.numjobs):
if jobno>0:
print("Delay...");
time.sleep(5);
clusterno=clusterno+1;
print ("Creating files for job clusterno %d" % clusterno);
solver_file="../condor/" + str(clusterno)+'_solver.prototxt'
trainnet_file="../condor/" + str(clusterno) + '_trainnet.prototxt'
testnet_file="../condor/" + str(clusterno) + '_testnet.prototxt'
deploynet_file="../condor/" + str(clusterno) + '_deploynet.prototxt'
if args.config=='drlim' or args.config=='ved_drlim':
#pretrain=True
solver=base_solver;
solver.sp["random_seed"]=str(clusterno);
solver.sp["train_net"]= '"'+trainnet_file+'"';
solver.sp["test_net"]= '"'+testnet_file+'"';
solver.sp["snapshot_prefix"]='"'+"../caffe_snapshots/"+ str(clusterno) + "_snap"+'"';
solver.sp["base_lr"]= str(learning_rates[jobno]);
def net_phase(phase): # phase == 'train', 'test' or 'deploy'
assert(phase=='train' or
phase=='test' or
phase=='deploy'
)
net=caffe.NetSpec()
## Data layers ########################
if phase=='train':
if not dynamicCrop_flag:
net["pair_data"]=L.Data(
source="../kitti_codes/KITTI_files/trn_trans_pairs_full-clust6_3-nbd10/" ,
transform_param=dict(
mean_value=[104,117,123,104,117,123]),
batch_size=pair_batchsize,
ntop=1,
backend=P.Data.LMDB,
#include=dict(phase=caffe.TRAIN),
name="pair_kitti_data_trn");
net["trans_labelvec"]=L.HDF5Data(
source="../kitti_codes/trn_full_trans_pairs-clust6_3-nbd10_labels.txt",
batch_size=pair_batchsize,
ntop=1,
#include=dict(phase=caffe.TRAIN),
name="pair_kitti_label_trn"
)
else:
net["pair_data"]=L.Data(
source="../kitti_codes/KITTI/trn_trans_pairs_370x1226_clust6_3-nbd7/" ,
transform_param=dict(
mirror=False,
crop_size=227,
mean_value=[104,117,123,104,117,123]),
batch_size=pair_batchsize,
ntop=1,
backend=P.Data.LMDB,
#include=dict(phase=caffe.TRAIN),
name="pair_kitti_data_trn");
net["trans_labelvec"]=L.HDF5Data(
source="../kitti_codes/trn_370x1226_trans_pairs-clust6_3-nbd7_labels.txt",
batch_size=pair_batchsize,
ntop=1,
#include=dict(phase=caffe.TRAIN),
name="pair_kitti_label_trn"
)
elif phase=='test':
if not dynamicCrop_flag:
net["pair_data"]=L.Data(
source="../kitti_codes/KITTI_files/trn_trans_pairs_full-clust6_3-nbd10/" ,
transform_param=dict(
mean_value=[104,117,123,104,117,123]),
batch_size=pair_batchsize,
ntop=1,
backend=P.Data.LMDB,
#include=dict(phase=caffe.TRAIN),
name="pair_kitti_data_tst");
net["trans_labelvec"]=L.HDF5Data(
source="../kitti_codes/trn_full_trans_pairs-clust6_3-nbd10_labels.txt",
batch_size=pair_batchsize,
ntop=1,
#include=dict(phase=caffe.TRAIN),
name="pair_kitti_label_tst"
)
else:
net["pair_data"]=L.Data(
source="../kitti_codes/KITTI/tst_trans_pairs_370x1226_clust6_3-nbd7/" ,
transform_param=dict(
mirror=False,
crop_size=227,
mean_value=[104,117,123,104,117,123]),
batch_size=pair_batchsize,
ntop=1,
backend=P.Data.LMDB,
#include=dict(phase=caffe.TRAIN),
name="pair_kitti_data_tst");
net["trans_labelvec"]=L.HDF5Data(
source="../kitti_codes/tst_370x1226_trans_pairs-clust6_3-nbd7_labels.txt",
batch_size=pair_batchsize,
ntop=1,
#include=dict(phase=caffe.TRAIN),
name="pair_kitti_label_tst"
)
#net["pair_data"], net["pair_labelvec"] = L.DummyData(name="dummy_pair_data",
# ntop=2,
# shape=[dict(dim=[pair_batchsize, 6, 227, 227]), dict(dim=[pair_batchsize, 1, 1, 7])]
# );
if phase=='train' or phase == 'test': # no deploy
net["a_data"], net["b_data"] = L.Slice(
net["pair_data"], name="slice_data",
slice_param=dict(
slice_dim=1,
slice_point=[3]),
ntop=2
);
net["sim_label"], net["trans_label1"], net["trans_label2"], net["trans_label3"], net["trans_mot_labels"] = L.Slice(
net["trans_labelvec"],
name="slice_pair_label",
slice_param=dict(
slice_point=[1, 2, 3, 4]
),
ntop=5,
)
net["silent"]=L.Silence(
net["sim_label"],
net["trans_label1"],
net["trans_label2"],
net["trans_label3"],
net["trans_mot_labels"],
ntop=0
)
## Siamese network ########################
net=LS.generate_conv1_to_bn6(
net,
blob_prefix="a_",
layer_prefix="a_",
param_prefix="shared_",
bottom_blob="a_data",
top_blob="a_top",
num_dropouts=0,
learn_all=True,
in_place_pool5=False
);
net=LS.generate_conv1_to_bn6(
net,
blob_prefix="b_",
layer_prefix="b_",
param_prefix="shared_",
bottom_blob="b_data",
top_blob="b_top",
num_dropouts=0,
learn_all=True,
in_place_pool5=False
);
## drlim loss ########################
net=LS.generate_contrastive_loss(
net,
blob_prefix="drlim_",
layer_prefix="drlim_",
param_prefix="",
bottom_blob=["a_top", "b_top", "sim_label"],
loss_weight=drlim_loss_weights[jobno],
loss_margin=drlim_loss_margins[jobno],
learn_all=True
);
## equivariance loss ########################
if args.config=='ved_drlim':
if not nonDiscrete_flag:
num_transforms=3;
for i in range(num_transforms):
net=LS.generate_equivariant_map(
net,
blob_prefix="trans"+str(i+1)+"_",
layer_prefix="trans"+str(i+1)+"_",
bottom_blob="b_top",
top_blob="c_top" + str(i+1),
bottleneck_size=128,
nonDiscrete_flag=nonDiscrete_flag,
learn_all=True
)
net=LS.generate_contrastive_loss(
net,
blob_prefix="trans_",
blob_suffix=str(i+1),
layer_prefix="trans_",
layer_suffix=str(i+1),
bottom_blob=["a_top", "c_top" + str(i+1), "trans_label" + str(i+1)],
loss_weight=trans_loss_weights[jobno],
loss_margin=trans_loss_margins[jobno],
learn_all=True
)
else:
net=LS.generate_equivariant_map(
net,
blob_prefix="trans_",
layer_prefix="trans_",
bottom_blob="b_top",
top_blob="c_top",
bottleneck_size=128,
nonDiscrete_flag=nonDiscrete_flag,
motion_blob="trans_mot_labels",
learn_all=True
)
net["trans_loss"]=L.EuclideanLoss(
net["a_top"],
net["c_top"],
loss_weight=trans_loss_weights[jobno]
)
## classification pipeline (for either monitoring, or training)
if phase == 'train':
net["cls_data"], net["cls_label"]=L.Data(
source="../kitti_codes/SUN/pulkit_lmdbs/sun_imSz256_ntpc5_run1_train-lmdb" ,
transform_param=dict(
mirror=True,
crop_size=227,
mean_value=[104,117,123]),
batch_size=cls_batchsize,
ntop=2,
backend=P.Data.LMDB,
#include=dict(phase=caffe.TRAIN),
name="cls_sun_trn");
elif phase=='test':
net["cls_data"], net["cls_label"]=L.Data(
source="../kitti_codes/SUN/pulkit_lmdbs/sun_imSz256_ntpc50_run1_test-lmdb/" ,
transform_param=dict(
mirror=False,
crop_size=227,
mean_value=[104,117,123]),
batch_size=cls_batchsize,
ntop=2,
backend=P.Data.LMDB,
#include=dict(phase=caffe.TEST),
name="cls_sun_tst");
elif phase=='deploy':
net["cls_data"]=L.Input(
input_param=dict(
shape=dict(dim=[1,3,227,227])
),
name="cls_sun_deploy"
)
#net["cls_data"], net["cls_label"] = L.DummyData(name="dummy_cls_data",
# ntop=2,
# shape=[dict(dim=[cls_batchsize,3,227,227]), dict(dim=[cls_batchsize, 1])]
# )
if phase=='train' or phase== 'test' or phase== 'deploy': # trivially satisfied since there is a previous assert, but just in case something changes later
net=LS.generate_conv1_to_bn6(
net,
blob_prefix="cls_",
layer_prefix="",
param_prefix="shared_",
bottom_blob="cls_data",
top_blob="cls_bn6",
num_dropouts=1 if args.num_dropouts>=2 else 0,
#learn_all=False if args.pretrain else True
learn_all=True
)
if phase=='train' or phase== 'test':
#L6
net=LS.generate_classifier(
net,
blob_prefix="L6_",
layer_prefix="L6_",
param_prefix="",
bottom_blob=["cls_bn6", "cls_label"],
learn_all=True,
propagate_down=True if (not args.pretrain and cls_loss_level=='L6') else False, # NOTE: cls_loss_level is assumed to be defined elsewhere; the 'and' short-circuits, so it is only evaluated under --finetune
num_cls=num_cls,
loss_weight=cls_loss_weights[jobno],
loss_name="cls_L6_loss",
acc_name="cls_L6_acc",
num_dropouts=1 if args.num_dropouts>=1 else 0,
)
#L3
net=LS.generate_classifier(
net,
blob_prefix="L3_",
layer_prefix="L3_",
param_prefix="",
bottom_blob=["cls_bn3", "cls_label"],
learn_all=True,
propagate_down=True if (not args.pretrain and cls_loss_level=='L3') else False,
num_cls=num_cls,
loss_weight=cls_loss_weights[jobno],
loss_name="cls_L3_loss",
acc_name="cls_L3_acc",
)
return net
else:
raise Exception('config %s not handled yet' % args.config)
print ("Solver: %s" % solver_file);
solver.write(solver_file);
train_net=net_phase('train');
test_net=net_phase('test');
deploy_net=net_phase('deploy');
print ("Train net: %s" % trainnet_file);
with open(trainnet_file, 'w') as f:
f.write(str(train_net.to_proto()));
print ("Test net: %s" % testnet_file);
with open(testnet_file, 'w') as f:
f.write(str(test_net.to_proto()));
print ("Deploy net: %s" % deploynet_file);
with open(deploynet_file, 'w') as f:
f.write(str(deploy_net.to_proto()));
if args.debug:
try:
net=caffe.Net(trainnet_file, caffe.TRAIN);
net=caffe.Net(testnet_file, caffe.TEST);
net=caffe.Net(deploynet_file, caffe.TEST);
except Exception as e:
raise Exception("network file raises error: %s" % e);
# store job details to csv
job_signature = {};
job_signature["jobno"] = clusterno
job_signature["config"] = args.config
job_signature["base_lr"] = solver.sp["base_lr"]
job_signature["nonDiscrete_flag"] = nonDiscrete_flag
job_signature["dynamicCrop_flag"] = dynamicCrop_flag
job_signature["pair_batchsize"] = pair_batchsize
job_signature["cls_batchsize"] = cls_batchsize
job_signature["cls_loss_weight"] = cls_loss_weights[jobno]
job_signature["drlim_loss_weight"] = drlim_loss_weights[jobno]
job_signature["trans_loss_weight"] = trans_loss_weights[jobno]
job_signature["drlim_loss_margin"] = drlim_loss_margins[jobno]
job_signature["trans_loss_margin"] = trans_loss_margins[jobno]
job_signature["weight_decay"] = solver.sp["weight_decay"]
job_signature["max_iter"] = solver.sp["max_iter"]
job_signature["snapshot"] = solver.sp["snapshot"]
job_signature["stepsize"] = solver.sp["stepsize"]
job_signature["gamma"] = solver.sp["gamma"]
job_signature["momentum"] = solver.sp["momentum"]
job_signature["momentum2"] = solver.sp["momentum2"]
job_signature["type"] = solver.sp["type"]
job_signature["random_seed"] = solver.sp["random_seed"]
csvwriter.writerow(job_signature);
print("Job signature stored to %s" % 'hyperparams.csv')
# submit job
if hpcsystem=="condor":
condor_submitFile='../condor/%d.condor_submit' % clusterno
print("Condor submit file: %s" % condor_submitFile);
with open(condor_submitFile, 'w') as file:
file.write('+Group="GRAD"\n');
file.write('+Project="AI_ROBOTICS"\n');
file.write('+ProjectDescription=""\n');
file.write('+GPUJOB=true\n');
file.write('Requirements=TARGET.GPUSlot\n');
file.write('Environment=LD_LIBRARY_PATH=/scratch/vision/dineshj/caffe_vision_extra//leveldb/:/scratch/vision/dineshj/caffe_vision_extra//snappy/install/lib/:/scratch/vision/dineshj/caffe_vision_extra//OpenBLAS/build//lib/:/scratch/vision/dineshj/caffe_vision_extra//glog-0.3.3/install/lib/:/scratch/vision/dineshj/caffe_vision_extra//gflags/build/lib/:/scratch/vision/dineshj/caffe_vision_extra//lmdb/:/scratch/vision/dineshj/caffe_vision_extra//protobuf/install//lib/:/lusr/opt/boost-1.54/lib/:/opt/cuda-7.0/lib64/:/opt/cuda-7.0/nvvm/libdevice/:/usr/:/scratch/vision/dineshj/caffe_vision_extra/hdf5-1.8.15-patch1/install/lib/:/usr/lib/x86_64-linux-gnu/:/scratch/vision/dineshj/caffe_vis/build/lib/:/lib/x86_64-linux-gnu/:/v/filer4b/software/matlab-r2015b/bin/glnxa64/:/v/filer4b/software/matlab-r2015b/runtime/glnxa64/:/vision/vision_users/dineshj/local_installs/lib/;\n');
file.write('Universe = vanilla\n');
file.write('Getenv = True\n');
file.write('Log = ../condor/%d.log\n' % clusterno);
file.write('Output = ../condor/%d.out\n' % clusterno);
file.write('Error = ../condor/%d.err\n' % clusterno);
file.write('Notification = Complete\n');
file.write('Executable=../caffe2/tools/caffe\n');
file.write('Arguments = train -gpu 0');
file.write(' -solver %s' % solver_file);
if resume_flag:
file.write(' -snapshot %s' % snapshot)
if (not resume_flag) and finetune_flag:
file.write(' -weights %s' % weights)
file.write('\nQueue 1');
if not args.submit:
debug_here()
retcode=call("condor_submit %s 2> /dev/null" % condor_submitFile, shell=True)
elif hpcsystem=="slurm":
slurm_submitFile='../condor/%d.slurm_submit' % clusterno
try:
runtime
except NameError: # runtime is set per-config above; fall back if missing
runtime=20
print ("Slurm submit file: %s" % slurm_submitFile);
with open(slurm_submitFile,'w') as file:
file.write("#!/bin/bash\n");
file.write("#SBATCH -J tyrion\n");
file.write("#SBATCH -o ../condor/%d.err\n" % clusterno)
file.write('#SBATCH -p gpu\n');
file.write('#SBATCH -n 1\n');
file.write('#SBATCH -A Visual-Recognition\n'); #Visual-Recognition || CS381V-Visual-Recogn || Fine-Tuning-CNNs
file.write('#SBATCH -t 00:%02d:00\n' % runtime);
if clustername=="stampede":
file.write('time ../caffe2_build_stampede/tools/caffe train -gpu 0');
elif clustername=="maverick":
file.write('time ../caffe2/tools/caffe train -gpu 0');
else:
raise Exception('clustername %s not handled for slurm submission' % clustername)
file.write(' -solver %s' % solver_file);
if resume_flag:
file.write(' -snapshot %s' % snapshot)
if (not resume_flag) and finetune_flag:
file.write(' -weights %s' % weights)
file.write('\npython nn_eval.py --max_jobs 25 --max_test_images 1000 -k 1 -m %s' % solver.sp["snapshot_prefix"].strip('"'));
if not args.submit:
debug_here()
retcode=call("sbatch " + slurm_submitFile, shell=True)
if retcode !=0:
print >>sys.stderr, "Child returned non-zero exit status", retcode
raise Exception('could not submit job')
# then update .lastclusterno
with open('.lastclusterno', 'w') as f:
f.write('%d' % clusterno);
csvfile.close()
if __name__ == "__main__":
parser=argparse.ArgumentParser()
parser.add_argument('-n', '--numjobs', type=int,
default=1, help="number of jobs to submit");
submit_parser = parser.add_mutually_exclusive_group(required=False)
submit_parser.add_argument('--submit', dest='submit', action='store_true')
submit_parser.add_argument('--nosubmit', dest='submit', action='store_false')
parser.set_defaults(submit=True)
debug_parser = parser.add_mutually_exclusive_group(required=False)
debug_parser.add_argument('--debug', dest='debug', action='store_true')
debug_parser.add_argument('--nodebug', dest='debug', action='store_false')
parser.set_defaults(debug=False)
parser.add_argument('--config', type=str,
default="drlim", help="cls | drlim | ved_drlim");
pretrain_parser = parser.add_mutually_exclusive_group(required=False)
pretrain_parser.add_argument('--pretrain', dest='pretrain', action='store_true');
pretrain_parser.add_argument('--finetune', dest='pretrain', action='store_false');
parser.set_defaults(pretrain=True)
parser.add_argument('--num_dropouts', type=int,
default=1, help="");
args=parser.parse_args()
caffe.set_mode_gpu()
main()
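% ----------------------------------------------------------------------
% MATLAB launcher for the KITTI 227x227 Caffe experiments: edits base
% solver/net prototxts per job and submits via HTCondor or SLURM.
% ----------------------------------------------------------------------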
function kitti227_submitjobs(nosubmit, numjobs)
% Write a script to submit multiple jobs, by
% (1) DONE Modifying finderNet_solver as necessary e.g. net file name: condor/finderNet_%d.prototxt, snapshot name, and dumping it back into a new solver file,
% (2) DONE reading in the current version of finderNet.prototxt, modifying it as necessary, and dumping it back into a new prototxt condor/finderNet_%d.prototxt,
% (3) DONE reading in the current version of run_learnCNN.condor, modifying it as necessary (new solvername) and dumping it back into a new condor/run_learnCNN_%d.condor
% (4) DONE issuing condor_submit condor/run_learnCNN_%d.condor
try
run('../addLibs.m');
system('hostname > tmp.txt');
if ~isempty(regexp(fileread('tmp.txt'), 'vision', 'once')) || ~isempty(regexp(fileread('tmp.txt'), 'adriana', 'once')) || ~isempty(regexp(fileread('tmp.txt'), 'eldar', 'once'))
hpcsystem='condor';
else
hpcsystem='slurm';
end
if ~isempty(regexp(fileread('tmp.txt'), 'maverick', 'once'))
clustername='maverick';
elseif ~isempty(regexp(fileread('tmp.txt'), 'stampede', 'once'))
clustername='stampede';
else
clustername='condor';
end
if nargin<1, nosubmit=true; end
if nargin<2, numjobs=1; end
no_test=false;
%% Declaring solver base parameters
input_dim={'1','3','227','227'};
hpcjob.hide.resume_flag=false;
hpcjob.snapshot_clust=1e6+(3626:3635);
hpcjob.snapshot_iter=15000*ones(1,numel(hpcjob.snapshot_clust));
hpcjob.hide.finetune_flag=false;
%starting_model_file='/work/01932/dineshj/caffe2/models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel';
%if hpcjob.hide.finetune_flag
% switch clustername
% case {'stampede', 'maverick'}
% hpcjob.model='/work/01932/dineshj/caffe2/models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel';
% case 'condor'
% hpcjob.model='/scratch/vision/dineshj/caffe2/models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel';
% end
%end
config='ved_drlim';
switch config
case 'cls'
hpcjob.hide.finetune_flag=true; new_layers={'"cls_prefinal"','"cls_final"'}; finetune_lr_slowdown_factor=10;
no_test=false;
if strcmp(hpcsystem, 'condor')
no_test=true; % to keep things fast
end
num_cls=397;
solver_base=readPrototxt('../kitti_codes/solver_227.prototxt')
net_base=readPrototxt('../kitti_codes/clsnet_227.prototxt')
%cls_trn_sfx='_mini'; solver_base.max_iter=num2str(3e3); solver_base.snapshot=num2str(1.5e3); time=10;
cls_trn_sfx='_5'; solver_base.max_iter=num2str(2e4); solver_base.snapshot=num2str(2e3); time=240;
%cls_trn_sfx='_50'; solver_base.max_iter=num2str(1e4); solver_base.snapshot=num2str(2e3); time=150;
solver_base.stepsize=num2str(2e4);
batch_size=num2str(128);
solver_base.weight_decay=num2str(0);
solver_base.type='"SGD"';
switch solver_base.type
case '"SGD"'
%loss_weights=10.^[-4:0];
%loss_weights=10.^-1*ones(1,5);
%loss_weights=10.^[0:-0.5:-4.5];
loss_weights=10.^[2.0:-0.5:0];
case '"Adagrad"'
loss_weights=10.^[-2:+2];
%loss_weights=10.^[-5:-3, +3:+4];
otherwise
error('Unknown solver type %s', solver_base.type);
end
case {'cls_L3','cls_L5', 'cls_L6'}
new_layers={'"cls_final"'}; finetune_lr_slowdown_factor=10;
hpcjob.hide.finetune_flag=true;
hpcjob.model_clust=1e6+(3701:3710);
hpcjob.model_iter=10000*ones(1,numel(hpcjob.model_clust));
no_test=false;
%if strcmp(hpcsystem, 'condor')
% no_test=true; % to keep things fast
%end
num_cls=397;
solver_base=readPrototxt('../kitti_codes/solver_227.prototxt')
if strcmp(config, 'cls_L5'), net_base=readPrototxt('../kitti_codes/clsnet_227_L5.prototxt');
elseif strcmp(config, 'cls_L6'), net_base=readPrototxt('../kitti_codes/clsnet_227_L6.prototxt');
elseif strcmp(config, 'cls_L3'), net_base=readPrototxt('../kitti_codes/clsnet_227_L3.prototxt');
else, error; end
%cls_trn_sfx='_mini'; solver_base.max_iter=num2str(3e3); solver_base.snapshot=num2str(1.5e3); time=10;
cls_trn_sfx='_ntpc5_run1_train-lmdb'; solver_base.max_iter=num2str(2e4); solver_base.snapshot=num2str(2e3); time=600;
%cls_trn_sfx='_50'; solver_base.max_iter=num2str(1e4); solver_base.snapshot=num2str(2e3); time=150;
solver_base.momentum=num2str(0.9);
%solver_base.momentum2=num2str(0.999);
solver_base.lr_policy='"step"';
solver_base.stepsize=num2str(5000);
solver_base.gamma=num2str(0.5);
solver_base.weight_decay =num2str(0.005);
batch_size=num2str(128);
solver_base.display=num2str(20);
solver_base.type='"SGD"';
%solver_base.type='"Adam"';
switch solver_base.type
case '"SGD"'
%loss_weights=10.^[-4:0];
%loss_weights=10.^-1*ones(1,5);
%loss_weights=10.^[0:-0.5:-4.5];
%loss_weights=10.^[1:-0.5:-3.5];
loss_weights=ones(1,10);
case '"Adagrad"'
loss_weights=ones(1,10);
%loss_weights=10.^[-5:-3, +3:+4];
case '"Adam"'
loss_weights=ones(1,10);
otherwise
error('Unknown solver type %s', solver_base.type);
end
switch hpcjob.hide.finetune_flag
case true
solver_base.weight_decay=num2str(5e-4);
solver_base.base_lr=num2str(1e-3);
solver_base.max_iter=num2str(3e4);
solver_base.snapshot=num2str(2e4);
solver_base.test_interval=num2str(1e2);
case false
hpcjob.hide.model='';
end
learning_rates=1e-2*ones(1,numjobs);
hpcjob.target_output='cls_accuracy'; hpcjob.target_drn='h'; hpcjob.target_perfect=num2str(1.0);
hpcjob.overfit_margin=num2str(0.02);
hpcjob.saturation_wait=num2str(5e3);
case 'drlim'
no_test=false;
withcls=1; cls_weight=1;
net_base=readPrototxt('../kitti_codes/drlimnet_227_L6.prototxt');
solver_base=readPrototxt('../kitti_codes/solver_227.prototxt')
cls_trn_sfx='_ntpc5_run1_train-lmdb'; solver_base.max_iter=num2str(1e4); solver_base.snapshot=num2str(2e3); time=720;
batch_size=num2str(16);
solver_base.display=num2str(100);
solver_base.weight_decay=num2str(0);
%loss_weights=10.^[-3:0.15:-1.65];
%loss_weights=10.^-[-2.4:0.2:-1.6];
loss_weights=zeros(1,5); %10.^-2*ones(1,5);
learning_rates=1e-2*ones(1,numel(loss_weights));
hpcjob.target_output='cls_accuracy'; hpcjob.target_drn='h'; hpcjob.target_perfect=num2str(1.0);
hpcjob.overfit_margin=num2str(0.02);
hpcjob.saturation_wait=num2str(5e3);
case 'vednet'
no_test=false;
withcls=1; cls_weight=1;
%withcls=1; cls_trn_sfx='_1e-1_SUN'; num_cls=397; cls_weight=10^0;% for SUN
%
net_base=readPrototxt('../kitti_codes/equivnet_227_L6.prototxt');
solver_base=readPrototxt('../kitti_codes/solver_227.prototxt')
cls_trn_sfx='_ntpc5_run1_train-lmdb'; solver_base.max_iter=num2str(1e4); solver_base.snapshot=num2str(2e3); time=720;
batch_size=num2str(16);
solver_base.display=num2str(100);
solver_base.weight_decay=num2str(0);
%loss_weights=10.^[-3:0.15:-1.65];
%loss_weights=10.^-[-2.4:0.2:-1.6];
learning_rates=logspace(-4, -2, 5)
loss_weights=zeros(1,5); %10.^-2*ones(1,5);
transforms=repmat([1,2,3], numel(loss_weights), 1);
hpcjob.target_output='cls_accuracy'; hpcjob.target_drn='h'; hpcjob.target_perfect=num2str(1.0);
hpcjob.overfit_margin=num2str(0.02);
hpcjob.saturation_wait=num2str(5e3);
case 'ved_drlim'
no_test=false;
withcls=1; cls_weight=10;
num_cls=397;
pretrain=0;
%withcls=1; cls_trn_sfx='_1e-1_SUN'; num_cls=397; cls_weight=10^0;% for SUN
%
%net_base=readPrototxt('../kitti_codes/equivdrlim_227_L6.prototxt');
solver_base=readPrototxt('../kitti_codes/solver_227.prototxt')
if ~pretrain
%net_base=readPrototxt('../kitti_codes/equivdrlim_227_L6.prototxt');
net_base=readPrototxt('../kitti_codes/equivdrlim_227_L3.prototxt');
batch_size=num2str(32);
cls_trn_sfx='_ntpc5_run1_train-lmdb';
solver_base.max_iter=num2str(2e3); solver_base.snapshot=num2str(5e4); time=600;
solver_base.test_interval=num2str(500);
solver_base.test_iter=num2str(40);
solver_base.type='"Adam"';
solver_base.momentum=num2str(0.9);
solver_base.momentum2=num2str(0.999);
solver_base.lr_policy='"step"';
solver_base.stepsize=num2str(5000);
solver_base.gamma=num2str(0.5);
solver_base.weight_decay =num2str(0.005);
else
net_base=readPrototxt('../kitti_codes/equivdrlim_227_pretrain.prototxt');
batch_size=num2str(64);
solver_base.max_iter=num2str(4e4); solver_base.snapshot=num2str(1e4); time=600;
solver_base.test_interval=num2str(500);
solver_base.test_iter=num2str(10);
solver_base.type='"Adam"';
solver_base.momentum=num2str(0.9);
solver_base.momentum2=num2str(0.999);
solver_base.lr_policy='"step"';
solver_base.stepsize=num2str(5000);
solver_base.gamma=num2str(0.5);
solver_base.weight_decay =num2str(0.005);
end
hpcjob.max_time=num2str(time);
hpcjob.solver_max_iter=solver_base.max_iter;
solver_base.display=num2str(20); hpcjob.solver_display=solver_base.display;
hpcjob.solver_weight_decay=solver_base.weight_decay;
%loss_weights=10.^[-3:0.15:-1.65];
%loss_weights=10.^-[-2.4:0.2:-1.6];
n=numjobs;
%learning_rates=1e-3*ones(1,n);
%learning_rates=logspace(-8,-4,n);
learning_rates=10^-4;
trans_loss_weights=zeros(1, n); %10.^-2*ones(1,5);
%trans_loss_weights=ones(1,n); %10.^-2*ones(1,5);
trans_loss_margins=100*ones(1, n); %10.^-2*ones(1,5);
%drlim_loss_weights=0; %logspace(-7, -2, n); %10.^-2*ones(1,5);
%drlim_loss_weights=10.^[-10,-9,-8,-6,-5.5]; %logspace(-7, -2, n); %10.^-2*ones(1,5);
drlim_loss_weights=ones(1,numjobs); %10.^-2*ones(1,5);
%drlim_loss_weights=logspace(-5,-1,numjobs); %10.^-2*ones(1,5);
drlim_loss_margins=100*ones(1,n); % 0 means ignore negatives...
%drlim_loss_margins=logspace(-1,+3,n);
transforms=repmat([1,2,3], n, 1);
hpcjob.target_output='cls_accuracy'; hpcjob.target_drn='h'; hpcjob.target_perfect=num2str(1.0);
hpcjob.overfit_margin=num2str(0.02);
hpcjob.saturation_wait=num2str(5e3);
%case 'triplim'
% config = 'triplim';
% triplim_sfx='_nbd20';
% drlim_sfx='_nbd20';
% clsdata='SUN';
% solver_base=readPrototxt('../kitti_codes/solver.prototxt');
% net_base=readPrototxt('../kitti_codes/triplim_net.prototxt');
% switch clsdata
% case 'SUN'
% withcls=1; cls_trn_sfx='_1e-1_SUN'; num_cls=397;
% batch_size=num2str(128); solver_base.max_iter=num2str(1.6e4); time=150; solver_base.snapshot=num2str(2e3);
% solver_base.display=num2str(10); solver_base.test_interval=num2str(50);
% triplim_loss_weights=10.^[-3.5*ones(1,5) -2.5*ones(1,5) -2*ones(1,5)]; triplim_margins=0.1*ones(1,numel(triplim_loss_weights));
% drlim_loss_weights=10.^0.5*ones(1,numel(triplim_loss_weights));
% case 'KITTI'
% NOT_IMPLEMENTED
% otherwise
% error('Unknown cls dataset %s', clsdata);
% end
% solver_base.test_iter='1';
% solver_base.type='"Nesterov"';
% solver_base.weight_decay=num2str(0);
otherwise
error('Unknown configuration %s\n', config);
end
%switch solver_base.type
% case '"SGD"'
% case '"Adagrad"'
% solver_base.momentum=num2str(0);
% case '"Nesterov"'
% otherwise
% error('Unknown solver type');
%end
%hpcjob.hide.model='../caffe_snapshots/1000266_norb_slowpred_fc_iter_50000.caffemodel';
hpcjob.hide.useGPU=true;
if strcmp(hpcsystem,'slurm')
hpcjob.sys_cmd='"rsync -avrz -e ssh ../clust_runs/ dineshj@adriana.cs.utexas.edu:/scratch/vision/dineshj/active/clust_runs/"';
end
%hpcjob.numjobs=numjobs;
if strcmp(hpcsystem, 'condor')
% get clusterno through condor_q or condor_history commands
clusterno=-1;
system('condor_q -format "%d\n" ClusterId > tmp.txt 2> /dev/null');
allclust=load('tmp.txt');
if ~isempty(allclust)
clusterno=max(max(allclust), clusterno);
end
system('condor_history -format "%d\n" ClusterId 2> /dev/null | head -35 > tmp.txt');
allclust=load('tmp.txt');
if ~isempty(allclust)
clusterno=max(max(allclust), clusterno);
end
if clusterno==-1
clusterno=0;
end
else % do the next best thing - assign another unique ID
origclusterno=load('.lastclusterno');
clusterno=origclusterno;
end
%if hpcjob.hide.finetune_flag
% solver_base.base_lr=num2str(str2num(solver_base.base_lr)/finetune_lr_slowdown_factor);
% %hpcjob.hide.model=starting_model_file;
%end
for i=1:numjobs
if hpcjob.hide.resume_flag
hpcjob.snapshot=sprintf('../caffe_snapshots/%d_snap_iter_%d.solverstate', hpcjob.snapshot_clust(i), hpcjob.snapshot_iter(i));
end
if hpcjob.hide.finetune_flag
hpcjob.model=sprintf('../caffe_snapshots/%d_snap_iter_%d.caffemodel', hpcjob.model_clust(i), hpcjob.model_iter(i));
end
clusterno=clusterno+1;
solver=solver_base;
% change some parameter of solver based on value of i
%solver.snapshot_prefix=sprintf('"../caffe_snapshots/%d_norb_drlim_fc"', clusterno);
tmp=solver.snapshot_prefix;
tmp=strrep(tmp,'"','');
[path, name, ~] =fileparts(tmp);
solver.snapshot_prefix=sprintf('"%s/%d_%s"', path, clusterno, name);
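% e.g. a base prefix of "../caffe_snapshots/snap" becomes
% "../caffe_snapshots/<clusterno>_snap", so each job snapshots to its own files.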
solver.net=sprintf('"../condor/%s_net%d.prototxt"', hpcsystem, clusterno);
if exist('learning_rates', 'var')
solver.base_lr=num2str(learning_rates(i));
end
hpcjob.prefix=sprintf('../condor/%d', clusterno);
hpcjob.solver_snapshot_prefix=solver.snapshot_prefix;
fprintf('\n Selecting parameter combination #');
%params.Dep.index = params.process;
%fprintf('%d(+1) of %d\n\n', i, size(combinations,1));
%assert(i<=size(combinations,1) && i>0);
%for paramno=1:length(paramNames)
% cmd=sprintf('%s=combinations{i,%d};', paramNames{paramno}, paramno)
% eval(cmd);
%end
%dbstack; keyboard;
% dump solver into condor/solver%d_%d.prototxt
if no_test
solver.test_iter=num2str(0);
solver.test_interval=num2str(str2num(solver.max_iter)+2000);
end
currSolverFile=sprintf('../condor/%s_solver%d.prototxt', hpcsystem, clusterno);
%struct2proto(solver, currSolverFile);
writePrototxt(solver, currSolverFile);
currSolverFile
net = net_base;
layernames=cellfun(@(x) x.name, net.layer, 'UniformOutput', false);
% dump feat and deploy prototxts (may be changed later if necessary)
if exist('feat_file', 'var')
copyfile(feat_file, sprintf('../condor/%d_feat.prototxt', clusterno));
else
fprintf('Generating feat file\n');
writePrototxt(make_feat_net_new(net, input_dim), sprintf('../condor/%d_feat.prototxt', clusterno));
end
if exist('deploy_file', 'var')
copyfile(deploy_file, sprintf('../condor/%d_deploy.prototxt', clusterno));
else
fprintf('Generating deploy file\n');
writePrototxt(make_deploy_net_new(net, input_dim), sprintf('../condor/%d_deploy.prototxt', clusterno));
end
switch config
case {'cls', 'cls_L3' 'cls_L5', 'cls_L6'}
if exist('num_cls', 'var')
layerno= strcmp(layernames, '"cls_final"');
net.layer{layerno}.inner_product_param.num_output=num2str(num_cls);
end
if hpcjob.hide.finetune_flag % setting learning rate higher for new layers
for layerno=1:numel(new_layers)
layerno=strcmp(layernames, new_layers{layerno});
net.layer{layerno}.param{1}.lr_mult=num2str(str2num(net.layer{layerno}.param{1}.lr_mult)*finetune_lr_slowdown_factor);
net.layer{layerno}.param{2}.lr_mult=num2str(str2num(net.layer{layerno}.param{2}.lr_mult)*finetune_lr_slowdown_factor);
end
end
layerno= strcmp(layernames, '"cls_loss"');
net.layer{layerno}.loss_weight=num2str(loss_weights(i));
layerno= strcmp(layernames, '"cls_sun_trn"');
layertypes=cellfun(@(x) x.type, net.layer, 'UniformOutput', false);
if strcmp(layertypes(layerno), '"HDF5Data"')
net.layer{layerno}.hdf5_data_param.source=sprintf('"../kitti_codes/sun_trn_cls_data_227%s_asp1.txt"', cls_trn_sfx);
elseif strcmp(layertypes(layerno), '"Data"')
net.layer{layerno}.data_param.source=sprintf('"../kitti_codes/SUN/pulkit_lmdbs/sun_imSz256%s"', cls_trn_sfx);
%sprintf('"../kitti_codes/sun_trn_cls_data_227%s_asp1.txt"', cls_trn_sfx);
else
error('Unexpected data layer type for cls_sun_trn');
end
%net.layer{layerno}.hdf5_data_param.source=sprintf('"../kitti_codes/sun_trn_cls_data_227%s_asp1.txt"', cls_trn_sfx);
tmp=find(strcmp(layertypes, '"HDF5Data"'));
for layerno=1:length(tmp)
net.layer{tmp(layerno)}.hdf5_data_param.batch_size=batch_size;
end
tmp=find(strcmp(layertypes, '"Data"'));
for layerno=1:length(tmp)
net.layer{tmp(layerno)}.data_param.batch_size=batch_size;
end
if no_test==true
layerphases=cellfun(@(x) layerphase(x), net.layer, 'UniformOutput', false);
train_layers=find(strcmp(layerphases, 'TRAIN'));
for lno=train_layers
net.layer{lno}=rmfield(net.layer{lno}, 'include');
end
net.layer(strcmp(layerphases, 'TEST'))=[];
new_layerphases=cellfun(@(x) layerphase(x), net.layer, 'UniformOutput', false);
end
% rewriting deploy and feat prototxts after editing net
writePrototxt(make_deploy_net_new(net, input_dim), sprintf('../condor/%d_deploy.prototxt', clusterno));
writePrototxt(make_feat_net_new(net, input_dim), sprintf('../condor/%d_feat.prototxt', clusterno));
case 'drlim'
if exist('num_cls', 'var')
layerno= strcmp(layernames, '"cls_ip2"');
net.layer{layerno}.inner_product_param.num_output=num2str(num_cls);
end
layerno= strcmp(layernames, '"drlim_loss"');
net.layer{layerno}.loss_weight=num2str(loss_weights(i));
layerno= strcmp(layernames, '"cls_sun_trn"');
net.layer{layerno}.data_param.source=sprintf('"../kitti_codes/SUN/pulkit_lmdbs/sun_imSz256%s"', cls_trn_sfx);
%layerno= strcmp(layernames, '"drlim_kitti_trn"');
%net.layer{layerno}.hdf5_data_param.source=sprintf('"../kitti_codes/trn_sim_pairs%s.txt"', drlim_sfx);
%layerno= strcmp(layernames, '"drlim_kitti_tst"');
%net.layer{layerno}.hdf5_data_param.source=sprintf('"../kitti_codes/tst_sim_pairs%s.txt"', drlim_sfx);
layertypes=cellfun(@(x) x.type, net.layer, 'UniformOutput', false);
tmp=find(strcmp(layertypes, '"HDF5Data"'));
for layerno=1:length(tmp)
net.layer{tmp(layerno)}.hdf5_data_param.batch_size=batch_size;
end
layertypes=cellfun(@(x) x.type, net.layer, 'UniformOutput', false);
tmp=find(strcmp(layertypes, 'HDF5_DATA'));
for layerno=1:length(tmp)
net.layer{tmp(layerno)}.hdf5_data_param.batch_size=batch_size;
end
layerno=find(strcmp(layernames, '"cls_loss"'));
if withcls
net.layer{layerno}.loss_weight=num2str(cls_weight); % for SUN classes
else
net.layer{layerno}.loss_weight=num2str(0);
end
% rewriting deploy and feat prototxts after editing net
writePrototxt(make_deploy_net_new(net, input_dim), sprintf('../condor/%d_deploy.prototxt', clusterno));
writePrototxt(make_feat_net_new(net, input_dim), sprintf('../condor/%d_feat.prototxt', clusterno));
case 'vednet'
if exist('num_cls', 'var')
layerno= strcmp(layernames, '"cls_final"');
net.layer{layerno}.inner_product_param.num_output=num2str(num_cls);
end
layerno= strcmp(layernames, '"trans_loss@"');
net.layer{layerno}.loss_weight=num2str(loss_weights(i)); % setting loss weight for all transforms uniformly
layerno= strcmp(layernames, '"cls_sun_trn"');
net.layer{layerno}.data_param.source=sprintf('"../kitti_codes/SUN/pulkit_lmdbs/sun_imSz256%s"', cls_trn_sfx);
layertypes=cellfun(@(x) x.type, net.layer, 'UniformOutput', false);
tmp=find(strcmp(layertypes, '"HDF5Data"'));
for layerno=1:length(tmp)
net.layer{tmp(layerno)}.hdf5_data_param.batch_size=batch_size;
end
tmp=find(strcmp(layertypes, 'HDF5_DATA'));
for layerno=1:length(tmp)
net.layer{tmp(layerno)}.hdf5_data_param.batch_size=batch_size;
end
layerno= strcmp(layernames, '"cls_loss"');
if withcls
net.layer{layerno}.loss_weight=num2str(cls_weight);
else
net.layer{layerno}.loss_weight=num2str(0);
end
curr_transforms=transforms(i, :);
layerno1=find(strcmp(layernames, '"trans1_mod@"'));% point from which to replicate
layerno2=find(strcmp(layernames, '"trans_loss@"'));% point up to which to replicate
translayer=net.layer(layerno1);
baselayer=net.layer(1:layerno1-1);
template=net.layer(layerno1:layerno2); % one for each transform
if numel(net.layer)>layerno2
clsnet=net.layer(layerno2+1:end);
else
clsnet={};
end
tmp=reshape(template, 1, numel(template)); % making into row array (in case not already)
tmp=repmat(tmp, numel(curr_transforms), 1);
% adding slice points in label vector corresponding to each transform
label_layerno=find(strcmp(layernames, '"trans_slice_label"'));
mute_layerno=find(strcmp(layernames, '"trans_mute"'));
assert(numel(label_layerno)==1);
minlabelvecsize=max(curr_transforms);
baselayer{label_layerno}.slice_param.slice_point{1}=num2str(1);
baselayer{label_layerno}.top{1}='"sim_label"';
baselayer{mute_layerno}.bottom{1}='"sim_label"';
for labeldim=1:minlabelvecsize % typically 2nd dimension onwards is transform label (1st dimension is similarity)
baselayer{label_layerno}.slice_param.slice_point{end+1}=num2str(1+labeldim);
baselayer{label_layerno}.top{end+1}=sprintf('"trans_label%d"', labeldim);
baselayer{mute_layerno}.bottom{end+1}=sprintf('"trans_label%d"', labeldim);
end
baselayer{label_layerno}.top{end+1}='"lab_autodummy2"';
baselayer{mute_layerno}.bottom{end+1}='"lab_autodummy2"';
% layertypes=cellfun(@(x) x.type, baselayer, 'UniformOutput', false);
% sourcelayer=find(strcmp(layertypes, 'HDF5_DATA'));
% assert(~isempty(sourcelayer))
% for sourcelayerno=1:numel(sourcelayer)
% val=strtrim(baselayer{sourcelayer(sourcelayerno)}.hdf5_data_param.source);
% val=[val(1:end-1) num2str(trans_dsno) val(end)]; % the transformation number must be the last part of the name
% baselayer{sourcelayer(sourcelayerno)}.hdf5_data_param.source=val;
% end
% for transformno=1:numel(curr_transforms)
% %val=strtrim(template{sourcelayer(sourcelayerno)}.hdf5_data_param.source);
% %val=[val(1:end-1) num2str(curr_transforms(transformno)) val(end)]; % the transformation number must be the last part of the name
% end
% changing layer and blob names for each transform cluster
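% A trailing '@' just before the closing quote marks a name to specialize
% per transform: e.g. '"trans_loss@"' becomes '"trans_loss1"',
% '"trans_loss2"', '"trans_loss3"' for curr_transforms = [1,2,3].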
changeList={'name', 'top', 'bottom'}; % name fields to change for each transform
for layerno=1:numel(template) % for each layer
fields=fieldnames(template{layerno});
changefieldnos=find(ismember(fields, changeList));
for fldno=1:numel(changefieldnos)
fieldno=changefieldnos(fldno);
for transformno=1:numel(curr_transforms) % for each transform
val=strtrim(template{layerno}.(fields{fieldno}));
if iscell(val) % repeated field
for repno=1:length(val)
if val{repno}(end-1)=='@' % marker for inserting transformno
val{repno}=[val{repno}(1:end-2) num2str(curr_transforms(transformno)) val{repno}(end)];
end
end
else
if val(end-1)=='@' % marker for inserting transformno
val=[val(1:end-2) num2str(curr_transforms(transformno)) val(end)];
end
end
tmp{transformno, layerno}.(fields{fieldno})=val;
end
end
end
tmp=tmp(:);
net.layer={};
net.layer(1:numel(baselayer))=baselayer;
net.layer(end+(1:numel(tmp)))=tmp;
net.layer=[net.layer clsnet];
clear tmp
% edit vednet deploy file before dumping
deploy=make_deploy_net_new(net, input_dim);
translayer=repmat(translayer, 1, numel(curr_transforms));
changeList={'name', 'top'};
for transformno=1:length(translayer)
for fieldno=1:length(changeList)
val=strtrim(translayer{transformno}.(changeList{fieldno}));
translayer{transformno}.(changeList{fieldno})=[val(1:end-2) num2str(curr_transforms(transformno)) val(end)];
end
translayer{transformno}.bottom=deploy.layer{end}.top; % should be cls_feat
end
deploy.layer=[deploy.layer translayer];
writePrototxt(make_feat_net_new(net, input_dim), sprintf('../condor/%d_feat.prototxt', clusterno));
writePrototxt(deploy, sprintf('../condor/%d_deploy.prototxt', clusterno));
case 'ved_drlim'
%if ~pretrain
if exist('num_cls', 'var')
layerno= find(strcmp(layernames, '"cls_final"') | strcmp(layernames, '"cls_L6_final"') | strcmp(layernames, '"cls_L3_final"'));
for tmp=1:numel(layerno)
net.layer{layerno(tmp)}.inner_product_param.num_output=num2str(num_cls);
end
end
%end
layerno= strcmp(layernames, '"drlim_loss"');
net.layer{layerno}.loss_weight=num2str(drlim_loss_weights(i));
net.layer{layerno}.contrastive_loss_param.margin=num2str(drlim_loss_margins(i));
layerno= strcmp(layernames, '"trans_loss@"');
net.layer{layerno}.loss_weight=num2str(trans_loss_weights(i)); % setting loss weight for all transforms uniformly
net.layer{layerno}.contrastive_loss_param.margin=num2str(trans_loss_margins(i));
if ~pretrain
layerno= strcmp(layernames, '"cls_sun_trn"');
net.layer{layerno}.data_param.source=sprintf('"../kitti_codes/SUN/pulkit_lmdbs/sun_imSz256%s"', cls_trn_sfx);
end
layertypes=cellfun(@(x) x.type, net.layer, 'UniformOutput', false);
tmp=find(strcmp(layertypes, '"HDF5Data"'));
for layerno=1:length(tmp)
net.layer{tmp(layerno)}.hdf5_data_param.batch_size=batch_size;
end
tmp=find(strcmp(layertypes, '"Data"'));
for layerno=1:length(tmp)
if ~isempty(strfind(net.layer{tmp(layerno)}.name, 'cls'))
continue;
end
net.layer{tmp(layerno)}.data_param.batch_size=batch_size;
end
%tmp=find(strcmp(layertypes, 'HDF5_DATA'));
%for layerno=1:length(tmp)
% net.layer{tmp(layerno)}.hdf5_data_param.batch_size=batch_size;
%end
%if ~pretrain
layerno= find(strcmp(layernames, '"cls_loss"') | strcmp(layernames, '"cls_L3_loss"') | strcmp(layernames, '"cls_L6_loss"'));
for tmp=1:numel(layerno)
net.layer{layerno(tmp)}.loss_weight=num2str(cls_weight);
end
%end
curr_transforms=transforms(i, :);
layerno1=find(strcmp(layernames, '"trans1_mod@"'));% point from which to replicate
layerno2=find(strcmp(layernames, '"trans_loss@"'));% point up to which to replicate
translayer=net.layer(layerno1);
baselayer=net.layer(1:layerno1-1);
template=net.layer(layerno1:layerno2); % one for each transform
if numel(net.layer)>layerno2
clsnet=net.layer(layerno2+1:end);
else
clsnet={};
end
tmp=reshape(template, 1, numel(template)); % making into row array (in case not already)
tmp=repmat(tmp, numel(curr_transforms), 1);
% adding slice points in label vector corresponding to each transform
label_layerno=find(strcmp(layernames, '"trans_slice_label"'));
mute_layerno=find(strcmp(layernames, '"trans_mute"'));
assert(numel(label_layerno)==1);
minlabelvecsize=max(curr_transforms);
baselayer{label_layerno}.slice_param.slice_point{1}=num2str(1);
baselayer{label_layerno}.top{1}='"sim_label"';
baselayer{mute_layerno}.bottom{1}='"sim_label"';
for labeldim=1:minlabelvecsize % typically 2nd dimension onwards is transform label (1st dimension is similarity)
baselayer{label_layerno}.slice_param.slice_point{end+1}=num2str(1+labeldim);
baselayer{label_layerno}.top{end+1}=sprintf('"trans_label%d"', labeldim);
baselayer{mute_layerno}.bottom{end+1}=sprintf('"trans_label%d"', labeldim);
end
baselayer{label_layerno}.top{end+1}='"lab_autodummy2"';
baselayer{mute_layerno}.bottom{end+1}='"lab_autodummy2"';
% changing layer and blob names for each transform cluster
changeList={'name', 'top', 'bottom'}; % name fields to change for each transform
for layerno=1:numel(template) % for each layer
fields=fieldnames(template{layerno});
changefieldnos=find(ismember(fields, changeList));
for fldno=1:numel(changefieldnos)
fieldno=changefieldnos(fldno);
for transformno=1:numel(curr_transforms) % for each transform
val=strtrim(template{layerno}.(fields{fieldno}));
if iscell(val) % repeated field
for repno=1:length(val)
if val{repno}(end-1)=='@' % marker for inserting transformno
val{repno}=[val{repno}(1:end-2) num2str(curr_transforms(transformno)) val{repno}(end)];
end
end
else
if val(end-1)=='@' % marker for inserting transformno
val=[val(1:end-2) num2str(curr_transforms(transformno)) val(end)];
end
end
tmp{transformno, layerno}.(fields{fieldno})=val;
end
end
end
tmp=tmp(:);
net.layer={};
net.layer(1:numel(baselayer))=baselayer;
net.layer(end+(1:numel(tmp)))=tmp;
net.layer=[net.layer clsnet];
clear tmp
% edit vednet deploy file before dumping
deploy=make_deploy_net_new(net, input_dim);
translayer=repmat(translayer, 1, numel(curr_transforms));
changeList={'name', 'top'};
for transformno=1:length(translayer)
for fieldno=1:length(changeList)
val=strtrim(translayer{transformno}.(changeList{fieldno}));
translayer{transformno}.(changeList{fieldno})=[val(1:end-2) num2str(curr_transforms(transformno)) val(end)];
end
translayer{transformno}.bottom=deploy.layer{end}.top; % should be cls_feat
end
deploy.layer=[deploy.layer translayer];
writePrototxt(make_feat_net_new(net, input_dim), sprintf('../condor/%d_feat.prototxt', clusterno));
writePrototxt(deploy, sprintf('../condor/%d_deploy.prototxt', clusterno));
otherwise
error('Unknown config');
end
net_fname=solver.net;
net_fname(net_fname=='"')='';
writePrototxt(net, net_fname);
net_fname % no semicolon: echoes the net filename to the console
% make a copy of current net definition
%copyfile(eval(strrep(solver.net,'"','''')), sprintf('condor/condor_net%d.prototxt',clusterno));
% condor_submit run_learnCNN_%d.condor with solver parameter changed
hpcjob.solver=currSolverFile;
switch hpcsystem
case 'condor'
condor_submitFile=sprintf('../condor/condor_learnCNN%d.submit',clusterno);
struct2condorsubmit(hpcjob, condor_submitFile);
if nosubmit, keyboard; end
status=system(sprintf('condor_submit %s 2> /dev/null', condor_submitFile));
assert(status==0);
case 'slurm'
slurm_submitFile=sprintf('../condor/slurm_learnCNN%d.submit',clusterno);
if ~exist('time', 'var')
time=20; % in minutes
end
struct2slurmsubmit(hpcjob, slurm_submitFile, clusterno, clustername, time, 'cmd');
if nosubmit, keyboard; end
status=system(sprintf('sbatch %s', slurm_submitFile));
assert(status==0);
% updating lastclusterno file
origclusterno=origclusterno+1;
f=fopen('.lastclusterno', 'w');
fprintf(f, '%d', origclusterno);
fclose(f);
otherwise
error('Unknown hpc system');
end
fprintf('Pausing to avoid read-write clashes between jobs when accessing data files\n');
pause(10); % pause so concurrently launched jobs do not collide when opening the LMDB files
end
catch err
getReport(err)
keyboard;
end
fprintf('\n=============\n');
end
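% Note on the '@' templating used in the transform-replication loop above
% (illustrative, mirroring the substitution code): a name whose second-to-last
% character is '@', e.g. '"trans_loss@"', is a template; the marker is replaced
% by each transform id while the trailing quote is kept, so with
% curr_transforms=[1 3] it expands to '"trans_loss1"' and '"trans_loss3"'.
% Minimal sketch of the substitution for a single (hypothetical) id:
%   val='"trans_loss@"'; id=3;
%   if val(end-1)=='@', val=[val(1:end-2) num2str(id) val(end)]; end % '"trans_loss3"'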
function struct2slurmsubmit(object, filename, clusterno, cluster, num_mins, caffe_mode)
if nargin<5
num_mins=20;
end
if nargin<6
caffe_mode='cmd';
end
FILE=fopen(filename,'w');
fprintf(FILE, '#!/bin/bash\n');
fprintf(FILE, '#SBATCH -J tyrion\n');
fprintf(FILE, '#SBATCH -o ../condor/%d.err\n', clusterno);
%fprintf(FILE, '#SBATCH -o ../condor/%d(%j).err\n', clusterno);
fprintf(FILE, '#SBATCH -p gpu\n');
fprintf(FILE, '#SBATCH -n 1\n');
fprintf(FILE, '#SBATCH -A Visual-Recognition\n'); % Visual-Recognition || CS381V-Visual-Recogn || Fine-Tuning-CNNs
fprintf(FILE, '#SBATCH -t 00:%02d:00\n', num_mins);
%if object.hide.useGPU
% switch cluster
% case 'stampede'
% fprintf(FILE, '#SBATCH -p gpu\n');
% %fprintf(FILE, '#SBATCH -p vis\n');
% case 'maverick'
% fprintf(FILE, '#SBATCH -p gpu\n');
% end
%else
% fprintf(FILE, '#SBATCH -p normal\n');
%end
%fprintf(FILE, '#SBATCH -t 00:40:00\n');
%fprintf(FILE, '#SBATCH -t 05:00:00\n');
switch cluster
case 'stampede'
%fprintf(FILE, 'export LD_LIBRARY_PATH=/work/01932/dineshj/opencv-bleeding/build/lib/:/work/01932/dineshj/boost_1_55_0/stage/lib/:/opt/apps/cuda/6.0/lib64/:/opt/apps/intel13/hdf5/1.8.9/lib:/opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/:/opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64/:/opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/ia32/:/opt/apps/cuda/6.5/lib64/:/work/01932/dineshj/tacc/lib:/work/01932/dineshj/tacc/lib/protobuf/:/opt/apps/intel13/mvapich2/1.9/lib:/opt/apps/intel13/mvapich2/1.9/lib/shared:/opt/apps/intel/13/composer_xe_2013.2.146/tbb/lib/intel64:/opt/intel/mic/coi/host-linux-release/lib:/opt/intel/mic/myo/lib:/opt/apps/intel/13/composer_xe_2013.2.146/mpirt/lib/intel64:/opt/apps/intel/13/composer_xe_2013.2.146/ipp/lib/intel64:/opt/apps/intel/13/composer_xe_2013.2.146/mkl/lib/intel64:/work/apps/matlab/2015a/sys/java/jre/glnxa64/jre/lib/amd64/server:/work/apps/matlab/2015a/runtime/glnxa64:/work/apps/matlab/2015a/bin/glnxa64:/opt/apps/cuda/6.0/computeprof/bin:/home/01932/dineshj/tools/libevent-2.0.22/lib\n');
fprintf(FILE, 'time ../caffe2_build_stampede/tools/caffe train -gpu 0');
case 'maverick'
%fprintf(FILE, '$PATH \n');
%fprintf(FILE, '$LD_LIBRARY_PATH \n');
%fprintf(FILE, 'export LD_LIBRARY_PATH=/work/01932/dineshj/opencv-bleeding/build/lib/:/work/01932/dineshj/boost_1_55_0/stage/lib/:/opt/apps/intel14/hdf5/1.8.12/x86_64/lib:/opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/:/opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/intel64/:/opt/apps/intel/13/composer_xe_2013.2.146/compiler/lib/ia32/:/opt/apps/cuda/6.5/lib64/:/work/01932/dineshj/tacc/lib:/work/01932/dineshj/tacc/lib/protobuf/:/work/01932/dineshj/anaconda2/lib/:/opt/apps/intel14/mvapich2/2.0b/lib:/opt/apps/intel14/mvapich2/2.0b/lib/shared:/opt/apps/intel/13/composer_xe_2013_sp1.1.106/tbb/lib/intel64:/opt/apps/intel/13/composer_xe_2013_sp1.1.106/compiler/lib/intel64:/opt/apps/intel/13/composer_xe_2013_sp1.1.106/mpirt/lib/intel64:/opt/apps/intel/13/composer_xe_2013_sp1.1.106/ipp/lib/intel64:/opt/apps/intel/13/composer_xe_2013_sp1.1.106/mkl/lib/intel64:/work/apps/matlab/2015a/sys/java/jre/glnxa64/jre/lib/amd64/server:/work/apps/matlab/2015a/runtime/glnxa64:/work/apps/matlab/2015a/bin/glnxa64\n');
%fprintf(FILE, 'ldd /work/01932/dineshj/caffe2/python/caffe/_caffe.so\n');
%fprintf(FILE, 'echo ================================================\n');
%fprintf(FILE, 'ldd /work/01932/dineshj/caffe2/python/caffe/_caffe.so | grep "not found"\n');
switch caffe_mode
case 'py'
fprintf(FILE, 'python -u ./train_clsnet.py');
case 'cmd'
fprintf(FILE, 'time ../caffe2/tools/caffe train -gpu 0');
otherwise
error('Unknown caffe mode');
end
otherwise
error('Unknown cluster name');
end
switch caffe_mode
case 'py'
object2=rmfield(object, 'hide');
args=fieldnames(object2);
arg_string='';
for argno=1:numel(args)
arg_string=[arg_string, ' --', args{argno}, ' ', object2.(args{argno})];
end
fprintf(FILE, '%s', arg_string); % '%s' guards against stray format specifiers in the arguments
case 'cmd'
argumentsCompleted=false;
if isfield(object,'solver')
if ~isempty(object.solver)
fprintf(FILE, ' -solver %s', object.solver);
%argumentsCompleted=true;
end
end
if ~argumentsCompleted
if isfield(object, 'snapshot')
if ~isempty(object.snapshot)
fprintf(FILE, ' -snapshot %s', object.snapshot);
argumentsCompleted=true;
end
end
end
if ~argumentsCompleted
if isfield(object, 'model')
if ~isempty(object.model) && object.hide.finetune_flag
fprintf(FILE, ' -weights %s', object.model);
argumentsCompleted=true;
end
end
end
if ~argumentsCompleted
warning('Training from scratch! Could take ages');
end
otherwise
error('Unknown caffe mode');
end
%fprintf(FILE, '\nQueue %d', object.numjobs);
%fprintf(FILE, '\nQueue %d', 1);
fclose(FILE);
end
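% For reference, a sketch of what struct2slurmsubmit writes for clusterno=100
% on maverick in 'cmd' mode with a solver argument (the solver path below is
% illustrative):
%   #!/bin/bash
%   #SBATCH -J tyrion
%   #SBATCH -o ../condor/100.err
%   #SBATCH -p gpu
%   #SBATCH -n 1
%   #SBATCH -A Visual-Recognition
%   #SBATCH -t 00:20:00
%   time ../caffe2/tools/caffe train -gpu 0 -solver ../condor/100_solver.prototxt
% The caller then submits the file with: sbatch <filename>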
function struct2condorsubmit(object, filename)
FILE=fopen(filename,'w');
fprintf(FILE, '+Group="GRAD"\n');
fprintf(FILE, '+Project="AI_ROBOTICS"\n');
fprintf(FILE, '+ProjectDescription=""\n');
if object.hide.useGPU
fprintf(FILE, '+GPUJob=true\n');
fprintf(FILE, 'Requirements=TARGET.GPUSlot\n');
end
fprintf(FILE, 'Environment=LD_LIBRARY_PATH=/usr/local/cuda:/usr:/usr/lib/x86_64-linux-gnu:/scratch/vision/dineshj/caffe/cudnn-6.5-linux-R1/;\n');
fprintf(FILE, 'Universe = vanilla\n');
fprintf(FILE, 'Getenv = True\n');
fprintf(FILE, 'Log = ../condor/$(Cluster).log\n');
fprintf(FILE, 'Output = ../condor/$(Cluster).out\n');
fprintf(FILE, 'Error = ../condor/$(Cluster).err\n');
fprintf(FILE, 'Notification = Complete\n');
fprintf(FILE, 'Executable=../caffe2/tools/caffe\n');
fprintf(FILE, 'Arguments= train -gpu 0');
argumentsCompleted=false;
if isfield(object,'solver')
if ~isempty(object.solver)
fprintf(FILE, ' -solver %s', object.solver);
%argumentsCompleted=true;
end
end
if ~argumentsCompleted
if isfield(object, 'snapshot')
if ~isempty(object.snapshot)
fprintf(FILE, ' -snapshot %s', object.snapshot);
argumentsCompleted=true;
end
end
end
if ~argumentsCompleted
if isfield(object, 'weights') && object.hide.finetune_flag
if ~isempty(object.weights)
fprintf(FILE, ' -weights %s', object.weights);
argumentsCompleted=true;
end
end
end
if ~argumentsCompleted
warning('Training from scratch! Could take ages');
end
%fprintf(FILE, '\nQueue %d', object.numjobs);
fprintf(FILE, '\nQueue %d', 1);
fclose(FILE);
end
function phase=layerphase(x)
% returns the phase ('TRAIN'/'TEST') a layer is restricted to, or 'ALL'
if isfield(x,'include')
phase=x.include.phase;
else
phase='ALL';
end
end
function paramNames=refineParamList(List, paramNames)
% Recursively expands struct-valued parameters into dotted field names,
% e.g. 'step_param' becomes 'step_param.stepsize'.
repeat=false;
newParamNames={};
for i=1:length(paramNames)
    tmp = eval(sprintf('List.%s',paramNames{i})); % eval resolves the dotted names produced by earlier passes
if isstruct(tmp)
repeat=true;
moreParamNames=strcat(paramNames{i}, '.', fieldnames(tmp));
newParamNames(end+1:end+length(moreParamNames))=moreParamNames;
else
newParamNames{end+1}=paramNames{i};
end
end
paramNames=newParamNames;
if repeat
paramNames=refineParamList(List, paramNames);% recursive call
end
end
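% Usage sketch for refineParamList (hypothetical solver struct, for
% illustration only):
%   List.base_lr=0.01;
%   List.step_param.stepsize=5000;
%   names=refineParamList(List, fieldnames(List));
%   % names is now {'base_lr', 'step_param.stepsize'}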
import sys
import caffe
from caffe import layers as L
from caffe import params as P
import tempfile
from IPython.core.debugger import Tracer; debug_here = Tracer()
#weight_param = dict(lr_mult=1, decay_mult=1)
#bias_param = dict(lr_mult=2, decay_mult=0)
#learned_param = [weight_param, bias_param]
def learned_param(name="", n_param=2):
param=[]
for i in range(n_param):
if name:
param.append(dict(
name = name+"_w" + str(i+1),
lr_mult = 1 if i==0 else 2,
decay_mult = 0)
#decay_mult = 1 if i==0 else 0)
)
else:
param.append(dict(
lr_mult = 1 if i==0 else 2,
decay_mult = 0)
#decay_mult = 1 if i==0 else 0)
)
return param
def decay_param(name="", n_param=2):
param=[]
for i in range(n_param):
if name:
param.append(dict(
name = name+"_w" + str(i+1),
lr_mult = 1 if i==0 else 2,
decay_mult = 1 if i==0 else 0)
)
else:
param.append(dict(
lr_mult = 1 if i==0 else 2,
decay_mult = 1 if i==0 else 0)
)
return param
def frozen_param(name="", n_param=2):
param=[]
for i in range(n_param):
if name:
param.append(dict(
name = name +"_w" + str(i+1),
lr_mult = 0,
decay_mult = 0));
else:
param.append(dict(
lr_mult = 0,
decay_mult = 0))
return param
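# Illustrative note (not used directly below): the three param helpers return
# Caffe `param` specs that differ only in their multipliers. For example,
#   learned_param("conv1")
# evaluates to
#   [{'name': 'conv1_w1', 'lr_mult': 1, 'decay_mult': 0},
#    {'name': 'conv1_w2', 'lr_mult': 2, 'decay_mult': 0}]
# Passing the same `name` to two layers ties (shares) their weights in Caffe.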
# Returns a Convolution + in-place ReLU stack, following the CaffeNet architecture.
def conv_relu(bottom, ks, nout, stride=1, pad=0, group=1,
param=learned_param(),
weight_filler=dict(type='gaussian', std=0.01),
bias_filler=dict(type='constant', value=0),
names=["",""]):
if not names[0]:
conv = L.Convolution(bottom, kernel_size=ks, stride=stride,
num_output=nout, pad=pad, group=group,
param=param, weight_filler=weight_filler,
bias_filler=bias_filler)
relu = L.ReLU(conv, in_place=True)
else:
conv = L.Convolution(bottom, kernel_size=ks, stride=stride,
num_output=nout, pad=pad, group=group,
param=param, weight_filler=weight_filler,
bias_filler=bias_filler, name=names[0])
relu = L.ReLU(conv, in_place=True, name=names[1])
return conv,relu
def fc_relu(bottom, nout,
param=learned_param(),
weight_filler=dict(type='gaussian', std=0.005),
bias_filler=dict(type='constant', value=0),
names=["",""]):
if not names[0]:
fc = L.InnerProduct(bottom, num_output=nout, param=param,
weight_filler=weight_filler,
bias_filler=bias_filler)
relu = L.ReLU(fc, in_place=True)
else:
fc = L.InnerProduct(bottom, num_output=nout, param=param,
weight_filler=weight_filler,
bias_filler=bias_filler, name=names[0])
relu = L.ReLU(fc, in_place=True, name=names[1])
return fc,relu
def max_pool(bottom, ks, stride=1, name=""):
if not name:
pool= L.Pooling(bottom, pool=P.Pooling.MAX, kernel_size=ks, stride=stride)
else:
pool=L.Pooling(bottom, pool=P.Pooling.MAX, kernel_size=ks, stride=stride, name=name)
return pool
def generate_conv1_to_bn6(
ns, #NetSpec
bottom_blob="", learn_all=False,
perLayerBatchNormFlag=True, num_dropouts=1,
in_place_pool5=True,
blob_prefix="", blob_suffix="",
layer_prefix="", layer_suffix="",
param_prefix="", param_suffix="",
top_blob=""):
    # ns = caffe.NetSpec instance
def str2blobname(string):
return blob_prefix+string+blob_suffix
def str2layername(string):
return layer_prefix+string+layer_suffix
def str2paramname(string):
return param_prefix+string+param_suffix
    parfoo = []
    parfoo.append(learned_param if learn_all else frozen_param)  # for the conv layers
    parfoo.append(decay_param if learn_all else frozen_param)  # for the fc layer (fc6)
if not bottom_blob:
ns[str2blobname("data")], ns[str2blobname("labelvec")]=L.Data(
name=str2layername("data"),
source="./SUN/pulkit_lmdbs/sun_imSz227_ntpc5_run1_train-lmdb",
transform_param=dict(
mean_value=[104,117,123]),
batch_size=5,
ntop=2,
backend=1);
bottom_blob=str2blobname("data")
#debug_here()
ns[str2blobname("conv1")], ns[str2blobname("relu1")]=conv_relu(
ns[bottom_blob], 11, 96, stride=4,
param=parfoo[0](name=str2paramname("conv1")),
names=[str2layername("conv1"), str2layername("relu1")]
)
#debug_here()
#print(str(ns.to_proto()))
ns[str2blobname("pool1")]= max_pool(
ns[str2blobname("relu1")], 3, stride=2,
name=str2layername("pool1")
)
if perLayerBatchNormFlag:
ns[str2blobname("bn1")]= L.BatchNorm(
ns[str2blobname("pool1")], in_place=True,
name=str2layername("bn1"),
param=frozen_param(name=str2paramname("bn1"),
n_param=3))
last_top=ns[str2blobname("bn1")];
else:
last_top=ns[str2blobname("pool1")];
ns[str2blobname("norm1")]= L.LRN(last_top,
local_size=5,
alpha=1e-4, beta=0.75, name=str2layername("norm1"))
ns[str2blobname("conv2")], ns[str2blobname("relu2")]=conv_relu(
ns[str2blobname("norm1")], 5, 256, pad=2, group=2,
param=parfoo[0](name=str2paramname("conv2")),
names=[str2layername("conv2"), str2layername("relu2")]
)
ns[str2blobname("pool2")]= max_pool(
ns[str2blobname("relu2")], 3, stride=2,
name=str2layername("pool2")
)
ns[str2blobname("norm2")]= L.LRN(ns[str2blobname("pool2")],
local_size=5,
alpha=1e-4, beta=0.75, name=str2layername("norm2"))
if perLayerBatchNormFlag:
ns[str2blobname("bn2")]= L.BatchNorm(
ns[str2blobname("norm2")], in_place=True,
name=str2layername("bn2"),
param=frozen_param(
name=str2paramname("bn2"),
n_param=3));
last_top=ns[str2blobname("bn2")];
else:
last_top=ns[str2blobname("norm2")];
ns[str2blobname("conv3")], ns[str2blobname("relu3")]= conv_relu(
last_top, 3, 384, pad=1,
param=parfoo[0](name=str2paramname("conv3")),
names=[str2layername("conv3"), str2layername("relu3")]
)
if perLayerBatchNormFlag:
ns[str2blobname("bn3")]= L.BatchNorm(
ns[str2blobname("relu3")], in_place=True,
name=str2layername("bn3"),
param=frozen_param(
name=str2paramname("bn3"),
n_param=3))
last_top=ns[str2blobname("bn3")];
else:
last_top=ns[str2blobname("relu3")];
ns[str2blobname("conv4")], ns[str2blobname("relu4")]= conv_relu(
last_top, 3, 384, pad=1, group=2,
param=parfoo[0](name=str2paramname("conv4")),
names=[str2layername("conv4"), str2layername("relu4")]
)
if perLayerBatchNormFlag:
ns[str2blobname("bn4")]= L.BatchNorm(
ns[str2blobname("relu4")], in_place=True,
name=str2layername("bn4"),
param=frozen_param(
name=str2paramname("bn4"),
n_param=3))
last_top=ns[str2blobname("bn4")];
else:
last_top=ns[str2blobname("relu4")];
ns[str2blobname("conv5")], ns[str2blobname("relu5")]= conv_relu(
last_top, 3, 256, pad=1, group=2,
param=parfoo[0](name=str2paramname("conv5")),
names=[str2layername("conv5"), str2layername("relu5")]
)
ns[str2blobname("pool5")]= max_pool(
ns[str2blobname("relu5")], 3, stride=2,
name=str2layername("pool5"))
if perLayerBatchNormFlag:
ns[str2blobname("bn5")]= L.BatchNorm(
ns[str2blobname("pool5")], in_place=True,
name=str2layername("bn5"),
param=frozen_param(
name=str2paramname("bn5"),
n_param=3));
last_top=ns[str2blobname("bn5")];
last_top_root="bn5"
else:
last_top=ns[str2blobname("pool5")];
last_top_root="pool5"
pool5_top=str2blobname("drop_pool5" + ("" if in_place_pool5 else "_s") );
if num_dropouts>=1:
ns[pool5_top]= L.Dropout(
last_top, in_place=in_place_pool5,
dropout_param=dict(dropout_ratio=0.5),
name=str2layername("drop_pool5"))
else:
ns[pool5_top]= L.Power(
last_top, in_place=in_place_pool5,
power_param=dict(scale=0.5),
name=str2layername("drop_pool5"))
ns[str2blobname("fc6")], ns[str2blobname("relu6")]= fc_relu(
ns[pool5_top], 4096,
param=parfoo[1](name=str2paramname("fc6")),
names=[str2layername("fc6"), str2layername("relu6")])
    # last-layer batch norm... applied independently of perLayerBatchNormFlag
if not top_blob:
top_blob=str2blobname("bn6");
ns[top_blob]= L.BatchNorm(
ns[str2blobname("relu6")], in_place=True,
name=str2layername("bn6"),
param=frozen_param(
name=str2paramname("bn6"),
n_param=3))
return ns
def generate_classifier(
ns, #NetSpec
bottom_blob=[], learn_all=False,
propagate_down=True,
num_dropouts=0,
blob_prefix="", blob_suffix="",
layer_prefix="", layer_suffix="",
param_prefix="", param_suffix="",
loss_name="",
acc_name="",
num_cls=397,
loss_weight=1
):
    # ns = caffe.NetSpec instance
def str2blobname(string):
return blob_prefix+string+blob_suffix
def str2layername(string):
return layer_prefix+string+layer_suffix
def str2paramname(string):
return param_prefix+string+param_suffix
parfoo = decay_param if learn_all else frozen_param
if not bottom_blob:
ns[str2blobname("data")], ns[str2blobname("labelvec")]=L.Data(
name=str2layername("data"),
source="./SUN/pulkit_lmdbs/sun_imSz227_ntpc5_run1_train-lmdb",
transform_param=dict(
mean_value=[104,117,123]),
batch_size=5,
ntop=2,
backend=1);
bottom_blob=[str2blobname("data"), str2blobname("labelvec")];
if num_dropouts>=1:
ns[str2blobname("drop"+bottom_blob[0])]=L.Dropout(
ns[bottom_blob[0]], in_place=True,
dropout_param=dict(dropout_ratio=0.5),
name=str2layername("drop_"+bottom_blob[0]),
);
last_top=str2blobname("drop"+bottom_blob[0]);
else:
last_top=bottom_blob[0];
#ns[str2blobname("drop"+bottom_blob[0])]=L.Power(
# ns[bottom_blob[0]], in_place=True,
# power_param=dict(scale=0.5),
# name=str2layername("drop_"+bottom_blob[0]),
# propagate_down=propagate_down);
ns[str2blobname("prefinal")]=L.InnerProduct(
ns[last_top],
param=parfoo(),
inner_product_param=dict(
num_output=500,
weight_filler= dict(type="gaussian", std=0.005),
bias_filler=dict(type="constant", value=1)
),
propagate_down=propagate_down,
name=str2layername("prefinal")
)
ns[str2blobname("prefinal_relu")]=L.ReLU(
ns[str2blobname("prefinal")],
in_place=True,
name=str2layername("prefinal_relu")
)
ns[str2blobname("final")]=L.InnerProduct(
ns[str2blobname("prefinal_relu")],
param=parfoo(),
inner_product_param=dict(
num_output=num_cls,
weight_filler= dict(type="gaussian", std=0.005),
bias_filler=dict(type="constant", value=1)
),
name=str2layername("final")
)
ns[loss_name]=L.SoftmaxWithLoss(
ns[str2blobname("final")],
ns[bottom_blob[1]],
name=loss_name,
loss_weight=loss_weight
)
ns[acc_name]=L.Accuracy(
ns[str2blobname("final")],
ns[bottom_blob[1]],
name=acc_name
)
return ns
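# Minimal usage sketch (assumes pycaffe is importable and the LMDB source in
# generate_conv1_to_bn6 exists; 'example_trainval.prototxt' is an illustrative
# filename):
#   ns = caffe.NetSpec()
#   ns = generate_conv1_to_bn6(ns, learn_all=False)
#   ns = generate_classifier(ns, bottom_blob=["bn6", "labelvec"],
#                            loss_name="cls_loss", acc_name="cls_acc")
#   with open('example_trainval.prototxt', 'w') as f:
#       f.write(str(ns.to_proto()))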
def generate_contrastive_loss(
ns, #NetSpec
bottom_blob=[], learn_all=False,
blob_prefix="", blob_suffix="",
layer_prefix="", layer_suffix="",
param_prefix="", param_suffix="",
loss_margin=1,
loss_weight=1,
):
    # ns = caffe.NetSpec instance
def str2blobname(string):
return blob_prefix+string+blob_suffix
def str2layername(string):
return layer_prefix+string+layer_suffix
def str2paramname(string):
return param_prefix+string+param_suffix
parfoo = learned_param if learn_all else frozen_param
ns[str2blobname("loss")]=L.ContrastiveLoss(
ns[bottom_blob[0]],
ns[bottom_blob[1]],
ns[bottom_blob[2]],
name=str2layername("loss"),
loss_weight=loss_weight,
contrastive_loss_param=dict(
margin=loss_margin
)
)
ns[str2blobname("dist")]=L.EuclideanDist(
ns[bottom_blob[0]],
ns[bottom_blob[1]],
name=str2layername("dist"),
loss_weight=0
)
ns[str2blobname("AP")], ns[str2blobname("AUROC")]=L.AveragePrec(
ns[str2blobname("dist")],
ns[bottom_blob[2]],
name=str2layername("AP"),
loss_weight=[0,0],
ntop=2
)
return ns
def generate_equivariant_map(
ns, #NetSpec
bottom_blob="", learn_all=False,
blob_prefix="", blob_suffix="",
layer_prefix="", layer_suffix="",
param_prefix="", param_suffix="",
bottleneck_size=128,
orig_dim=4096,
top_blob="",
nonDiscrete_flag=False,
motion_blob="",
):
    # ns = caffe.NetSpec instance
def str2blobname(string):
return blob_prefix+string+blob_suffix
def str2layername(string):
return layer_prefix+string+layer_suffix
def str2paramname(string):
return param_prefix+string+param_suffix
parfoo = decay_param if learn_all else frozen_param
ns[str2blobname("map1")], ns[str2blobname("map2")]=fc_relu(
ns[bottom_blob], bottleneck_size,
param=parfoo(),
weight_filler=dict(type="xavier"),
bias_filler=dict(type="constant")
)
if nonDiscrete_flag:
ns[str2blobname("mot-map1")], ns[str2blobname("mot-map2")]=fc_relu(
ns[motion_blob], bottleneck_size,
param=parfoo(),
weight_filler=dict(type="xavier"),
bias_filler=dict(type="constant")
)
ns[str2blobname("map2-motion")]=L.Concat(
ns[str2blobname("map2")],
ns[str2blobname("mot-map2")]);
next_bottom=str2blobname("map2-motion");
else:
next_bottom=str2blobname("map2");
ns[top_blob]=L.InnerProduct(
ns[next_bottom],
param=parfoo(),
inner_product_param=dict(
num_output=orig_dim,
weight_filler= dict(type="xavier"),
bias_filler=dict(type="constant"),
),
name=top_blob
)
return ns
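# Usage sketch (blob names come from the generators above; 'bn6_mapped' is an
# illustrative top-blob name):
#   ns = generate_equivariant_map(ns, bottom_blob="bn6",
#                                 top_blob="bn6_mapped", orig_dim=4096)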
class CaffeSolver:
"""
Caffesolver is a class for creating a solver.prototxt file. It sets default
values and can export a solver parameter file.
Note that all parameters are stored as strings. Strings variables are
stored as strings in strings.
"""
def __init__(self, testnet_prototxt_path="testnet.prototxt",
trainnet_prototxt_path="trainnet.prototxt", debug=False):
self.sp = {}
# critical:
self.sp['base_lr'] = '0.0001'
self.sp['momentum'] = '0.9'
self.sp['momentum2'] = '0.999'
# speed:
self.sp['test_iter'] = '10'
self.sp['test_interval'] = '500'
# looks:
self.sp['display'] = '20'
self.sp['snapshot'] = '10000'
        self.sp['snapshot_prefix'] = '"../caffe_snapshots/default_snapshot"' # string within a string!
# learning rate policy
self.sp['lr_policy'] = '"step"'
self.sp['stepsize'] = '5000'
# solver algorithm
self.sp['type'] = '"Adam"'
# important, but rare:
self.sp['gamma'] = '0.5'
self.sp['weight_decay'] = '0.0005'
#self.sp['train_net'] = '"' + trainnet_prototxt_path + '"'
#self.sp['test_net'] = '"' + testnet_prototxt_path + '"'
# pretty much never change these.
self.sp['max_iter'] = '30000'
#self.sp['test_initialization'] = 'true'
#self.sp['test_compute_loss'] = 'true'
#self.sp['average_loss'] = '25' # this has to do with the display.
#self.sp['iter_size'] = '1' # this is for accumulating gradients
self.sp['solver_mode'] = 'GPU'
if (debug):
self.sp['max_iter'] = '12'
self.sp['test_iter'] = '1'
self.sp['test_interval'] = '4'
self.sp['display'] = '1'
self.sp['solver_mode']='CPU'
    def add_from_file(self, filepath):
        """
        Reads a caffe solver prototxt file and updates the CaffeSolver
        instance parameters.
        """
        with open(filepath, 'r') as f:
            for line in f:
                if not line.strip() or line.startswith('#'):
                    continue  # skip blank lines and comments
                key, _, value = line.partition(':')  # split on the first ':' only
                self.sp[key.strip()] = value.strip()
    def write(self, filepath):
        """
        Export solver parameters to the file "filepath", sorted alphabetically.
        """
        with open(filepath, 'w') as f:
            for key, value in sorted(self.sp.items()):
                if not isinstance(value, str):
                    raise TypeError('All solver parameters must be strings')
                f.write('%s: %s\n' % (key, value))
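# Usage sketch ('mysolver.prototxt' and the net path are illustrative; note
# that string-valued prototxt fields must be quoted inside the string):
#   solver = CaffeSolver()
#   solver.sp['base_lr'] = '0.001'
#   solver.sp['net'] = '"../condor/100_trainval.prototxt"'
#   solver.write('mysolver.prototxt')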