""" downhill: classify mnist digits with logistic regression

see http://downhill.readthedocs.io -- 10 pages, well written
optimizers: SGD RMSProp Ada* with momentum
"""
# see also: theanets.readthedocs.io
# https://gist.github.com/denis-bz Downhill-mnist.md

from __future__ import division
import sys
import climate
import logging
import numpy as np
import theano
import theano.tensor as TT
import downhill  # $downhill/base.py $downhill/adaptive.py
from skdata_mnist import load_mnist
from etc import etcutil as nu

__version__ = "2016-11-19 Nov denis-bz-py t-online de"

np.set_printoptions( threshold=20, edgeitems=10, linewidth=140,
    formatter = dict( float = lambda x: "%.2g" % x ))  # print float arrays with %.2g

def val( x ):
    """ -> the value of a theano shared variable, else x itself """
    return x.get_value() if hasattr( x, "get_value" ) else x

print "\n", 80 * "-"
print " ".join(sys.argv)

#...............................................................................
# default parameters --
algo = 'rmsprop'
rate = 1 if algo == 'sgd' else .002  # evaluated here: override rate too when overriding algo below
momentum = .9
nesterov = False
w2loss = 0  # L2 penalty: loss + w2loss * (W * W).mean()
    # w2loss .001: no difference, .01: a bit worse -- loss and score are both noisy
patience = 3  # quit unless valid loss improves by min_improvement within patience * validate_every epochs
min_improvement = .01
validate_every = 3  # default 10
batch_size = 128
max_updates = None

seed = 1
tag = ".tmp"  # save > tag.npz
save = 0
log = 1

# to change these params, run this.py  a=1  b=None  'c = ...'  in sh or ipython
for arg in sys.argv[1:]:
    exec( arg )
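# e.g. (quote strings for the shell; argv is exec'd, so trusted input only):
#   python this.py  "algo='sgd'"  rate=.5  save=1  "tag = 'mnist-sgd'"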

np.random.seed( seed )

Params = """
    algo  %s
    rate  %.3g
    momentum  %.3g
    nesterov  %d
    w2loss  %.3g
    patience  %d
    min_improvement  %.3g
    validate_every  %d
    batch_size  %d
    seed  %d
""" % ( algo, rate, momentum, nesterov, w2loss, patience, min_improvement,
    validate_every, batch_size, seed )
print Params
print "versions: downhill %s  theano %s" % (
    downhill.__version__, theano.__version__ )

#...............................................................................
(xtrain, ytrain), (xvalid, yvalid) = load_mnist( normalize=1 )
train_dataset = downhill.Dataset( [xtrain, ytrain], name="train",
    batch_size=batch_size, rng=seed )
valid_dataset = downhill.Dataset( [xvalid, yvalid], name="valid",
    batch_size=batch_size, rng=seed )
# better: two validation sets val, val2, to put error bars on the score
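# a sketch of that split (names valid1 / valid2 are illustrative, not wired in below):
#   half = len(xvalid) // 2
#   valid1 = downhill.Dataset( [xvalid[:half], yvalid[:half]], name="valid1",
#       batch_size=batch_size, rng=seed )
#   valid2 = downhill.Dataset( [xvalid[half:], yvalid[half:]], name="valid2",
#       batch_size=batch_size, rng=seed )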

#...............................................................................
# theano dataflow graphs: inputs x y, state W b -> loss
x = TT.fmatrix('x')  # data, 28x28 pixels 0 .. 1
y = TT.ivector('y')  # labels, ints 0 .. 9
W = theano.shared( np.zeros( (784, 10), dtype=np.float32 ), name='W' )
b = theano.shared( np.zeros( 10, dtype=np.float32 ), name='b' )

xdotW = TT.dot( x, W ) + b
p_y_given_x = TT.nnet.softmax( xdotW )  # each row: probabilities of the digits 0 .. 9
probf = theano.function( [x], p_y_given_x )  # compiled, e.g. probf( xvalid ); handy in ipython

# negative log-likelihood, from http://deeplearning.net/tutorial/code/logistic_sgd.py
W2loss = w2loss * (W * W).mean()
loss = -TT.mean( TT.log( p_y_given_x )[TT.arange(y.shape[0]), y] ) + W2loss
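# the fancy indexing picks each row's log-probability of its true label, so
# loss = - mean_i log p( y_i | x_i ) + W2loss. Equivalently (a sketch, not used below):
#   loss = TT.nnet.categorical_crossentropy( p_y_given_x, y ).mean() + W2loss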

# downhill computes gradients itself; by hand, wrt the parameters, it would be
#   grad = TT.grad( loss, [W, b] )
#   gradf = theano.function( [x, y], grad, name='gradf' )

def predict( x ):
    """ -> the most probable digit for each row of x """
    # argmax of the logits == argmax of softmax( logits ), no softmax needed here
    return np.argmax( x.dot( W.get_value() ) + b.get_value(), axis=1 )

def error_percent( x, y ):
    ypred = predict( x )
    return (y != ypred).mean() * 100, ypred
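# sanity check before training: with W = b = 0, predict() is all digit-0,
# so error_percent( xvalid, yvalid )[0] starts near 90 %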

if log:
    climate.log.TTY_Formatter._DATE_FORMAT = ' '  # blank out timestamps, dates kill diff
    climate.enable_default_logging()

#...............................................................................
opt = downhill.build(
    algo=algo,
    loss=loss,
    params=[W, b],
    inputs=[x, y],
    monitors=[('W2loss', W2loss)],
    monitor_gradients=True,
)

# monitor, save these --
mon = nu.Bag(  # a dict with mon.key == mon["key"], mon.<tab> in ipython
    tloss = [],
    vloss = [],
    W = [],
    b = [],
)
minerr = np.inf
wprev = 0
iter = 0
stepq = np.r_[1, 10, 50, 90, 99]  # percentiles of each epoch's step in W
print "step quantiles:", stepq
print ""

#...............................................................................
for tm, vm in opt.iterate( train_dataset, valid_dataset,
        learning_rate=rate,
        momentum=momentum,
        nesterov=nesterov,
        patience=patience,
        min_improvement=min_improvement,
        validate_every=validate_every,
        max_gradient_elem=0,
        max_gradient_norm=0,  # gradient clipping: set at most one of these
        max_updates=max_updates,
        ):
    iter += 1
    tm = nu.Bag(tm)
    vm = nu.Bag(vm)  # most recent validation monitors, default validate_every=10
    tloss = tm.loss
    vloss = vm.loss
    Wval = W.get_value()
    wstep = Wval - wprev  # how far this epoch moved W
    wprev = Wval
    for _k, _v in mon.items():
        _v.append( val( eval( _k )))  # mon.tloss.append( tloss ) ...

    if (iter % validate_every) == 0:
        verr, ypredict = error_percent( xvalid, yvalid )  # noisier than loss
        if verr < minerr:
            minerr = verr
            minepoch = iter - 1  # 0-origin index into the mon arrays
            minpredict = ypredict
        p = nu.ints( np.percentile( wstep, q=stepq ) * 100 )
        print "error rate: %.2f %%  valid-loss %-6.3g  train-loss %-6.3g  epoch %d  stepq %s " % (
            verr, vm.loss, tm.loss, iter, p )

for _k, _v in mon.items():
    mon[_k] = np.array( _v )  # lists -> arrays

try:
    from etc import confus
    confus.pconfus( yvalid, minpredict, label=algo )  # print a confusion matrix
except ImportError:
    pass

# are rmsprop steps >> sgd steps ?
W = mon.W[minepoch]  # the best epoch's weights, shape (784, 10) -- rebinds the name W
print "best W ", nu.quantiles( W, q=[1, 10, 50, 90, 99] )
print "best b:", mon.b[minepoch]

if save:  # to plot
    out = tag + ".npz"
    print "\nsaving to", out
    mon.Params = Params
    mon.minerr = minerr
    mon.minepoch = minepoch  # 0-origin
    mon.minpredict = minpredict.astype(np.uint8)
    mon.yvalidate = yvalid
    # pdict.pdict( mon )
    nu.mkdirpart( out )
    np.savez( out, **mon )
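
# to look at a saved run later, a sketch (keys as saved above):
#   d = np.load( ".tmp.npz" )
#   print d["Params"], d["minerr"]
#   # plot d["tloss"], d["vloss"] ...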