'''
Build a tweet sentiment analyzer
'''
import theano
import theano.tensor as tensor
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import cPickle as pkl
import numpy
import copy
import random
from collections import OrderedDict
import imdb
datasets = {'imdb': (imdb.load_data, imdb.prepare_data)}
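# The external `imdb` module (as in the Theano LSTM tutorial) must provide
# load_data and prepare_data; 'imdb' is the only dataset registered here.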
def get_minibatches_idx(n, nb_batches, shuffle=False):
    idx_list = numpy.arange(n, dtype="int32")
    if shuffle:
        random.shuffle(idx_list)
    minibatches = []
    minibatch_start = 0
    for i in range(nb_batches):
        if i < n % nb_batches:
            minibatch_size = n // nb_batches + 1
        else:
            minibatch_size = n // nb_batches
        minibatches.append(idx_list[minibatch_start:
                                    minibatch_start + minibatch_size])
        minibatch_start += minibatch_size
    return zip(range(nb_batches), minibatches)
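# Example (hypothetical values): get_minibatches_idx(10, 3) yields
# [(0, [0, 1, 2, 3]), (1, [4, 5, 6]), (2, [7, 8, 9])]; the first
# n % nb_batches minibatches receive one extra index.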
def get_dataset(name):
    return datasets[name][0], datasets[name][1]
def zipp(params, tparams):
    for kk, vv in params.iteritems():
        tparams[kk].set_value(vv)
def unzip(zipped):
    new_params = OrderedDict()
    for kk, vv in zipped.iteritems():
        new_params[kk] = vv.get_value()
    return new_params
def itemlist(tparams):
    return [vv for kk, vv in tparams.iteritems()]
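# Dropout: when use_noise is 1 (training), activations are multiplied by a
# sampled Bernoulli(p=0.5) mask; when 0 (evaluation), they are scaled by the
# keep probability 0.5 so expected magnitudes match.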
def dropout_layer(state_before, use_noise, trng):
    proj = tensor.switch(use_noise,
                         (state_before *
                          trng.binomial(state_before.shape,
                                        p=0.5, n=1,
                                        dtype=state_before.dtype)),
                         state_before * 0.5)
    return proj
def _p(pp, name):
    return '%s_%s' % (pp, name)
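# _p namespaces parameters per layer, e.g. _p('lstm', 'W') -> 'lstm_W'.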
def init_params(options):
    params = OrderedDict()
    # embedding
    randn = numpy.random.rand(options['n_words'],
                              options['dim_proj'])
    params['Wemb'] = (0.01 * randn).astype('float32')
    # rconv
    params = get_layer(options['encoder'])[0](options,
                                              params,
                                              prefix=options['encoder'])
    # classifier
    params['U'] = 0.01 * numpy.random.randn(options['dim_proj'],
                                            options['ydim']).astype('float32')
    params['b'] = numpy.zeros((options['ydim'],)).astype('float32')
    return params
def load_params(path, params):
    pp = numpy.load(path)
    for kk, vv in params.iteritems():
        if kk not in pp:
            raise Warning('%s is not in the archive' % kk)
        params[kk] = pp[kk]
    return params
def init_tparams(params):
    tparams = OrderedDict()
    for kk, pp in params.iteritems():
        tparams[kk] = theano.shared(params[kk], name=kk)
    return tparams
layers = {'ff': ('param_init_fflayer', 'fflayer'),
          'rconv': ('param_init_rconv', 'rconv_layer'),
          'lstm': ('param_init_lstm', 'lstm_layer')}
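# Layer registry: maps an encoder name to the names of its parameter
# initializer and apply function; get_layer resolves both with eval().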
def get_layer(name):
    fns = layers[name]
    return (eval(fns[0]), eval(fns[1]))
def param_init_fflayer(options, params, prefix='ff'):
    weights = numpy.random.randn(options['dim_proj'], options['dim_proj'])
    biases = numpy.zeros((options['dim_proj'],))
    params[_p(prefix, 'W')] = 0.01 * weights.astype('float32')
    params[_p(prefix, 'b')] = biases.astype('float32')
    return params
def fflayer(tparams, state_below, options, prefix='rconv', **kwargs):
    pre_act = (tensor.dot(state_below,
                          tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')])
    return eval(options['activ'])(pre_act)
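# Orthogonal initialization: take the SVD of a square Gaussian matrix and use
# one of its orthonormal factors as the weight matrix.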
def ortho_weight(ndim):
    W = numpy.random.randn(ndim, ndim)
    u, s, v = numpy.linalg.svd(W)
    return u.astype('float32')
def param_init_lstm(options, params, prefix='lstm'):
    W = numpy.concatenate([ortho_weight(options['dim_proj']),
                           ortho_weight(options['dim_proj']),
                           ortho_weight(options['dim_proj']),
                           ortho_weight(options['dim_proj'])], axis=1)
    params[_p(prefix, 'W')] = W
    U = numpy.concatenate([ortho_weight(options['dim_proj']),
                           ortho_weight(options['dim_proj']),
                           ortho_weight(options['dim_proj']),
                           ortho_weight(options['dim_proj'])], axis=1)
    params[_p(prefix, 'U')] = U
    b = numpy.zeros((4 * options['dim_proj'],))
    params[_p(prefix, 'b')] = b.astype('float32')
    return params
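# W (input-to-hidden) and U (hidden-to-hidden) each stack four
# dim_proj x dim_proj blocks along axis 1, one per gate; lstm_layer slices the
# pre-activation in the order: input gate, forget gate, output gate, cell
# candidate.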
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1
    assert mask is not None
    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n*dim:(n+1)*dim]
        return _x[:, n*dim:(n+1)*dim]
    def _step(m_, x_, h_, c_):
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_
        preact += tparams[_p(prefix, 'b')]
        i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
        f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
        o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
        c = tensor.tanh(_slice(preact, 3, options['dim_proj']))
        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_
        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_
        return h, c
    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                   tparams[_p(prefix, 'b')])
    dim_proj = options['dim_proj']
    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[tensor.alloc(0., n_samples,
                                                           dim_proj),
                                              tensor.alloc(0., n_samples,
                                                           dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)
    return rval[0]
def param_init_rconv(options, params, prefix='rconv'):
    params[_p(prefix, 'W')] = ortho_weight(options['dim_proj'])
    params[_p(prefix, 'U')] = ortho_weight(options['dim_proj'])
    b = numpy.zeros((options['dim_proj'],)).astype('float32')
    params[_p(prefix, 'b')] = b
    gw = 0.01 * numpy.random.randn(options['dim_proj'], 3).astype('float32')
    params[_p(prefix, 'GW')] = gw
    gu = 0.01 * numpy.random.randn(options['dim_proj'], 3).astype('float32')
    params[_p(prefix, 'GU')] = gu
    params[_p(prefix, 'Gb')] = numpy.zeros((3,)).astype('float32')
    return params
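# Gated recursive convolution layer: at every scan step each position is
# combined with its left neighbour; a 3-way softmax gate (GW, GU, Gb) mixes
# the new activation, the left neighbour and the current activation. After
# nsteps-1 steps the "root" representation is read off at each sequence's
# last valid position.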
def rconv_layer(tparams, state_below, options, prefix='rconv', mask=None):
    nsteps = state_below.shape[0]
    assert mask is not None
    def _step(m_, p_):
        l_ = p_
        # new activation
        ps_ = tensor.zeros_like(p_)
        ps_ = tensor.set_subtensor(ps_[1:], p_[:-1])
        ls_ = ps_
        ps_ = tensor.dot(ps_, tparams[_p(prefix, 'U')])
        pl_ = tensor.dot(p_, tparams[_p(prefix, 'W')])
        newact = eval(options['activ'])(ps_+pl_+tparams[_p(prefix, 'b')])
        # gater
        gt_ = (tensor.dot(ls_, tparams[_p(prefix, 'GU')]) +
               tensor.dot(l_, tparams[_p(prefix, 'GW')]) +
               tparams[_p(prefix, 'Gb')])
        if l_.ndim == 3:
            gt_shp = gt_.shape
            gt_ = gt_.reshape((gt_shp[0] * gt_shp[1], gt_shp[2]))
        gt_ = tensor.nnet.softmax(gt_)
        if l_.ndim == 3:
            gt_ = gt_.reshape((gt_shp[0], gt_shp[1], gt_shp[2]))
        if p_.ndim == 3:
            gn = gt_[:, :, 0].dimshuffle(0, 1, 'x')
            gl = gt_[:, :, 1].dimshuffle(0, 1, 'x')
            gr = gt_[:, :, 2].dimshuffle(0, 1, 'x')
        else:
            gn = gt_[:, 0].dimshuffle(0, 'x')
            gl = gt_[:, 1].dimshuffle(0, 'x')
            gr = gt_[:, 2].dimshuffle(0, 'x')
        act = newact * gn + ls_ * gl + l_ * gr
        if p_.ndim == 3:
            m_ = m_.dimshuffle('x', 0, 'x')
        else:
            m_ = m_.dimshuffle('x', 0)
        return tensor.switch(m_, act, l_)
    rval, updates = theano.scan(_step,
                                sequences=[mask[1:]],
                                outputs_info=[state_below],
                                name='layer_%s' % prefix,
                                n_steps=nsteps-1)
    seqlens = tensor.cast(mask.sum(axis=0), 'int64')-1
    roots = rval[-1]
    if state_below.ndim == 3:
        def _grab_root(seqlen, one_sample, prev_sample):
            return one_sample[seqlen]
        dim_proj = options['dim_proj']
        roots, updates = theano.scan(_grab_root,
                                     sequences=[seqlens,
                                                roots.dimshuffle(1, 0, 2)],
                                     outputs_info=[tensor.alloc(0., dim_proj)],
                                     name='grab_root_%s' % prefix)
    else:
        roots = roots[seqlens]  # there should be only one, so it's fine.
    return roots
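# Each optimizer below returns two compiled functions: f_grad_shared(x, mask, y)
# computes the cost and stores the gradients in shared variables, and
# f_update(lr) applies the corresponding parameter update.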
def adadelta(lr, tparams, grads, x, mask, y, cost):
    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * numpy.float32(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]
    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]
    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup+rg2up)
    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads,
                                     running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]
    f_update = theano.function([lr], [], updates=ru2up+param_up,
                               on_unused_input='ignore')
    return f_grad_shared, f_update
def rmsprop(lr, tparams, grads, x, mask, y, cost):
    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_grads = [theano.shared(p.get_value() * numpy.float32(0.),
                                   name='%s_rgrad' % k)
                     for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]
    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]
    f_grad_shared = theano.function([x, mask, y], cost,
                                    updates=zgup + rgup + rg2up)
    updir = [theano.shared(p.get_value() * numpy.float32(0.),
                           name='%s_updir' % k)
             for k, p in tparams.iteritems()]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
                                            running_grads2)]
    param_up = [(p, p + udn[1])
                for p, udn in zip(itemlist(tparams), updir_new)]
    f_update = theano.function([lr], [], updates=updir_new+param_up,
                               on_unused_input='ignore')
    return f_grad_shared, f_update
def sgd(lr, tparams, grads, x, mask, y, cost):
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function([x, mask, y], cost, updates=gsup)
    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
    f_update = theano.function([lr], [], updates=pup)
    return f_grad_shared, f_update
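# build_model wires the full graph: x is an (n_timesteps, n_samples) matrix of
# word indices, mask marks valid (non-padding) positions, y holds the class
# labels. With the lstm encoder the hidden states are mean-pooled over time
# before the softmax classifier.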
def build_model(tparams, options):
    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))
    x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('mask', dtype='float32')
    y = tensor.vector('y', dtype='int64')
    n_timesteps = x.shape[0]
    n_samples = x.shape[1]
    emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps,
                                                n_samples,
                                                options['dim_proj']])
    proj = get_layer(options['encoder'])[1](tparams, emb, options,
                                            prefix=options['encoder'],
                                            mask=mask)
    if options['encoder'] == 'lstm':
        proj = (proj * mask[:, :, None]).sum(axis=0)
        proj = proj / mask.sum(axis=0)[:, None]
    if options['use_dropout']:
        proj = dropout_layer(proj, use_noise, trng)
    pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U'])+tparams['b'])
    f_pred_prob = theano.function([x, mask], pred)
    f_pred = theano.function([x, mask], pred.argmax(axis=1))
    cost = -tensor.log(pred[tensor.arange(n_samples), y] + 1e-8).mean()
    return trng, use_noise, x, mask, y, f_pred_prob, f_pred, cost
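# pred_probs and pred_error run the compiled prediction functions over an
# iterator of minibatch indices; pred_error returns 1 - accuracy, and
# pred_probs assumes two output classes. For a minibatch of 16 sentences
# padded to 50 timesteps (hypothetical shapes), x and mask are (50, 16) arrays
# and f_pred(x, mask) is a length-16 vector of predicted classes.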
def pred_probs(f_pred_prob, prepare_data, data, iterator, verbose=False):
    n_samples = len(data[0])
    probs = numpy.zeros((n_samples, 2)).astype('float32')
    n_done = 0
    for _, valid_index in iterator:
        x, mask, y = prepare_data([data[0][t] for t in valid_index],
                                  numpy.array(data[1])[valid_index],
                                  maxlen=None)
        pred_probs = f_pred_prob(x, mask)
        probs[valid_index, :] = pred_probs
        n_done += len(valid_index)
        if verbose:
            print '%d/%d samples classified' % (n_done, n_samples)
    return probs
def pred_error(f_pred, prepare_data, data, iterator, verbose=False):
    valid_err = 0
    for _, valid_index in iterator:
        x, mask, y = prepare_data([data[0][t] for t in valid_index],
                                  numpy.array(data[1])[valid_index],
                                  maxlen=None)
        preds = f_pred(x, mask)
        targets = numpy.array(data[1])[valid_index]
        valid_err += (preds == targets).sum()
    valid_err = 1. - numpy.float32(valid_err) / len(data[0])
    return valid_err
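# train() is the driver: it builds the model, compiles the chosen optimizer,
# and loops over epochs with periodic validation, checkpointing and
# patience-based early stopping. A validFreq or saveFreq of -1 means "once per
# epoch".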
def train(dim_proj=100,
          patience=10,
          max_epochs=5000,
          dispFreq=100,
          activ='lambda x: tensor.tanh(x)',
          decay_c=0.,
          lrate=0.01,
          n_words=100000,
          data_sym=False,
          optimizer='rmsprop',
          encoder='rconv',
          saveto='model.npz',
          noise_std=0.,
          validFreq=1000,
          saveFreq=1000,  # save the parameters after every saveFreq updates
          maxlen=50,
          batch_size=16,
          valid_batch_size=16,
          dataset='sentiment140',
          use_dropout=False):
    # Model options
    model_options = locals().copy()
    load_data, prepare_data = get_dataset(dataset)
    print 'Loading data'
    train, valid, test = load_data(n_words=n_words, valid_portion=0.01)
    ydim = numpy.max(train[1])+1
    model_options['ydim'] = ydim
    print 'Building model'
    params = init_params(model_options)
    tparams = init_tparams(params)
    (trng, use_noise, x, mask,
     y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U']**2).sum()
        weight_decay *= decay_c
        cost += weight_decay
    f_cost = theano.function([x, mask, y], cost)
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    f_grad = theano.function([x, mask, y], grads)
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads,
                                              x, mask, y, cost)
    print 'Optimization'
    kf_valid = get_minibatches_idx(len(valid[0]),
                                   len(valid[0]) / valid_batch_size,
                                   shuffle=True)
    kf_test = get_minibatches_idx(len(test[0]),
                                  len(test[0]) / valid_batch_size,
                                  shuffle=True)
    history_errs = []
    best_p = None
    bad_counter = 0
    if validFreq == -1:
        validFreq = len(train[0])/batch_size
    if saveFreq == -1:
        saveFreq = len(train[0])/batch_size
    uidx = 0
    estop = False
    for eidx in xrange(max_epochs):
        n_samples = 0
        kf = get_minibatches_idx(len(train[0]), len(train[0])/batch_size,
                                 shuffle=True)
        for _, train_index in kf:
            n_samples += train_index.shape[0]
            uidx += 1
            use_noise.set_value(1.)
            y = [train[1][t] for t in train_index]
            x, mask, y = prepare_data([train[0][t] for t in train_index],
                                      y, maxlen=maxlen)
            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                continue
            cost = f_grad_shared(x, mask, y)
            f_update(lrate)
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                train_err = pred_error(f_pred, prepare_data, train, kf)
                valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
                test_err = pred_error(f_pred, prepare_data, test, kf_test)
                history_errs.append([valid_err, test_err])
                if (uidx == 0 or
                        valid_err <= numpy.array(history_errs)[:, 0].min()):
                    best_p = unzip(tparams)
                    bad_counter = 0
                if (len(history_errs) > patience and
                        valid_err >= numpy.array(history_errs)[:-patience,
                                                               0].min()):
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break
                print ('Train ', train_err, 'Valid ', valid_err,
                       'Test ', test_err)
        print 'Seen %d samples' % n_samples
        if estop:
            break
    if best_p is not None:
        zipp(best_p, tparams)
    use_noise.set_value(0.)
    train_err = pred_error(f_pred, prepare_data, train, kf)
    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
    test_err = pred_error(f_pred, prepare_data, test, kf_test)
    print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err
    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p, train_err=train_err,
                valid_err=valid_err, test_err=test_err,
                history_errs=history_errs, **params)
    return train_err, valid_err, test_err
def main(job_id, params):
    print ('Anything printed here will end up in the output directory '
           'for job #%d' % job_id)
    print params
    use_dropout = True if params['use-dropout'][0] else False
    trainerr, validerr, testerr = train(saveto=params['model'][0],
                                        dim_proj=params['dim-proj'][0],
                                        n_words=params['n-words'][0],
                                        decay_c=params['decay-c'][0],
                                        lrate=params['learning-rate'][0],
                                        optimizer=params['optimizer'][0],
                                        activ=params['activ'][0],
                                        encoder=params['encoder'][0],
                                        maxlen=600,
                                        batch_size=16,
                                        valid_batch_size=16,
                                        validFreq=10000,
                                        dispFreq=10,
                                        saveFreq=100000,
                                        dataset='imdb',
                                        use_dropout=use_dropout)
    return validerr
if __name__ == '__main__':
    main(0, {
        'model': ['model_lstm.npz'],
        'encoder': ['lstm'],
        'dim-proj': [128],
        'n-words': [10000],
        'optimizer': ['adadelta'],
        'activ': ['lambda x: tensor.tanh(x)'],
        'decay-c': [0.],
        'use-dropout': [1],
        'learning-rate': [0.0001]})
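# Hypothetical invocation: run this file directly with Python 2; with the
# options above it trains the lstm encoder with adadelta on the IMDB dataset.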