# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
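'''Train a character-level RNN language model over plain text.

The default data set is Karpathy's char-rnn Linux-kernel text (see the URL
in the __main__ block); this script targets Python 2 and SINGA's layer API.
'''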
import cPickle as pickle
import numpy as np
import argparse
import urllib2
import traceback
import time
import sys

from singa import layer
from singa import loss
from singa import device
from singa import tensor
from singa import optimizer
from singa import initializer
from singa.proto import model_pb2
from singa import utils
from rafiki.agent import Agent, MsgType


class Data(object):

    def __init__(self, file_url, batch_size=32, seq_length=100, train_ratio=0.8):
        '''Data object for loading a plain text file.

        Args:
            file_url, URL of the text file to download.
            batch_size, number of sequences per mini-batch.
            seq_length, number of characters per sequence.
            train_ratio, split the text file into train and test sets, where
                train_ratio of the characters are in the train set.
        '''
        self.raw_data = urllib2.urlopen(file_url).read()
        chars = list(set(self.raw_data))
        self.vocab_size = len(chars)
        self.char_to_idx = {ch: i for i, ch in enumerate(chars)}
        self.idx_to_char = {i: ch for i, ch in enumerate(chars)}
        data = [self.char_to_idx[c] for c in self.raw_data]
        # seq_length + 1 chars per sample: the inputs plus the shifted labels
        nsamples = len(data) / (1 + seq_length)
        data = data[0:nsamples * (1 + seq_length)]
        data = np.asarray(data, dtype=np.int32)
        data = np.reshape(data, (-1, seq_length + 1))
        # shuffle all sequences
        np.random.shuffle(data)
        self.train_dat = data[0:int(data.shape[0] * train_ratio)]
        self.num_train_batch = self.train_dat.shape[0] / batch_size
        self.val_dat = data[self.train_dat.shape[0]:]
        self.num_test_batch = self.val_dat.shape[0] / batch_size
        print 'train dat', self.train_dat.shape
        print 'val dat', self.val_dat.shape
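

# SINGA's LSTM layer takes a Python list with one tensor per time step,
# each of shape (batch, dim); numpy2tensors swaps the batch-major numpy
# arrays to time-major order and slices them along the time axis.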
def numpy2tensors(npx, npy, dev):
    '''batch, seq, dim --> seq, batch, dim'''
    tmpx = np.swapaxes(npx, 0, 1)
    tmpy = np.swapaxes(npy, 0, 1)
    inputs = []
    labels = []
    for t in range(tmpx.shape[0]):
        x = tensor.from_numpy(tmpx[t])
        y = tensor.from_numpy(tmpy[t])
        x.to_device(dev)
        y.to_device(dev)
        inputs.append(x)
        labels.append(y)
    return inputs, labels
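

# The label at step t is the input character of step t + 1; the inputs are
# one-hot encoded to match the (vocab_size,) input_sample_shape that the
# LSTM layer is created with in train().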
def convert(batch, batch_size, seq_length, vocab_size, dev):
    '''convert a batch of data into a sequence of input tensors'''
    y = batch[:, 1:]
    x1 = batch[:, :seq_length]
    x = np.zeros((batch_size, seq_length, vocab_size), dtype=np.float32)
    for b in range(batch_size):
        for t in range(seq_length):
            c = x1[b, t]
            x[b, t, c] = 1
    return numpy2tensors(x, y, dev)
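

# Step decay: start from 1e-3 and halve the learning rate every 50 epochs.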
def get_lr(epoch):
    return 0.001 / float(1 << (epoch / 50))
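

# Poll the rafiki agent for pause/resume/stop commands; while paused, keep
# polling every 0.1s. Returns True iff training should stop.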
def handle_cmd(agent):
    pause = False
    stop = False
    while not stop:
        key, val = agent.pull()
        if key is not None:
            msg_type = MsgType.parse(key)
            if msg_type.is_command():
                if MsgType.kCommandPause.equal(msg_type):
                    agent.push(MsgType.kStatus, "Success")
                    pause = True
                elif MsgType.kCommandResume.equal(msg_type):
                    agent.push(MsgType.kStatus, "Success")
                    pause = False
                elif MsgType.kCommandStop.equal(msg_type):
                    agent.push(MsgType.kStatus, "Success")
                    stop = True
                else:
                    agent.push(MsgType.kStatus, "Warning, unknown message type")
                    print "Unsupported command %s" % str(key)
        if pause and not stop:
            time.sleep(0.1)
        else:
            break
    return stop
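

# train() wires everything together: a (possibly multi-stack) LSTM over
# one-hot inputs, a Dense layer shared across time steps, softmax
# cross-entropy per step, and a manual forward/backward pass over the
# unrolled sequence.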
def train(data, max_epoch, hidden_size=100, seq_length=100, batch_size=16,
          num_stacks=1, dropout=0.5, model_path='model', port=9999):
    # RMSProp, with each gradient's L2 norm constrained to at most 5
    opt = optimizer.RMSProp(constraint=optimizer.L2Constraint(5))
    cuda = device.create_cuda_gpu()
    rnn = layer.LSTM(
        name='lstm',
        hidden_size=hidden_size,
        num_stacks=num_stacks,
        dropout=dropout,
        input_sample_shape=(data.vocab_size,))
    rnn.to_device(cuda)
    rnn_w = rnn.param_values()[0]
    rnn_w.uniform(-0.08, 0.08)  # init all rnn parameters
    dense = layer.Dense(
        'dense',
        data.vocab_size,
        input_sample_shape=(hidden_size,))
    dense.to_device(cuda)
    dense_w = dense.param_values()[0]
    dense_b = dense.param_values()[1]
    print 'dense w ', dense_w.shape
    print 'dense b ', dense_b.shape
    initializer.uniform(dense_w, dense_w.shape[0], 0)
    dense_b.set_value(0)
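
    # the Dense gradients are accumulated over all unrolled time steps,
    # so keep explicit accumulator tensors on the device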
    g_dense_w = tensor.Tensor(dense_w.shape, cuda)
    g_dense_b = tensor.Tensor(dense_b.shape, cuda)

    agent = Agent(port)
    lossfun = loss.SoftmaxCrossEntropy()
    for epoch in range(max_epoch):
        train_loss = 0
        for b in range(data.num_train_batch):
            batch = data.train_dat[b * batch_size: (b + 1) * batch_size]
            inputs, labels = convert(batch, batch_size, seq_length,
                                     data.vocab_size, cuda)
            # two empty tensors stand in for the initial hidden and cell
            # states, which therefore default to zero
            inputs.append(tensor.Tensor())
            inputs.append(tensor.Tensor())

            # the last two outputs are the final hidden and cell states;
            # drop them, keeping one output per time step
            outputs = rnn.forward(model_pb2.kTrain, inputs)[0:-2]
            grads = []
            batch_loss = 0
            g_dense_w.set_value(0.0)
            g_dense_b.set_value(0.0)
            for output, label in zip(outputs, labels):
                act = dense.forward(model_pb2.kTrain, output)
                lvalue = lossfun.forward(model_pb2.kTrain, act, label)
                batch_loss += lvalue.l1()
                grad = lossfun.backward()
                grad /= batch_size
                grad, gwb = dense.backward(model_pb2.kTrain, grad)
                grads.append(grad)
                g_dense_w += gwb[0]
                g_dense_b += gwb[1]
                # print output.l1(), act.l1()
            utils.update_progress(
                b * 1.0 / data.num_train_batch, 'training loss = %f' %
                (batch_loss / seq_length))
            train_loss += batch_loss / seq_length

            # empty placeholder tensors for the gradients of the final states
            grads.append(tensor.Tensor())
            grads.append(tensor.Tensor())
            g_rnn_w = rnn.backward(model_pb2.kTrain, grads)[1][0]
            dense_w, dense_b = dense.param_values()
            opt.apply_with_lr(epoch, get_lr(epoch), g_rnn_w, rnn_w, 'rnnw')
            opt.apply_with_lr(
                epoch, get_lr(epoch),
                g_dense_w, dense_w, 'dense_w')
            opt.apply_with_lr(
                epoch, get_lr(epoch),
                g_dense_b, dense_b, 'dense_b')
            if (b + 1) % 20 == 0:
                # average over the b + 1 batches seen so far
                info = dict(phase='train', step=epoch,
                            loss=train_loss / (b + 1),
                            timestamp=time.time())
                agent.push(MsgType.kInfoMetric, info)
                if handle_cmd(agent):
                    break
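
        # end-of-epoch bookkeeping: report the average training loss, then
        # run the same unrolled forward pass on the held-out split (kEval)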
        avg_loss = train_loss / data.num_train_batch
        print '\nEpoch %d, train loss is %f' % (epoch, avg_loss)
        eval_loss = 0
        for b in range(data.num_test_batch):
            batch = data.val_dat[b * batch_size: (b + 1) * batch_size]
            inputs, labels = convert(batch, batch_size, seq_length,
                                     data.vocab_size, cuda)
            inputs.append(tensor.Tensor())
            inputs.append(tensor.Tensor())
            outputs = rnn.forward(model_pb2.kEval, inputs)[0:-2]
            for output, label in zip(outputs, labels):
                output = dense.forward(model_pb2.kEval, output)
                eval_loss += lossfun.forward(model_pb2.kEval,
                                             output, label).l1()
        avg_loss = eval_loss / data.num_test_batch / seq_length
        print 'Epoch %d, evaluation loss is %f' % (epoch, avg_loss)
        info = dict(phase='test', step=epoch, loss=avg_loss,
                    timestamp=time.time())
        agent.push(MsgType.kInfoMetric, info)
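
        # checkpoint every 30 epochs: copy the parameters back to the host
        # and pickle them together with the vocabulary and the
        # hyper-parameters needed to rebuild the net later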
        if (epoch + 1) % 30 == 0:
            # checkpoint the model
            model_file = '%s_%d.bin' % (model_path, epoch)
            with open(model_file, 'wb') as fd:
                print 'saving model to %s' % model_file
                d = {}
                for name, w in zip(
                        ['rnn_w', 'dense_w', 'dense_b'],
                        [rnn_w, dense_w, dense_b]):
                    w.to_host()
                    d[name] = tensor.to_numpy(w)
                    w.to_device(cuda)
                d['idx_to_char'] = data.idx_to_char
                d['char_to_idx'] = data.char_to_idx
                d['hidden_size'] = hidden_size
                d['num_stacks'] = num_stacks
                d['dropout'] = dropout

                pickle.dump(d, fd)

    agent.stop()
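

# A typical invocation, assuming this script is saved as train.py (the
# hyper-parameter values are illustrative):
#     python train.py -b 32 -l 100 -d 128 -s 2 -m 80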
if __name__ == '__main__':
    try:
        parser = argparse.ArgumentParser(description='Train multi-stack LSTM')
        # parser.add_argument('data', type=str, help='training file')
        parser.add_argument('-b', type=int, default=32, help='batch_size')
        parser.add_argument('-l', type=int, default=64, help='sequence length')
        parser.add_argument('-d', type=int, default=128, help='hidden size')
        parser.add_argument('-s', type=int, default=2, help='num of stacks')
        parser.add_argument('-m', type=int, default=80, help='max num of epochs')
        args = parser.parse_args()
        file_url = 'http://cs.stanford.edu/people/karpathy/char-rnn/linux_input.txt'
        data = Data(file_url, batch_size=args.b, seq_length=args.l)
        train(data, args.m, hidden_size=args.d, num_stacks=args.s,
              seq_length=args.l, batch_size=args.b)
    except SystemExit:
        pass
    except:
        traceback.print_exc()
        sys.stderr.write("for help use --help\n\n")