
Train Char-RNN over Text

Recurrent neural networks (RNNs) are widely used for modelling sequential data, e.g., natural language sentences. This example describes how to implement an RNN application (or model) using SINGA's RNN layers. We use the char-rnn model as an example, which trains over sentences or source code, with each character as an input unit. In particular, we train a multi-stack LSTM over Linux kernel source code. After training, we expect the model to generate meaningful (source-code-like) text.
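As a minimal illustration of the data representation (plain NumPy with a toy vocabulary and made-up probabilities; SINGA is not involved here), each character is one-hot encoded against the vocabulary, and the next character is either sampled from the network's softmax output or picked greedily, which is exactly what sample.py below does with np.random.choice and np.argmax:

import numpy as np

vocab = ['a', 'b', 'c', 'd']  # toy vocabulary
char_to_idx = {c: i for i, c in enumerate(vocab)}

# one-hot encode one input character
x = np.zeros((1, len(vocab)), dtype=np.float32)
x[0, char_to_idx['b']] = 1

# pretend the network produced this softmax distribution
prob = np.array([0.1, 0.2, 0.6, 0.1])
next_char = vocab[np.random.choice(len(vocab), p=prob)]  # stochastic sampling
# or greedily: vocab[np.argmax(prob)]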

SINGA version

Apache SINGA-beb5e72

Folder layout

  • README.md This file.
  • train.py The training code.
  • sample.py The serving code which predicts the next 1000 characters given a seeding string.
  • index.html The webpage for displaying the prediction results and getting user's seeding string.

Instructions

Local mode

Currently the RNN implementation depends on cuDNN version >= 5.05. If you get 'Error: status == CUDNN_STATUS_SUCCESS (8 vs. 0) CUDNN_STATUS_EXECUTION_FAILED', please restrict execution to a single GPU card by exporting,

export CUDA_VISIBLE_DEVICES=0

Training

python train.py

Some hyper-parameters can be set through the command line; to list them, run

python train.py -h
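
For example, to train with batch size 64, sequence length 100, hidden size 256, 2 stacked layers, and at most 50 epochs (these flags are defined in train.py, shown below),

python train.py -b 64 -l 100 -d 256 -s 2 -m 50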

Serving

Sample 1000 characters from the model by providing a seeding string.

python sample.py &
curl -i -F image=@'#incl' http://localhost:9999/api
curl -i -F image=@'int i' http://localhost:9999/api

To see other command line arguments, please run

python sample.py -h
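
Equivalently, the agent can be queried from Python. The sketch below simply mirrors the curl commands above, posting the seed string as the same multipart form field; the use of the third-party requests library and the inline field value (rather than a file) are illustrative assumptions, not part of this example:

import requests  # assumption: third-party HTTP client (pip install requests)

# mirrors: curl -i -F image=@'#incl' http://localhost:9999/api
resp = requests.post('http://localhost:9999/api',
                     files={'image': ('seed.txt', '#incl')})
print resp.text  # the seed string extended by the sampled characters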
sample.py

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
'''Sample characters from the pre-trained model'''
import sys
import cPickle as pickle
import numpy as np
import argparse
import urllib2
import traceback
import time

from singa import layer
from singa import tensor
from singa import device
from singa.proto import model_pb2
from rafiki.agent import Agent, MsgType


def sample(model_url, nsamples, seed_text, do_sample=True, port=9999):
    # load the pre-trained parameters and the vocabulary mappings
    d = pickle.load(urllib2.urlopen(model_url))
    rnn_w = tensor.from_numpy(d['rnn_w'])
    idx_to_char = d['idx_to_char']
    char_to_idx = d['char_to_idx']
    vocab_size = len(idx_to_char)
    dense_w = tensor.from_numpy(d['dense_w'].T)
    dense_b = tensor.from_numpy(d['dense_b'])
    hidden_size = d['hidden_size']
    num_stacks = d['num_stacks']
    dropout = d['dropout']

    # rebuild the model with the same structure as used for training
    cuda = device.create_cuda_gpu()
    rnn = layer.LSTM(name='lstm', hidden_size=hidden_size,
                     num_stacks=num_stacks, dropout=dropout,
                     input_sample_shape=(len(idx_to_char),))
    rnn.to_device(cuda)
    rnn.param_values()[0].copy_data(rnn_w)
    dense = layer.Dense('dense', vocab_size, input_sample_shape=(hidden_size,))
    dense.to_device(cuda)
    dense.param_values()[0].copy_data(dense_w)
    dense.param_values()[1].copy_data(dense_b)

    # zero-initialized hidden and cell states
    hx = tensor.Tensor((num_stacks, 1, hidden_size), cuda)
    cx = tensor.Tensor((num_stacks, 1, hidden_size), cuda)
    hx.set_value(0.0)
    cx.set_value(0.0)

    agent = Agent(port=port)
    while True:
        key, val = agent.pull()
        if key is None:
            time.sleep(0.1)
            continue
        msg_type = MsgType.parse(key)
        if msg_type.is_request():
            try:
                response = seed_text
                if len(seed_text) > 0:
                    # warm up the RNN states by feeding the seed text
                    for c in seed_text:
                        x = np.zeros((1, vocab_size), dtype=np.float32)
                        x[0, char_to_idx[c]] = 1
                        tx = tensor.from_numpy(x)
                        tx.to_device(cuda)
                        inputs = [tx, hx, cx]
                        outputs = rnn.forward(model_pb2.kEval, inputs)
                        y = dense.forward(model_pb2.kEval, outputs[0])
                        y = tensor.softmax(y)
                        hx = outputs[1]
                        cx = outputs[2]
                else:
                    # no seed: start from a uniform distribution
                    y = tensor.Tensor((1, vocab_size), cuda)
                    y.set_value(1.0 / vocab_size)
                for i in range(nsamples):
                    y.to_host()
                    prob = tensor.to_numpy(y)[0]
                    if do_sample:
                        # sample the next char from the output distribution
                        cur = np.random.choice(vocab_size, 1, p=prob)[0]
                    else:
                        # greedy decoding: pick the most likely char
                        cur = np.argmax(prob)
                    response += idx_to_char[cur]
                    # feed the sampled char back as the next input
                    x = np.zeros((1, vocab_size), dtype=np.float32)
                    x[0, cur] = 1
                    tx = tensor.from_numpy(x)
                    tx.to_device(cuda)
                    inputs = [tx, hx, cx]
                    outputs = rnn.forward(model_pb2.kEval, inputs)
                    y = dense.forward(model_pb2.kEval, outputs[0])
                    y = tensor.softmax(y)
                    hx = outputs[1]
                    cx = outputs[2]
                agent.push(MsgType.kResponse, response)
            except Exception:
                traceback.print_exc()
        elif MsgType.kCommandStop.equal(msg_type):
            print 'Got stop command'
            agent.push(MsgType.kStatus, 'Success')
            break
        else:
            print 'Got unsupported message %s' % str(msg_type)
            agent.push(MsgType.kStatus, 'Unknown command')
            break
    # end of the serving loop
    print 'server stop'


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='sample chars from char-rnn')
    parser.add_argument('seed', nargs='?', default='',
                        help='seed text string which warms up the rnn states '
                        'for sampling')
    parser.add_argument('-n', type=int, default=1000,
                        help='num of characters to sample')
    args = parser.parse_args()
    assert args.n > 0, 'n must be > 0'
    model_url = 'http://www.comp.nus.edu.sg/~wangwei/assets/file/char-rnn.bin'
    sample(model_url, args.n, seed_text=args.seed)

train.py

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =============================================================================
import cPickle as pickle
import numpy as np
import argparse
import urllib2
import traceback
import time
import sys
from singa import layer
from singa import loss
from singa import device
from singa import tensor
from singa import optimizer
from singa import initializer
from singa.proto import model_pb2
from singa import utils
from rafiki.agent import Agent, MsgType

class Data(object):

    def __init__(self, file_url, batch_size=32, seq_length=100,
                 train_ratio=0.8):
        '''Data object for loading a plain text file.

        Args:
            file_url, url of the text file.
            train_ratio, split the text into train and test sets, where
                train_ratio of the characters go into the train set.
        '''
        self.raw_data = urllib2.urlopen(file_url).read()
        chars = list(set(self.raw_data))
        self.vocab_size = len(chars)
        self.char_to_idx = {ch: i for i, ch in enumerate(chars)}
        self.idx_to_char = {i: ch for i, ch in enumerate(chars)}
        data = [self.char_to_idx[c] for c in self.raw_data]
        # seq_length + 1 chars per sample: seq_length inputs + shifted labels
        nsamples = len(data) / (1 + seq_length)
        data = data[0:nsamples * (1 + seq_length)]
        data = np.asarray(data, dtype=np.int32)
        data = np.reshape(data, (-1, seq_length + 1))
        # shuffle all sequences
        np.random.shuffle(data)
        self.train_dat = data[0:int(data.shape[0] * train_ratio)]
        self.num_train_batch = self.train_dat.shape[0] / batch_size
        self.val_dat = data[self.train_dat.shape[0]:]
        self.num_test_batch = self.val_dat.shape[0] / batch_size
        print 'train dat', self.train_dat.shape
        print 'val dat', self.val_dat.shape


def numpy2tensors(npx, npy, dev):
    '''(batch, seq, dim) --> (seq, batch, dim)'''
    tmpx = np.swapaxes(npx, 0, 1)
    tmpy = np.swapaxes(npy, 0, 1)
    inputs = []
    labels = []
    for t in range(tmpx.shape[0]):
        x = tensor.from_numpy(tmpx[t])
        y = tensor.from_numpy(tmpy[t])
        x.to_device(dev)
        y.to_device(dev)
        inputs.append(x)
        labels.append(y)
    return inputs, labels


def convert(batch, batch_size, seq_length, vocab_size, dev):
    '''convert a batch of data into a sequence of one-hot input tensors'''
    y = batch[:, 1:]
    x1 = batch[:, :seq_length]
    x = np.zeros((batch_size, seq_length, vocab_size), dtype=np.float32)
    for b in range(batch_size):
        for t in range(seq_length):
            c = x1[b, t]
            x[b, t, c] = 1
    return numpy2tensors(x, y, dev)


def get_lr(epoch):
    # halve the learning rate every 50 epochs, starting from 0.001
    return 0.001 / float(1 << (epoch / 50))


def handle_cmd(agent):
    pause = False
    stop = False
    while not stop:
        key, val = agent.pull()
        if key is not None:
            msg_type = MsgType.parse(key)
            if msg_type.is_command():
                if MsgType.kCommandPause.equal(msg_type):
                    agent.push(MsgType.kStatus, 'Success')
                    pause = True
                elif MsgType.kCommandResume.equal(msg_type):
                    agent.push(MsgType.kStatus, 'Success')
                    pause = False
                elif MsgType.kCommandStop.equal(msg_type):
                    agent.push(MsgType.kStatus, 'Success')
                    stop = True
                else:
                    agent.push(MsgType.kStatus, 'Warning, unknown message type')
                    print 'Unsupported command %s' % str(msg_type)
        if pause and not stop:
            # paused: keep polling until a resume or stop command arrives
            time.sleep(0.1)
        else:
            break
    return stop


def train(data, max_epoch, hidden_size=100, seq_length=100, batch_size=16,
          num_stacks=1, dropout=0.5, model_path='model', port=9999):
    # RMSProp with an L2-norm constraint on the gradients (clipping)
    opt = optimizer.RMSProp(constraint=optimizer.L2Constraint(5))
    cuda = device.create_cuda_gpu()
    rnn = layer.LSTM(name='lstm', hidden_size=hidden_size,
                     num_stacks=num_stacks, dropout=dropout,
                     input_sample_shape=(data.vocab_size,))
    rnn.to_device(cuda)
    rnn_w = rnn.param_values()[0]
    rnn_w.uniform(-0.08, 0.08)  # init all rnn parameters

    dense = layer.Dense('dense', data.vocab_size,
                        input_sample_shape=(hidden_size,))
    dense.to_device(cuda)
    dense_w = dense.param_values()[0]
    dense_b = dense.param_values()[1]
    print 'dense w ', dense_w.shape
    print 'dense b ', dense_b.shape
    initializer.uniform(dense_w, dense_w.shape[0], 0)
    dense_b.set_value(0)

    g_dense_w = tensor.Tensor(dense_w.shape, cuda)
    g_dense_b = tensor.Tensor(dense_b.shape, cuda)

    agent = Agent(port)
    lossfun = loss.SoftmaxCrossEntropy()
    for epoch in range(max_epoch):
        train_loss = 0
        for b in range(data.num_train_batch):
            batch = data.train_dat[b * batch_size: (b + 1) * batch_size]
            inputs, labels = convert(batch, batch_size, seq_length,
                                     data.vocab_size, cuda)
            # two dummy tensors for the initial hidden states (hx, cx)
            inputs.append(tensor.Tensor())
            inputs.append(tensor.Tensor())

            # drop the last two outputs, which are the final states (hy, cy)
            outputs = rnn.forward(model_pb2.kTrain, inputs)[0:-2]
            grads = []
            batch_loss = 0
            g_dense_w.set_value(0.0)
            g_dense_b.set_value(0.0)
            for output, label in zip(outputs, labels):
                act = dense.forward(model_pb2.kTrain, output)
                lvalue = lossfun.forward(model_pb2.kTrain, act, label)
                batch_loss += lvalue.l1()
                grad = lossfun.backward()
                grad /= batch_size
                grad, gwb = dense.backward(model_pb2.kTrain, grad)
                grads.append(grad)
                # accumulate the dense layer gradients over all time steps
                g_dense_w += gwb[0]
                g_dense_b += gwb[1]
                # print output.l1(), act.l1()
            utils.update_progress(
                b * 1.0 / data.num_train_batch, 'training loss = %f' %
                (batch_loss / seq_length))
            train_loss += batch_loss / seq_length

            # two dummy gradients for the final states (hy, cy)
            grads.append(tensor.Tensor())
            grads.append(tensor.Tensor())
            g_rnn_w = rnn.backward(model_pb2.kTrain, grads)[1][0]
            dense_w, dense_b = dense.param_values()
            opt.apply_with_lr(epoch, get_lr(epoch), g_rnn_w, rnn_w, 'rnnw')
            opt.apply_with_lr(epoch, get_lr(epoch), g_dense_w, dense_w,
                              'dense_w')
            opt.apply_with_lr(epoch, get_lr(epoch), g_dense_b, dense_b,
                              'dense_b')
            if (b + 1) % 20 == 0:
                info = dict(phase='train', step=epoch, loss=train_loss / b,
                            timestamp=time.time())
                agent.push(MsgType.kInfoMetric, info)
                if handle_cmd(agent):
                    break
        avg_loss = train_loss / data.num_train_batch
        print '\nEpoch %d, train loss is %f' % (epoch, avg_loss)

        eval_loss = 0
        for b in range(data.num_test_batch):
            batch = data.val_dat[b * batch_size: (b + 1) * batch_size]
            inputs, labels = convert(batch, batch_size, seq_length,
                                     data.vocab_size, cuda)
            inputs.append(tensor.Tensor())
            inputs.append(tensor.Tensor())
            outputs = rnn.forward(model_pb2.kEval, inputs)[0:-2]
            for output, label in zip(outputs, labels):
                output = dense.forward(model_pb2.kEval, output)
                eval_loss += lossfun.forward(model_pb2.kEval,
                                             output, label).l1()
        avg_loss = eval_loss / data.num_test_batch / seq_length
        print 'Epoch %d, evaluation loss is %f' % (epoch, avg_loss)
        info = dict(phase='test', step=epoch, loss=avg_loss,
                    timestamp=time.time())
        agent.push(MsgType.kInfoMetric, info)

        if (epoch + 1) % 30 == 0:
            # checkpoint the model every 30 epochs
            with open('%s_%d.bin' % (model_path, epoch), 'wb') as fd:
                print 'saving model to %s' % model_path
                d = {}
                for name, w in zip(['rnn_w', 'dense_w', 'dense_b'],
                                   [rnn_w, dense_w, dense_b]):
                    w.to_host()
                    d[name] = tensor.to_numpy(w)
                    w.to_device(cuda)
                d['idx_to_char'] = data.idx_to_char
                d['char_to_idx'] = data.char_to_idx
                d['hidden_size'] = hidden_size
                d['num_stacks'] = num_stacks
                d['dropout'] = dropout
                pickle.dump(d, fd)
    agent.stop()


if __name__ == '__main__':
    try:
        parser = argparse.ArgumentParser(description='Train multi-stack LSTM')
        # parser.add_argument('data', type=str, help='training file')
        parser.add_argument('-b', type=int, default=32, help='batch size')
        parser.add_argument('-l', type=int, default=64, help='sequence length')
        parser.add_argument('-d', type=int, default=128, help='hidden size')
        parser.add_argument('-s', type=int, default=2, help='num of stacks')
        parser.add_argument('-m', type=int, default=80, help='max num of epochs')
        args = parser.parse_args()
        file_url = 'http://cs.stanford.edu/people/karpathy/char-rnn/linux_input.txt'
        data = Data(file_url, batch_size=args.b, seq_length=args.l)
        train(data, args.m, hidden_size=args.d, num_stacks=args.s,
              seq_length=args.l, batch_size=args.b)
    except SystemExit:
        pass
    except:
        traceback.print_exc()
        sys.stderr.write(" for help use --help \n\n")