# Last active October 11, 2016 04:24
import sys

import numpy as np
import scipy.io as sio
def sample(hprev, xt, n):
    """Prime the RNN with the indices in `xt`, then sample `n` more indices.

    Reads the module-level model parameters `Wxh`, `Whh`, `Why`, `bh`, `by`
    and `vocab_size`.

    Args:
        hprev: initial hidden state (the caller in this file passes a
            (hidden_size, 1) array of zeros).
        xt: non-empty sequence of character indices used as the prompt.
        n: number of character indices to generate after the prompt.

    Returns:
        A list of `n` sampled indices. The first element is the character
        predicted right after the last prompt character, so the result is a
        contiguous continuation of `xt`.

    Raises:
        ValueError: if `xt` is empty (there is nothing to condition on).
    """
    xt = list(xt)
    if not xt:
        raise ValueError('xt must contain at least one index to prime the model')

    # Priming: run the prompt through the recurrence. Only the final hidden
    # state matters, so no output distribution is computed here.
    h = hprev
    for index in xt:
        x = np.zeros((vocab_size, 1))
        x[index] = 1
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)

    generated_seq = []
    for _ in range(n):
        y = np.dot(Why, h) + by
        # Shift by the max before exponentiating so large logits cannot
        # overflow; the resulting probabilities are mathematically unchanged.
        e = np.exp(y - np.max(y))
        p = e / np.sum(e)
        ix = np.random.choice(vocab_size, p=p.ravel())
        generated_seq.append(ix)
        # Feed the sampled character back in as the next input.
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    return generated_seq
def generate_h(xt):
    """Return the hidden state after feeding the indices in `xt` to the RNN.

    Starts from an all-zero hidden state of shape (Whh.shape[0], 1) and
    applies the recurrence once per index. Reads the module-level `Wxh`,
    `Whh`, `bh` and `vocab_size`.

    Args:
        xt: iterable of character indices.

    Returns:
        The final hidden state; the zero state if `xt` is empty.
    """
    hidden_size = Whh.shape[0]
    h = np.zeros((hidden_size, 1))
    for index in xt:
        # one-hot encode the current character
        x = np.zeros((vocab_size, 1))
        x[index] = 1
        # The output distribution and a sampled index used to be computed
        # here but were never used, so only the state update is kept.
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    return h
if __name__ == '__main__':
    # Interactive sampler: load trained weights from the .mat file given as
    # the first command-line argument and extend prompts typed by the user.
    if len(sys.argv) < 2:
        sys.exit('usage: %s <model.mat>' % sys.argv[0])

    dataset = 'lstm_from_wiki.txt'
    with open('data/%s' % dataset, 'r') as f:
        data = f.read()

    # NOTE: indices follow the iteration order of set(data). They only match
    # the loaded weights if that order is the same as in the run that trained
    # them -- verify when changing interpreter or hash settings.
    chars = list(set(data))
    print('%d unique characters in data.' % (len(chars)))
    vocab_size, data_size = len(chars), len(data)
    char_to_ix = {ch: i for i, ch in enumerate(chars)}
    ix_to_char = {i: ch for i, ch in enumerate(chars)}

    # nb. of sequence to generate from model
    seq_length_sample = 2048

    model = sio.loadmat(sys.argv[1])
    p, _, Wxh, Whh, Why, bh, by = model['p'], model['hprev'], model['Wxh'], model['Whh'], model['Why'], model['bh'], model['by']
    hidden_size = Whh.shape[0]

    # raw_input only exists on Python 2; input is its Python 3 equivalent.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    while True:
        question = read_line("Inital seq.: ")
        print("gen-seq.: ")

        # Characters that never occurred in the corpus have no index; skip
        # them instead of aborting the session with a KeyError.
        unknown = sorted(set(question) - set(char_to_ix))
        if unknown:
            print('Skipping characters not in the vocabulary: %r' % (unknown,))
        inputs = [char_to_ix[ch] for ch in question if ch in char_to_ix]
        if not inputs:
            # nothing usable to prime the model with; ask again
            continue

        # initial state
        hprev = np.zeros((hidden_size, 1))
        sample_ix = sample(hprev, inputs, seq_length_sample)
        txt = ''.join(ix_to_char[ix] for ix in inputs) + ''.join(ix_to_char[ix] for ix in sample_ix)
        print('----\n %s \n----' % txt)
import numpy as np
import sys
import matplotlib.pyplot as plt
import time
import cPickle
import scipy.io as sio
def lossFun(inputs, targets, hprev):
    """Run one forward/backward pass of the character RNN over a sequence.

    inputs,targets are both list of integers.
    hprev is Hx1 array of initial hidden state
    returns the loss, gradients on model parameters, and last hidden state

    Reads the module-level parameters `Wxh`, `Whh`, `Why`, `bh`, `by` and
    `vocab_size`.

    Returns:
        Tuple (loss, pplx, dWxh, dWhh, dWhy, dbh, dby, h_last): the summed
        negative log-likelihood, the perplexity exp(loss / len(inputs)), the
        gradients (each clipped element-wise to [-1, 1]) and the hidden state
        after the last input.
    """
    xs, hs, ps = {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0

    # forward pass
    for t in range(len(inputs)):
        # encode in 1-of-k representation
        xs[t] = np.zeros((vocab_size, 1))
        xs[t][inputs[t]] = 1
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh)
        y = np.dot(Why, hs[t]) + by
        # softmax, shifted by the max logit so np.exp cannot overflow
        e = np.exp(y - np.max(y))
        ps[t] = e / np.sum(e)
        # negative log-likelihood loss
        loss += -np.log(ps[t][targets[t], 0])

    # perplexity
    pplx = np.exp(loss / len(inputs))

    # backward pass: compute gradients going backwards
    # memory variables for derivatives
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(hs[0])
    for t in reversed(range(len(inputs))):
        # backprop. into y: d(loss)/dy = p - onehot(target)
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1
        # grad. w.r.t. Why and by
        dWhy += np.dot(dy, hs[t].T)
        dby += dy
        # backprop into h: dh = Why^T * dy_t + dh_{t+1}
        dh = np.dot(Why.T, dy) + dhnext
        # backprop through tanh nonlinearity (f'(z) = 1 - tanh^2(z))
        dhraw = (1 - hs[t] * hs[t]) * dh
        dbh += dhraw
        # grad. w.r.t. Wxh, Whh, then pass the gradient on to h_{t-1}
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T)
        dhnext = np.dot(Whh.T, dhraw)

    # clip (in place) to mitigate exploding gradients
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -1, 1, out=dparam)
    return loss, pplx, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]
def sample(h, seed_ix, n):
    """Sample a sequence of `n` character indices from the model.

    Reads the module-level parameters `Wxh`, `Whh`, `Why`, `bh`, `by` and
    `vocab_size`.

    Args:
        h: hidden state to start from.
        seed_ix: index of the character fed in at the first time step.
        n: number of indices to sample.

    Returns:
        A list with the `n` sampled indices (the seed itself is not included).
    """
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    generated_seq = []
    for _ in range(n):
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by
        # softmax, shifted by the max logit so np.exp cannot overflow
        e = np.exp(y - np.max(y))
        p = e / np.sum(e)
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        # the sampled character becomes the next input
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        generated_seq.append(ix)
    return generated_seq
# gradient checking
from random import uniform
def gradCheck(inputs, target, hprev):
    """Compare analytic gradients from lossFun with central differences.

    For each model parameter, `num_checks` randomly chosen entries are
    perturbed by +/- delta and the resulting numerical gradient is printed
    next to the analytic one together with their relative error.

    Args:
        inputs: list of input character indices.
        target: list of target character indices (same length as inputs).
        hprev: initial hidden state passed through to lossFun.

    NOTE: lossFun clips its gradients to [-1, 1], so entries whose true
    gradient lies outside that range will show a large relative error.
    """
    global Wxh, Whh, Why, bh, by
    num_checks, delta = 10, 1e-5
    # Use the `target` argument; the body previously read the global
    # `targets` and silently ignored what the caller passed in.
    _, _, dWxh, dWhh, dWhy, dbh, dby, _ = lossFun(inputs, target, hprev)
    for param, dparam, name in zip([Wxh, Whh, Why, bh, by], [dWxh, dWhh, dWhy, dbh, dby], ['Wxh', 'Whh', 'Why', 'bh', 'by']):
        s0 = dparam.shape
        s1 = param.shape
        assert s0 == s1, 'Error dims dont match: %s and %s.' % (repr(s0), repr(s1))
        print(name)
        for i in range(num_checks):
            ri = int(uniform(0, param.size))
            # evaluate cost at [x + delta] and [x - delta]
            old_val = param.flat[ri]
            param.flat[ri] = old_val + delta
            cg0 = lossFun(inputs, target, hprev)[0]
            param.flat[ri] = old_val - delta
            cg1 = lossFun(inputs, target, hprev)[0]
            param.flat[ri] = old_val  # reset old value for this parameter
            # fetch both numerical and analytic gradient
            grad_analytic = dparam.flat[ri]
            grad_numerical = (cg0 - cg1) / (2 * delta)
            denom = abs(grad_numerical + grad_analytic)
            # Both gradients can cancel or be exactly zero (e.g. weights of
            # characters absent from `inputs`); avoid dividing by zero then.
            if denom == 0:
                rel_error = 0.0 if grad_analytic == grad_numerical else float('inf')
            else:
                rel_error = abs(grad_analytic - grad_numerical) / denom
            print('%f, %f => %e ' % (grad_numerical, grad_analytic, rel_error))
            # rel_error should be on order of 1e-7 or less
if __name__ == '__main__':
    # Training script: fits the character RNN on a plain text file with
    # Adagrad, printing samples and dumping the weights every `sample_freq`
    # iterations. Runs until interrupted.

    # data should be simple plain text file
    # (alternative corpus used previously: 'data/linux/linux.txt')
    with open('data/lstm_from_wiki.txt', 'r') as f:
        data = f.read()
    chars = list(set(data))
    print('%d unique characters in data.' % (len(chars)))
    vocab_size, data_size = len(chars), len(data)

    # compute vocab.
    char_to_ix = {ch: i for i, ch in enumerate(chars)}
    ix_to_char = {i: ch for i, ch in enumerate(chars)}

    # hyperparameters
    # size of hidden layer of neurons
    hidden_size = 256
    # number of steps to unroll the RNN for
    seq_length = 96
    # nb. of sequence to generate from model
    seq_length_sample = 128
    # learning params.
    learning_rate = 0.01
    # only needed by the RMSprop variant of the update (not used below)
    decay_rate = 0.5

    # Each step needs seq_length inputs plus one extra character for the
    # shifted targets; fail early with a clear message otherwise.
    if data_size < seq_length + 1:
        raise ValueError('data has %d characters, need at least %d' % (data_size, seq_length + 1))

    # model parameters
    Wxh = np.random.randn(hidden_size, vocab_size).astype(np.float32) * 0.01
    Whh = np.random.randn(hidden_size, hidden_size).astype(np.float32) * 0.01
    Why = np.random.randn(vocab_size, hidden_size).astype(np.float32) * 0.01
    bh = np.zeros((hidden_size, 1)).astype(np.float32)
    by = np.zeros((vocab_size, 1)).astype(np.float32)

    # memory variables for Adagrad
    mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    mbh, mby = np.zeros_like(bh), np.zeros_like(by)

    n, p, cum_p = 0, 0, 0
    sample_freq = 400

    # compute loss (loss at iter. 0)
    smooth_loss = -np.log(1.0 / vocab_size) * seq_length

    # start to train
    while True:
        # prepare inputs (we're sweeping from left to right in steps seq_length long)
        if p + seq_length + 1 >= len(data) or n == 0:
            # reset RNN memory
            hprev = np.zeros((hidden_size, 1))
            p = 0  # go from start of data
        inputs = [char_to_ix[ch] for ch in data[p:p + seq_length]]
        targets = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]

        # sample from the model now and then
        if n % sample_freq == 0:
            sample_ix = sample(hprev, inputs[0], seq_length_sample)
            txt = ix_to_char[inputs[0]] + ''.join(ix_to_char[ix] for ix in sample_ix)
            print('----\n %s \n----' % txt)

        #gradCheck(inputs, targets, hprev)

        # forward seq_length characters through the net and fetch gradient
        loss, pplx, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
        smooth_loss = smooth_loss * 0.999 + loss * 0.001

        # perform parameter update
        for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], [dWxh, dWhh, dWhy, dbh, dby], [mWxh, mWhh, mWhy, mbh, mby]):
            # Adagrad: accumulate squared gradients in place. (The RMSprop
            # alternative would instead keep a running average
            # decay_rate * mem + (1 - decay_rate) * dparam**2.)
            mem += dparam * dparam
            param += -learning_rate * dparam / np.sqrt(mem + 1e-8)

        # print learning and dump the current model
        if n % sample_freq == 0:
            print('epoch: %d(%d/%d), iter %d, loss: %f, smooth_loss: %f, pplx: %f' % (cum_p // data_size, cum_p, data_size, n, loss, smooth_loss, pplx))
            dump_filename = 'min_char_rnn_iter%08d_loss%.4f.mat' % (n, smooth_loss)
            sio.savemat(dump_filename, {'p': p, 'hprev': hprev, 'Wxh': Wxh, 'Whh': Whh, 'Why': Why, 'bh': bh, 'by': by, 'mWxh': mWxh, 'mWhh': mWhh, 'mWhy': mWhy, 'mbh': mbh, 'mby': mby})
            print('Dump: %s' % dump_filename)

        p += seq_length  # move data pointer
        cum_p += seq_length  # cumulate
        n += 1  # iteration counter
