Created July 18, 2017 16:59
Training char_rnn by Andrej Karpathy on first chapter of Genesis
""" | |
Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy) | |
BSD License | |
""" | |
from __future__ import print_function | |
import numpy as np | |
# data I/O
data = open('input.txt', 'r').read()  # should be simple plain text file
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = {ch: i for i, ch in enumerate(chars)}
ix_to_char = {i: ch for i, ch in enumerate(chars)}
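# For illustration (with a hypothetical input such as "in the beginning"):
# set() ordering is arbitrary, so the mapping might come out as, e.g.,
# char_to_ix == {'i': 0, 'n': 1, ' ': 2, 't': 3, ...}, and ix_to_char is its
# inverse, so ix_to_char[char_to_ix[ch]] == ch always holds.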
# hyperparameters
hidden_size = 100  # size of hidden layer of neurons
seq_length = 25  # number of steps to unroll the RNN for
learning_rate = 1e-1
# model parameters
Wxh = np.random.randn(hidden_size, vocab_size) * 0.01  # input to hidden
Whh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size) * 0.01  # hidden to output
bh = np.zeros((hidden_size, 1))  # hidden bias
by = np.zeros((vocab_size, 1))  # output bias
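# The recurrence these parameters implement, per time step t (see lossFun below):
#   h[t] = tanh(Wxh @ x[t] + Whh @ h[t-1] + bh)   with x[t] one-hot of shape (vocab_size, 1)
#   y[t] = Why @ h[t] + by                        unnormalized log probabilities over the vocabulary
# The small 0.01 init keeps tanh near its linear regime early in training.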
def lossFun(inputs, targets, hprev):
    """
    inputs, targets are both lists of integers.
    hprev is Hx1 array of initial hidden state
    returns the loss, gradients on model parameters, and last hidden state
    """
    xs, hs, ys, ps = {}, {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0
    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size, 1))  # encode in 1-of-k representation
        xs[t][inputs[t]] = 1
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t - 1]) + bh)  # hidden state
        # unnormalized log probabilities for next chars
        ys[t] = np.dot(Why, hs[t]) + by
        # probabilities for next chars
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))
        loss += -np.log(ps[t][targets[t], 0])  # softmax cross-entropy loss (negative log-likelihood of the target char)
    # backward pass: compute gradients going backwards
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(hs[0])
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t])
        # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
        dy[targets[t]] -= 1
        dWhy += np.dot(dy, hs[t].T)
        dby += dy
        dh = np.dot(Why.T, dy) + dhnext  # backprop into h
        dhraw = (1 - hs[t] * hs[t]) * dh  # backprop through tanh nonlinearity
        dbh += dhraw
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t - 1].T)
        dhnext = np.dot(Whh.T, dhraw)
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        # clip to mitigate exploding gradients
        np.clip(dparam, -5, 5, out=dparam)
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs) - 1]
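# Shapes, for reference: inputs and targets are lists of seq_length ints in
# [0, vocab_size); hprev has shape (hidden_size, 1). The returned hidden state
# is fed back in as hprev on the next call, so state carries across chunks of text.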
def sample(h, seed_ix, n):
    """
    sample a sequence of integers from the model
    h is memory state, seed_ix is seed letter for first time step
    """
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1  # one-hot encoding
    ixes = []
    for t in range(n):
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by
        p = np.exp(y) / np.sum(np.exp(y))
        # Original: ix = np.random.choice(range(vocab_size), p=p.ravel())
        # It is not obvious how drawing randomly from the distribution validates
        # the next-letter prediction; taking the max of the softmax probabilities
        # seems more appropriate here, so a greedy argmax is used instead.
        ix = p.argmax()
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes
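# Note: argmax makes generation deterministic, so the same h and seed_ix always
# yield the same string. The commented-out np.random.choice instead draws from
# the full softmax distribution, which gives more varied samples.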
n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by)  # memory variables for Adagrad
smooth_loss = -np.log(1.0 / vocab_size) * seq_length  # loss at iteration 0
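# Why this initial value: an untrained model assigns roughly uniform probability
# 1/vocab_size to each next character, so the expected cross-entropy over one
# seq_length chunk is -log(1/vocab_size) * seq_length.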
# MAIN LOOP
while n < 100000:
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    if p + seq_length + 1 >= len(data) or n == 0:
        hprev = np.zeros((hidden_size, 1))  # reset RNN memory
        p = 0  # go from start of data
    inputs = [char_to_ix[ch] for ch in data[p:p + seq_length]]  # input characters
    # target characters (the input shifted one position to the right)
    targets = [char_to_ix[ch] for ch in data[p + 1:p + seq_length + 1]]
    # sample from the model now and then to see how it is improving
    if n % 100 == 0:
        sample_ix = sample(hprev, inputs[0], 200)
        txt = ''.join(ix_to_char[ix] for ix in sample_ix)
        print('----\n %s \n----' % (txt, ))
    # forward seq_length characters through the net and fetch gradients;
    # hprev is the previous hidden state vector
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    if n % 100 == 0:
        print('iter %d, loss: %f' % (n, smooth_loss))  # print progress
    # perform parameter update with Adagrad
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                  [dWxh, dWhh, dWhy, dbh, dby],
                                  [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update
    p += seq_length  # move data pointer
    n += 1  # iteration counter
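# Adagrad in brief: mem accumulates the squared gradient per parameter, so the
# effective step size learning_rate / sqrt(mem + 1e-8) shrinks for parameters
# that have already received large updates; the 1e-8 guards against division by zero.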
# gradient checking
from random import uniform

def gradCheck(inputs, targets, hprev):
    global Wxh, Whh, Why, bh, by
    num_checks, delta = 10, 1e-5
    _, dWxh, dWhh, dWhy, dbh, dby, _ = lossFun(inputs, targets, hprev)
    for param, dparam, name in zip([Wxh, Whh, Why, bh, by],
                                   [dWxh, dWhh, dWhy, dbh, dby],
                                   ['Wxh', 'Whh', 'Why', 'bh', 'by']):
        s0 = dparam.shape
        s1 = param.shape
        assert s0 == s1, "Error dims don't match: %s and %s." % (s0, s1)
        print(name)
        for i in range(num_checks):
            ri = int(uniform(0, param.size))
            # evaluate cost at [x + delta] and [x - delta]
            old_val = param.flat[ri]
            param.flat[ri] = old_val + delta
            cg0, _, _, _, _, _, _ = lossFun(inputs, targets, hprev)
            param.flat[ri] = old_val - delta
            cg1, _, _, _, _, _, _ = lossFun(inputs, targets, hprev)
            param.flat[ri] = old_val  # reset old value for this parameter
            # fetch both numerical and analytic gradient
            grad_analytic = dparam.flat[ri]
            grad_numerical = (cg0 - cg1) / (2 * delta)
            rel_error = abs(grad_analytic - grad_numerical) / abs(grad_numerical + grad_analytic)
            print('%f, %f => %e ' % (grad_numerical, grad_analytic, rel_error))
            # rel_error should be on order of 1e-7 or less
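# Hypothetical usage (not part of the original gist), reusing the last
# inputs/targets/hprev left over from the training loop above:
# gradCheck(inputs, targets, hprev)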