Simplified rewrite of Karpathy's character-level language model with a vanilla Recurrent Neural Network
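For context, the core of the model below is a single vanilla-RNN recurrence: at each time step the hidden state is updated from the previous hidden state and a one-hot encoding of the current character, and a softmax over the output layer gives a distribution over the next character. A minimal, self-contained sketch of that one step (toy sizes chosen only for illustration; the variable names mirror the gist):

import numpy as np

vocab_size, hidden_size = 5, 8                          # toy sizes, for illustration only
Wi = np.random.randn(hidden_size, vocab_size) * 0.01    # input -> hidden
Wh = np.random.randn(hidden_size, hidden_size) * 0.01   # hidden -> hidden
Wo = np.random.randn(vocab_size, hidden_size) * 0.01    # hidden -> output
bh = np.zeros((hidden_size, 1))                         # hidden bias
by = np.zeros((vocab_size, 1))                          # output bias

x = np.zeros((vocab_size, 1))
x[2] = 1                                 # one-hot encoding of the current character
h = np.zeros((hidden_size, 1))           # previous hidden state
h = np.tanh(Wi @ x + Wh @ h + bh)        # new hidden state
y = Wo @ h + by                          # unnormalized log probabilities
p = np.exp(y) / np.sum(np.exp(y))        # softmax: distribution over the next character

The full gist repeats this step over a window of characters, accumulates cross-entropy loss against the true next character at each step, backpropagates through time, and updates the weights with Adagrad.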
import numpy as np
import random
import unidecode
from bidict import bidict

file = unidecode.unidecode(open("./bin/input.txt").read())
file_len = len(file)
print("file_len =", file_len)
window_size = 30  # number of characters per training chunk

def random_chunk():
    start_index = random.randint(0, file_len - window_size)
    end_index = start_index + window_size + 1
    return file[start_index:end_index]
sm = lambda y: np.exp(y) / np.sum(np.exp(y))  # softmax
sm_loss = lambda v: -np.log(v)  # negative log-likelihood of the target probability
def sample_output(h, seed_ix, n, Wi, Wh, Wo, bh, by):
    """
    sample a sequence of integers from the model
    h is memory state, seed_ix is seed letter for first time step
    """
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    for t in range(n):
        h = np.tanh(Wi @ x + Wh @ h + bh)
        y = Wo @ h + by
        p = sm(y)
        ix = np.random.choice(range(vocab_size), p=p.ravel())
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes
chars = set(file)
vocab_size = len(chars)
chars_map = bidict()  # two-way mapping between characters and indices
for i, v in enumerate(chars):
    chars_map[v] = i
def train(iterations):
    hidden_size = 100
    learning_rate = 1e-1
    Wi = np.random.randn(hidden_size, vocab_size) * 0.01   # input to hidden
    Wo = np.random.randn(vocab_size, hidden_size) * 0.01   # hidden to output
    Wh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden to hidden
    bh = np.zeros((hidden_size, 1))  # hidden bias
    by = np.zeros((vocab_size, 1))  # output bias
    # Adagrad memory variables
    mWxh, mWhh, mWhy = np.zeros_like(Wi), np.zeros_like(Wh), np.zeros_like(Wo)
    mbh, mby = np.zeros_like(bh), np.zeros_like(by)
    smooth_loss = -np.log(1.0 / vocab_size) * window_size  # loss at iteration 0
    hprev = np.zeros((hidden_size, 1))  # initial hidden state
    def forward(inputs, targets):
        """
        Args:
            inputs, targets: both List[int]
        Returns:
            sequence activations (xs, hs, ys, ps), loss
        """
        xs, hs, ys, ps = {}, {}, {}, {}
        hs[-1] = np.copy(hprev)
        loss = 0
        for t, inp in enumerate(inputs):
            xs[t] = np.zeros((vocab_size, 1))  # encode in 1-of-k representation
            xs[t][inp] = 1
            hs[t] = np.tanh(Wi @ xs[t] + Wh @ hs[t - 1] + bh)  # hidden state
            ys[t] = Wo @ hs[t] + by  # unnormalized log probabilities for next chars
            ps[t] = sm(ys[t])  # probabilities for next chars
            loss += sm_loss(ps[t][targets[t], 0])  # cross-entropy loss
        return (xs, hs, ys, ps), loss
    def backprop(xs, hs, ys, ps):
        # backward pass: compute gradients going backwards through the sequence
        dWxh, dWhh, dWhy = np.zeros_like(Wi), np.zeros_like(Wh), np.zeros_like(Wo)
        dbh, dby = np.zeros_like(bh), np.zeros_like(by)
        dhnext = np.zeros_like(hs[0])
        ii = len(xs) - 1  # index of the last time step in this chunk
        for t in reversed(range(ii + 1)):
            dy = np.copy(ps[t])
            dy[targets[t]] -= 1  # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
            dWhy += dy @ hs[t].T
            dby += dy
            dh = Wo.T @ dy + dhnext  # backprop into h
            dhraw = (1 - hs[t] * hs[t]) * dh  # backprop through tanh nonlinearity
            dbh += dhraw
            dWxh += dhraw @ xs[t].T
            dWhh += dhraw @ hs[t - 1].T
            dhnext = Wh.T @ dhraw
        for dparam in (dWxh, dWhh, dWhy, dbh, dby):
            np.clip(dparam, -5, 5, out=dparam)  # clip to mitigate exploding gradients
        return loss, dWxh, dWhh, dWhy, dbh, dby, hs[ii]
    for i in range(iterations):
        sample = random_chunk()
        inputs = [chars_map[ch] for ch in sample[:-1]]
        targets = [chars_map[ch] for ch in sample[1:]]
        (xs, hs, ys, ps), loss = forward(inputs, targets)
        loss, dWxh, dWhh, dWhy, dbh, dby, hprev = backprop(xs, hs, ys, ps)
        smooth_loss = smooth_loss * 0.999 + loss * 0.001
        if not i % 1000:
            sample_ix = sample_output(hprev, inputs[0], 40, Wi, Wh, Wo, bh, by)
            txt = ''.join(chars_map.inverse[ix] for ix in sample_ix)
            print(f'----\n {txt} \n----')
            print(f"iter {i}, loss: {smooth_loss}")  # print progress
        # perform parameter update with Adagrad
        for param, dparam, mem in zip([Wi, Wh, Wo, bh, by],
                                      [dWxh, dWhh, dWhy, dbh, dby],
                                      [mWxh, mWhh, mWhy, mbh, mby]):
            mem += dparam * dparam
            param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update
iterations = 10000000
train(iterations)