Skip to content

Instantly share code, notes, and snippets.

@petulla
Last active August 29, 2020 01:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save petulla/dc4b68046e170eaaa1db9ee247f02fd6 to your computer and use it in GitHub Desktop.
Save petulla/dc4b68046e170eaaa1db9ee247f02fd6 to your computer and use it in GitHub Desktop.
Simplified rewrite of Karpathy character-level language model with a Vanilla Recurrent Neural Network
import numpy as np
import random
import unidecode
from bidict import bidict
# Load the training corpus and transliterate it to plain ASCII so every
# character fits in a small vocabulary.
# Fix: use a context manager so the file handle is closed deterministically
# (the original `open(...).read()` leaked the handle), and pin the text
# encoding instead of relying on the platform default.
with open("./bin/input.txt", encoding="utf-8") as corpus:
    file = unidecode.unidecode(corpus.read())
file_len = len(file)
print("file_len =", file_len)
def random_chunk(window_size=30):
    """Return a random slice of the corpus of length window_size + 1.

    The extra character lets the caller form (input, target) pairs by
    offsetting the chunk by one position.

    Args:
        window_size: number of (input, target) pairs the chunk supports.
            Defaults to 30, the value the training loop uses.

    Returns:
        A substring of `file` of exactly window_size + 1 characters.
    """
    # Fix 1: `window_size` used to be a free variable that only existed
    # inside train(), so calling this function raised NameError; it is now
    # a parameter with a matching default.
    # Fix 2: the upper bound used to be `file_len - window_size`, and
    # random.randint is inclusive at BOTH ends, so the slice could run past
    # the end of the corpus and come back one character short — crashing
    # backprop, which assumes a full-length chunk.
    start_index = random.randint(0, file_len - window_size - 1)
    end_index = start_index + window_size + 1
    return file[start_index:end_index]
def sm(y):
    """Numerically stable softmax of the logit column vector `y`."""
    # Subtracting the max before exponentiating prevents float overflow for
    # large logits; softmax is shift-invariant, so the result is unchanged.
    # (Also: PEP 8 prefers `def` over assigning a lambda to a name.)
    shifted = np.exp(y - np.max(y))
    return shifted / np.sum(shifted)


def sm_loss(v):
    """Cross-entropy loss given the probability `v` assigned to the true class."""
    return -np.log(v)
def sample_output(h, seed_ix, n, Wi, Wh, Wo, bh, by):
    """Sample `n` character indices from the model, one step at a time.

    `h` is the starting hidden state; `seed_ix` is the index of the seed
    character fed in at the first time step. Returns the list of sampled
    indices.
    """
    def one_hot(index):
        # Column-vector 1-of-k encoding of a character index.
        vec = np.zeros((vocab_size, 1))
        vec[index] = 1
        return vec

    x = one_hot(seed_ix)
    sampled = []
    for _ in range(n):
        # One recurrence step: hidden state, logits, softmax, then draw a
        # character from the resulting distribution and feed it back in.
        h = np.tanh(Wi @ x + Wh @ h + bh)
        logits = Wo @ h + by
        probs = sm(logits)
        ix = np.random.choice(range(vocab_size), p=probs.ravel())
        x = one_hot(ix)
        sampled.append(ix)
    return sampled
# Build the vocabulary: every distinct character in the corpus.
# Fix: sorted() makes the char<->index mapping deterministic across runs.
# Iterating a raw set depends on string-hash randomization, so the mapping
# (and therefore the whole training run) was irreproducible between
# interpreter invocations.
chars = sorted(set(file))
vocab_size = len(chars)
# Two-way lookup: chars_map['a'] -> index, chars_map.inverse[index] -> 'a'.
chars_map = bidict()
for i, v in enumerate(chars):
    chars_map[v] = i
def train(iterations):
    """Train the vanilla RNN with Adagrad on random chunks of the corpus.

    Args:
        iterations: number of gradient-update steps to perform.

    Side effects: prints a sampled text snippet and the smoothed loss
    every 1000 iterations. Reads the module-level corpus/vocabulary
    globals (`vocab_size`, `chars_map`, `random_chunk`, ...).
    """
    window_size = 30       # characters per training chunk (BPTT length)
    hidden_size = 100
    learning_rate = 1e-1
    Wi = np.random.randn(hidden_size, vocab_size) * 0.01   # input -> hidden
    # Fix: Wo was initialized without the 0.01 scale the other weight
    # matrices use, producing huge initial logits (and loss) for no reason.
    Wo = np.random.randn(vocab_size, hidden_size) * 0.01   # hidden -> output
    Wh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden -> hidden
    bh = np.zeros((hidden_size, 1))  # hidden bias
    by = np.zeros((vocab_size, 1))   # output bias
    # Adagrad accumulators: running sums of squared gradients per parameter.
    mWxh, mWhh, mWhy = np.zeros_like(Wi), np.zeros_like(Wh), np.zeros_like(Wo)
    mbh, mby = np.zeros_like(bh), np.zeros_like(by)
    # Smoothed loss, seeded at the loss of a uniform predictor.
    smooth_loss = -np.log(1.0 / vocab_size) * window_size
    # Fix: hprev must exist before the first forward() call. It used to be
    # assigned only from backprop()'s return value, so the very first
    # iteration raised NameError.
    hprev = np.zeros((hidden_size, 1))

    def forward(inputs, targets):
        """
        Run the RNN over one chunk.

        Args:
            inputs, targets are both List[int]
        Returns:
            sequence weights (xs, hs, ys, ps), loss
        """
        xs, hs, ys, ps = {}, {}, {}, {}
        hs[-1] = np.copy(hprev)  # carry the hidden state across chunks
        loss = 0
        for t, inp in enumerate(inputs):
            xs[t] = np.zeros((vocab_size, 1))  # encode in 1-of-k representation
            xs[t][inp] = 1
            hs[t] = np.tanh(Wi @ xs[t] + Wh @ hs[t - 1] + bh)  # hidden state
            ys[t] = Wo @ hs[t] + by  # unnormalized log probabilities for next chars
            ps[t] = sm(ys[t])
            loss += sm_loss(ps[t][targets[t], 0])
        return (xs, hs, ys, ps), loss

    def backprop(xs, hs, ys, ps):
        # Backward pass: accumulate gradients backwards through time (BPTT).
        # NOTE: reads `targets` and `loss` from the enclosing training loop,
        # which assigns both before each call.
        dWxh, dWhh, dWhy = np.zeros_like(Wi), np.zeros_like(Wh), np.zeros_like(Wo)
        dbh, dby = np.zeros_like(bh), np.zeros_like(by)
        dhnext = np.zeros_like(hs[0])
        # Robustness: walk exactly the steps forward() produced rather than
        # assuming the chunk was full-length.
        last = len(xs) - 1
        for t in reversed(range(last + 1)):
            dy = np.copy(ps[t])
            dy[targets[t]] -= 1  # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
            dWhy += dy @ hs[t].T
            dby += dy
            dh = Wo.T @ dy + dhnext  # backprop into h
            dhraw = (1 - hs[t] * hs[t]) * dh  # backprop through tanh nonlinearity
            dbh += dhraw
            dWxh += dhraw @ xs[t].T
            dWhh += dhraw @ hs[t - 1].T
            dhnext = Wh.T @ dhraw
        for dparam in (dWxh, dWhh, dWhy, dbh, dby):
            np.clip(dparam, -5, 5, out=dparam)  # clip to mitigate exploding gradients
        return loss, dWxh, dWhh, dWhy, dbh, dby, hs[last]

    for i in range(iterations):
        sample = random_chunk()
        inputs = [chars_map[ch] for ch in sample[:-1]]
        targets = [chars_map[ch] for ch in sample[1:]]
        (xs, hs, ys, ps), loss = forward(inputs, targets)
        loss, dWxh, dWhh, dWhy, dbh, dby, hprev = backprop(xs, hs, ys, ps)
        smooth_loss = smooth_loss * 0.999 + loss * 0.001  # exponential moving average
        if not i % 1000:
            sample_ix = sample_output(hprev, inputs[0], 40, Wi, Wh, Wo, bh, by)
            txt = ''.join(chars_map.inverse[ix] for ix in sample_ix)
            print(f'----\n {txt} \n----')
            print(f"iter {i}, loss: {smooth_loss}")  # print progress
        # Perform parameter update with Adagrad.
        for param, dparam, mem in zip([Wi, Wh, Wo, bh, by],
                                      [dWxh, dWhh, dWhy, dbh, dby],
                                      [mWxh, mWhh, mWhy, mbh, mby]):
            mem += dparam * dparam
            param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update
# Script entry: kick off a very long training run (progress and text
# samples are printed every 1000 iterations inside train()).
iterations = 10000000
train(iterations)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment