Skip to content

Instantly share code, notes, and snippets.

@petulla
Last active August 29, 2020 01:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save petulla/dc4b68046e170eaaa1db9ee247f02fd6 to your computer and use it in GitHub Desktop.
Save petulla/dc4b68046e170eaaa1db9ee247f02fd6 to your computer and use it in GitHub Desktop.
Simplified rewrite of Karpathy character-level language model with a Vanilla Recurrent Neural Network
import numpy as np
import random
import unidecode
from bidict import bidict
# Load the training corpus and transliterate it to plain ASCII so every
# character fits in a small vocabulary.
# Fix: use a context manager so the file handle is closed deterministically
# (the original `open(...).read()` leaked the handle), and pin the text
# encoding instead of relying on the platform default.
with open("./bin/input.txt", encoding="utf-8") as corpus:
    file = unidecode.unidecode(corpus.read())
file_len = len(file)
print("file_len =", file_len)
def random_chunk(window_size=30):
    """Return a random slice of the corpus of length window_size + 1.

    The extra character lets the caller form (input, target) pairs by
    offsetting the chunk by one position.

    Args:
        window_size: number of (input, target) pairs the chunk supports.
            Defaults to 30, the value the training loop uses.

    Returns:
        A substring of `file` of exactly window_size + 1 characters.
    """
    # Fix 1: `window_size` used to be a free variable that only existed
    # inside train(), so calling this function raised NameError; it is now
    # a parameter with a matching default.
    # Fix 2: the upper bound used to be `file_len - window_size`, and
    # random.randint is inclusive at BOTH ends, so the slice could run past
    # the end of the corpus and come back one character short — crashing
    # backprop, which assumes a full-length chunk.
    start_index = random.randint(0, file_len - window_size - 1)
    end_index = start_index + window_size + 1
    return file[start_index:end_index]
def sm(y):
    """Numerically stable softmax of the logit column vector `y`."""
    # Subtracting the max before exponentiating prevents float overflow for
    # large logits; softmax is shift-invariant, so the result is unchanged.
    # (Also: PEP 8 prefers `def` over assigning a lambda to a name.)
    shifted = np.exp(y - np.max(y))
    return shifted / np.sum(shifted)


def sm_loss(v):
    """Cross-entropy loss given the probability `v` assigned to the true class."""
    return -np.log(v)
def sample_output(h, seed_ix, n, Wi, Wh, Wo, bh, by):
    """Sample `n` character indices from the model, one step at a time.

    `h` is the starting hidden state; `seed_ix` is the index of the seed
    character fed in at the first time step. Returns the list of sampled
    indices.
    """
    def one_hot(index):
        # Column-vector 1-of-k encoding of a character index.
        vec = np.zeros((vocab_size, 1))
        vec[index] = 1
        return vec

    x = one_hot(seed_ix)
    sampled = []
    for _ in range(n):
        # One recurrence step: hidden state, logits, softmax, then draw a
        # character from the resulting distribution and feed it back in.
        h = np.tanh(Wi @ x + Wh @ h + bh)
        logits = Wo @ h + by
        probs = sm(logits)
        ix = np.random.choice(range(vocab_size), p=probs.ravel())
        x = one_hot(ix)
        sampled.append(ix)
    return sampled
# Build the vocabulary: every distinct character in the corpus.
# Fix: sorted() makes the char<->index mapping deterministic across runs.
# Iterating a raw set depends on string-hash randomization, so the mapping
# (and therefore the whole training run) was irreproducible between
# interpreter invocations.
chars = sorted(set(file))
vocab_size = len(chars)
# Two-way lookup: chars_map['a'] -> index, chars_map.inverse[index] -> 'a'.
chars_map = bidict()
for i, v in enumerate(chars):
    chars_map[v] = i
def train(iterations):
    """Train the vanilla RNN with Adagrad on random chunks of the corpus.

    Args:
        iterations: number of gradient-update steps to perform.

    Side effects: prints a sampled text snippet and the smoothed loss
    every 1000 iterations. Reads the module-level corpus/vocabulary
    globals (`vocab_size`, `chars_map`, `random_chunk`, ...).
    """
    window_size = 30       # characters per training chunk (BPTT length)
    hidden_size = 100
    learning_rate = 1e-1
    Wi = np.random.randn(hidden_size, vocab_size) * 0.01   # input -> hidden
    # Fix: Wo was initialized without the 0.01 scale the other weight
    # matrices use, producing huge initial logits (and loss) for no reason.
    Wo = np.random.randn(vocab_size, hidden_size) * 0.01   # hidden -> output
    Wh = np.random.randn(hidden_size, hidden_size) * 0.01  # hidden -> hidden
    bh = np.zeros((hidden_size, 1))  # hidden bias
    by = np.zeros((vocab_size, 1))   # output bias
    # Adagrad accumulators: running sums of squared gradients per parameter.
    mWxh, mWhh, mWhy = np.zeros_like(Wi), np.zeros_like(Wh), np.zeros_like(Wo)
    mbh, mby = np.zeros_like(bh), np.zeros_like(by)
    # Smoothed loss, seeded at the loss of a uniform predictor.
    smooth_loss = -np.log(1.0 / vocab_size) * window_size
    # Fix: hprev must exist before the first forward() call. It used to be
    # assigned only from backprop()'s return value, so the very first
    # iteration raised NameError.
    hprev = np.zeros((hidden_size, 1))

    def forward(inputs, targets):
        """
        Run the RNN over one chunk.

        Args:
            inputs, targets are both List[int]
        Returns:
            sequence weights (xs, hs, ys, ps), loss
        """
        xs, hs, ys, ps = {}, {}, {}, {}
        hs[-1] = np.copy(hprev)  # carry the hidden state across chunks
        loss = 0
        for t, inp in enumerate(inputs):
            xs[t] = np.zeros((vocab_size, 1))  # encode in 1-of-k representation
            xs[t][inp] = 1
            hs[t] = np.tanh(Wi @ xs[t] + Wh @ hs[t - 1] + bh)  # hidden state
            ys[t] = Wo @ hs[t] + by  # unnormalized log probabilities for next chars
            ps[t] = sm(ys[t])
            loss += sm_loss(ps[t][targets[t], 0])
        return (xs, hs, ys, ps), loss

    def backprop(xs, hs, ys, ps):
        # Backward pass: accumulate gradients backwards through time (BPTT).
        # NOTE: reads `targets` and `loss` from the enclosing training loop,
        # which assigns both before each call.
        dWxh, dWhh, dWhy = np.zeros_like(Wi), np.zeros_like(Wh), np.zeros_like(Wo)
        dbh, dby = np.zeros_like(bh), np.zeros_like(by)
        dhnext = np.zeros_like(hs[0])
        # Robustness: walk exactly the steps forward() produced rather than
        # assuming the chunk was full-length.
        last = len(xs) - 1
        for t in reversed(range(last + 1)):
            dy = np.copy(ps[t])
            dy[targets[t]] -= 1  # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
            dWhy += dy @ hs[t].T
            dby += dy
            dh = Wo.T @ dy + dhnext  # backprop into h
            dhraw = (1 - hs[t] * hs[t]) * dh  # backprop through tanh nonlinearity
            dbh += dhraw
            dWxh += dhraw @ xs[t].T
            dWhh += dhraw @ hs[t - 1].T
            dhnext = Wh.T @ dhraw
        for dparam in (dWxh, dWhh, dWhy, dbh, dby):
            np.clip(dparam, -5, 5, out=dparam)  # clip to mitigate exploding gradients
        return loss, dWxh, dWhh, dWhy, dbh, dby, hs[last]

    for i in range(iterations):
        sample = random_chunk()
        inputs = [chars_map[ch] for ch in sample[:-1]]
        targets = [chars_map[ch] for ch in sample[1:]]
        (xs, hs, ys, ps), loss = forward(inputs, targets)
        loss, dWxh, dWhh, dWhy, dbh, dby, hprev = backprop(xs, hs, ys, ps)
        smooth_loss = smooth_loss * 0.999 + loss * 0.001  # exponential moving average
        if not i % 1000:
            sample_ix = sample_output(hprev, inputs[0], 40, Wi, Wh, Wo, bh, by)
            txt = ''.join(chars_map.inverse[ix] for ix in sample_ix)
            print(f'----\n {txt} \n----')
            print(f"iter {i}, loss: {smooth_loss}")  # print progress
        # Perform parameter update with Adagrad.
        for param, dparam, mem in zip([Wi, Wh, Wo, bh, by],
                                      [dWxh, dWhh, dWhy, dbh, dby],
                                      [mWxh, mWhh, mWhy, mbh, mby]):
            mem += dparam * dparam
            param += -learning_rate * dparam / np.sqrt(mem + 1e-8)  # adagrad update
# Script entry: kick off a very long training run (progress and text
# samples are printed every 1000 iterations inside train()).
iterations = 10000000
train(iterations)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment