Simple 2-layer feedforward neural network from scratch with the brand-new SELU activation
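SELU's two constants are chosen so that, for roughly zero-mean, unit-variance inputs and LeCun-style weights (variance 1/fan_in), activations keep mean near 0 and variance near 1 as depth grows. A minimal sketch of that self-normalizing property, assuming the helper file below is saved as layers.py (the training script imports it under that name); the width, depth, and seed are arbitrary choices, not part of the original gist:

import numpy as np
from layers import selu

np.random.seed(0)
h = np.random.randn(1000, 256)  # zero-mean, unit-variance input batch
for depth in range(20):
    W = np.random.randn(256, 256) / np.sqrt(256)  # LeCun-style init: variance 1/fan_in
    h = selu(h.dot(W))
print('after 20 SELU layers: mean={:+.3f}, std={:.3f}'.format(h.mean(), h.std()))
# expect the mean to stay near 0 and the std near 1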
layers.py:
import numpy as np


def sigmoid(v):
    return 1 / (1 + np.exp(-v))

def one_hot(x):
    N = x.size
    D = x.max() + 1
    label = np.zeros((N, D))  # np.zeros takes the shape as a single tuple
    label[np.arange(N), x] = 1.
    return label

def selu(x):
    # SELU constants from Klambauer et al. (2017), "Self-Normalizing Neural Networks"
    alpha = 1.6732632423543772848170429916717
    lamb = 1.0507009873554804934193349852946
    return lamb * np.where(x > 0., x, alpha * np.exp(x) - alpha)


def selu_backward(x):
    # derivative of selu; expects the pre-activation as input
    alpha = 1.6732632423543772848170429916717
    lamb = 1.0507009873554804934193349852946
    return lamb * np.where(x > 0., 1., alpha * np.exp(x))

def softmax(logits):
    e = np.exp(logits - logits.max(axis=1, keepdims=True))  # shift logits for numerical stability
    return e / np.expand_dims(e.sum(axis=1), axis=1)

def get_data_batch(data, label, batch_size):
    # sample a mini-batch without replacement
    N = data.shape[0]
    mask = np.random.choice(N, batch_size, replace=False)
    return data[mask, :], label[mask, :]


def loss(prob, label):
    # cross-entropy: mean over the batch, sum over classes
    ce = -label * np.log(prob)
    return ce.mean(axis=0).sum()

def conv_forward(x, w, b, stride, pad):
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape
    H_out = 1 + (H + 2 * pad - HH) // stride
    W_out = 1 + (W + 2 * pad - WW) // stride
    out = np.empty([N, F, H_out, W_out])
    x_pad = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), 'constant',
                   constant_values=0)
    for u in xrange(H_out):
        for v in xrange(W_out):
            # patch.shape = N, C, HH, WW
            patch = x_pad[:, :, stride * u:stride * u + HH, stride * v:stride * v + WW]
            # im2col-style: flatten each patch and each filter, then one matrix multiply per location
            out[:, :, u, v] = np.dot(patch.reshape(patch.shape[0], -1),
                                     w.reshape(w.shape[0], -1).T) + b[None, :]
    return out

def conv_backward(dout, x, w, b, stride, pad):
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape  # F, C, HH, WW
    _, _, H_out, W_out = dout.shape  # N, F, H', W'
    dw, db = np.zeros_like(w), np.zeros_like(b)
    x_pad = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), 'constant',
                   constant_values=0)
    dx = np.zeros_like(x_pad)
    for u in xrange(H_out):
        for v in xrange(W_out):
            patch = x_pad[:, :, stride * u:stride * u + HH, stride * v:stride * v + WW]
            dout_patch = dout[:, :, u, v]
            dx[:, :, stride * u:stride * u + HH, stride * v:stride * v + WW] += \
                np.dot(dout_patch, w.reshape(w.shape[0], -1)).reshape(N, C, HH, WW)
            dw += dout_patch.T.dot(patch.reshape(patch.shape[0], -1)).reshape(F, C, HH, WW)
            db += dout_patch.sum(axis=0).flatten()
    if pad > 0:  # strip the padding rows/columns from the input gradient
        dx = dx[:, :, pad:-pad, pad:-pad]
    return dx, dw, db
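
The convolution routines above are easy to get subtly wrong, so a numerical gradient check is a cheap sanity test. A minimal sketch, not part of the original gist, again assuming the file above is saved as layers.py; the shapes and seed are arbitrary:

import numpy as np
from layers import conv_forward, conv_backward

np.random.seed(0)
x = np.random.randn(2, 3, 5, 5)
w = np.random.randn(4, 3, 3, 3)
b = np.random.randn(4)
stride, pad = 1, 1

out = conv_forward(x, w, b, stride, pad)
dout = np.random.randn(*out.shape)
dx, dw, db = conv_backward(dout, x, w, b, stride, pad)

def numerical_grad(f, arr, dout, eps=1e-6):
    # central differences: perturb one entry of arr at a time and re-run the forward pass
    grad = np.zeros_like(arr)
    it = np.nditer(arr, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        idx = it.multi_index
        old = arr[idx]
        arr[idx] = old + eps
        pos = f()
        arr[idx] = old - eps
        neg = f()
        arr[idx] = old
        grad[idx] = np.sum((pos - neg) * dout) / (2 * eps)
        it.iternext()
    return grad

forward = lambda: conv_forward(x, w, b, stride, pad)
for name, analytic, arr in [('dx', dx, x), ('dw', dw, w), ('db', db, b)]:
    num = numerical_grad(forward, arr, dout)
    print('{}: max abs error = {:.2e}'.format(name, np.abs(analytic - num).max()))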

The second file in the gist is the training script; it imports the helpers above from layers.py and trains the 2-layer network on MNIST:

import numpy as np

from layers import selu, selu_backward, softmax, get_data_batch, loss

np.random.seed(1234)  # Quality control

def net(x, params):
    W1, W2, b1, b2 = params
    # h = sigmoid(x.dot(W1) + b1)
    a = x.dot(W1) + b1
    # h[h < 0] *= 0.1  # Leaky ReLU
    h = selu(a)
    out = softmax(h.dot(W2) + b2)
    # cache both the pre-activation (needed by selu_backward) and the hidden activation
    return out, (a, h)


def gradient(dout, x, params, cache):
    W1, W2, b1, b2 = params
    a, h = cache
    db2 = dout.sum(axis=0)[None, :]
    dW2 = h.T.dot(dout)
    dh = dout.dot(W2.T)
    # dh *= h * (1 - h)  # sigmoid
    # dh[h < 0] *= 0.1   # Leaky ReLU
    dh *= selu_backward(a)  # the SELU derivative takes the pre-activation, not h
    db1 = dh.sum(axis=0)[None, :]
    dW1 = x.T.dot(dh)
    return dW1, dW2, db1, db2

def train():
    import os
    from keras.datasets import mnist
    from keras.utils.np_utils import to_categorical
    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    y_train, y_test = to_categorical(y_train, 10), to_categorical(y_test, 10)
    X_train, X_test = X_train / 255., X_test / 255.
    X_train, X_test = X_train.reshape(-1, 784), X_test.reshape(-1, 784)
    # shapes of W1, W2, b1, b2
    params_shapes = [[784, 256], [256, 10], [1, 256], [1, 10]]
    params = [0.01 * np.random.randn(h, w) for h, w in params_shapes]
    grad_cache = [np.zeros(shape) for shape in params_shapes]  # momentum (velocity) buffers
    lr = 1e-3
    gamma = 0.9
    for i in xrange(5000):
        X, y = get_data_batch(X_train, y_train, 128)
        prob, cache = net(X, params)
        # Nesterov: evaluate the gradient at the lookahead point params - gamma * velocity
        grad = gradient(prob - y, X, [p - gamma * v for p, v in zip(params, grad_cache)], cache)
        # grad = gradient(prob - y, X, params, cache)  # plain momentum
        for g, gc in zip(grad, grad_cache):
            gc *= gamma
            gc += lr * g
        if i % 100 == 0:
            print 'Loss: {:.5f}'.format(loss(prob, y))
        for w, d in zip(params, grad_cache):
            w -= d
    prob, _ = net(X_test, params)
    print 'Validation Accuracy: {:.4f}'.format(np.mean(prob.argmax(axis=1) == y_test.argmax(axis=1)))
    if not os.path.isdir('saved_networks/np_weights'):
        os.makedirs('saved_networks/np_weights')  # make sure the output directory exists
    for i, w in enumerate(params):
        np.save('saved_networks/np_weights/{}.npy'.format(i), w)


if __name__ == '__main__':
    train()
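
For completeness, a small inference sketch, not part of the original gist: it reloads the weights that train() saved and re-computes test accuracy. It assumes it is appended to the training script above so that net() is in scope; evaluate_saved is a hypothetical helper name.

def evaluate_saved():
    from keras.datasets import mnist
    # weights were saved in the order W1, W2, b1, b2
    params = [np.load('saved_networks/np_weights/{}.npy'.format(i)) for i in range(4)]
    (_, _), (X_test, y_test) = mnist.load_data()  # y_test stays as integer labels here
    X_test = (X_test / 255.).reshape(-1, 784)
    prob, _ = net(X_test, params)
    print('Test accuracy: {:.4f}'.format(np.mean(prob.argmax(axis=1) == y_test)))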