@justanotherminh
Last active September 1, 2018 23:40
Simple 2-layer feedforward neural network from scratch with the brand-new SELU activation
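For reference, the SELU activation used below (Klambauer et al., 2017, "Self-Normalizing Neural Networks") and its derivative are

$$\operatorname{selu}(x) = \lambda \begin{cases} x, & x > 0 \\ \alpha(e^{x} - 1), & x \le 0 \end{cases}
\qquad
\operatorname{selu}'(x) = \lambda \begin{cases} 1, & x > 0 \\ \alpha e^{x}, & x \le 0 \end{cases}$$

with λ ≈ 1.0507 and α ≈ 1.6733, constants chosen so that activations are pushed toward zero mean and unit variance. The first block below is layers.py (the training script further down imports from it); the second block is the training script itself.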
import numpy as np


def sigmoid(v):
    return 1 / (1 + np.exp(-v))


def one_hot(x):
    # Convert a vector of integer class labels into a one-hot matrix.
    N = x.size
    D = x.max() + 1
    label = np.zeros((N, D))  # shape must be passed as a single tuple
    label[np.arange(N), x] = 1.
    return label
def selu(x):
    # Constants from the SELU paper (Klambauer et al., 2017).
    alpha = 1.6732632423543772848170429916717
    lamb = 1.0507009873554804934193349852946
    return lamb * np.where(x > 0., x, alpha * np.exp(x) - alpha)


def selu_backward(x):
    # Derivative of SELU with respect to its pre-activation input x.
    alpha = 1.6732632423543772848170429916717
    lamb = 1.0507009873554804934193349852946
    return lamb * np.where(x > 0., 1., alpha * np.exp(x))
def softmax(logits):
    # Subtract the row-wise max before exponentiating for numerical stability.
    e = np.exp(logits - logits.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)


def get_data_batch(data, label, batch_size):
    # Sample a random mini-batch without replacement.
    N = data.shape[0]
    mask = np.random.choice(N, batch_size, replace=False)
    return data[mask, :], label[mask, :]


def loss(prob, label):
    # Cross-entropy averaged over the batch (label is one-hot).
    ce = -label * np.log(prob)
    return ce.mean(axis=0).sum()
def conv_forward(x, w, b, stride, pad):
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape
    H_out = 1 + (H + 2 * pad - HH) // stride
    W_out = 1 + (W + 2 * pad - WW) // stride
    out = np.empty([N, F, H_out, W_out])
    x_pad = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), 'constant')
    for u in range(H_out):
        for v in range(W_out):
            # patch.shape = N, C, HH, WW
            patch = x_pad[:, :, stride * u:stride * u + HH, stride * v:stride * v + WW]
            # Why didn't I figure this out earlier?
            # Flatten each patch and each filter so one output position is a single
            # (N, C*HH*WW) x (C*HH*WW, F) matrix multiply (im2col-style).
            out[:, :, u, v] = np.dot(patch.reshape(patch.shape[0], -1),
                                     w.reshape(w.shape[0], -1).T) + b[None, :]
    return out
def conv_backward(dout, x, w, b, stride, pad):
    N, C, H, W = x.shape
    F, _, HH, WW = w.shape  # F, C, HH, WW
    _, _, H_out, W_out = dout.shape  # N, F, H', W'
    dw, db = np.zeros_like(w), np.zeros_like(b)
    x_pad = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), 'constant')
    dx = np.zeros_like(x_pad)
    for u in range(H_out):
        for v in range(W_out):
            patch = x_pad[:, :, stride * u:stride * u + HH, stride * v:stride * v + WW]
            dout_patch = dout[:, :, u, v]
            dx[:, :, stride * u:stride * u + HH, stride * v:stride * v + WW] += \
                np.dot(dout_patch, w.reshape(w.shape[0], -1)).reshape(N, C, HH, WW)
            dw += dout_patch.T.dot(patch.reshape(patch.shape[0], -1)).reshape(F, C, HH, WW)
            db += dout_patch.sum(axis=0).flatten()
    if pad > 0:  # only strip the padding if there is any; [pad:-pad] with pad=0 would empty the array
        dx = dx[:, :, pad:-pad, pad:-pad]
    return dx, dw, db
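As a quick sanity check on the two conv helpers above, here is a minimal numeric gradient check, assuming the first block is saved as layers.py (as the training script's import suggests). The helper names rel_error and num_grad and the tiny tensor shapes are illustrative choices, not part of the original gist.

import numpy as np
from layers import conv_forward, conv_backward


def rel_error(a, b):
    # Maximum relative difference between two arrays.
    return np.max(np.abs(a - b) / np.maximum(1e-8, np.abs(a) + np.abs(b)))


def num_grad(f, x, dout, h=1e-5):
    # Central-difference gradient of sum(f(x) * dout) with respect to x.
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        old = x[idx]
        x[idx] = old + h
        pos = np.sum(f(x) * dout)
        x[idx] = old - h
        neg = np.sum(f(x) * dout)
        x[idx] = old
        grad[idx] = (pos - neg) / (2. * h)
        it.iternext()
    return grad


np.random.seed(0)
x = np.random.randn(2, 3, 5, 5)
w = np.random.randn(4, 3, 3, 3)
b = np.random.randn(4)
stride, pad = 1, 1

out = conv_forward(x, w, b, stride, pad)
dout = np.random.randn(*out.shape)
dx, dw, db = conv_backward(dout, x, w, b, stride, pad)

print('dx error:', rel_error(dx, num_grad(lambda x: conv_forward(x, w, b, stride, pad), x, dout)))
print('dw error:', rel_error(dw, num_grad(lambda w: conv_forward(x, w, b, stride, pad), w, dout)))
print('db error:', rel_error(db, num_grad(lambda b: conv_forward(x, w, b, stride, pad), b, dout)))

All three reported errors should be on the order of 1e-8 or smaller if the forward and backward passes agree.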
import numpy as np
from layers import selu, selu_backward, softmax, get_data_batch, loss
np.random.seed(1234) # Quality control
def net(x, params):
    W1, W2, b1, b2 = params
    # h = sigmoid(x.dot(W1) + b1)
    a = x.dot(W1) + b1
    # h[h < 0] *= 0.1  # Leaky ReLU
    h = selu(a)
    out = softmax(h.dot(W2) + b2)
    return out, (a, h)  # cache the pre-activation too; the SELU derivative needs it
def gradient(dout, x, params, cache):
    W1, W2, b1, b2 = params
    a, h = cache
    db2 = dout.sum(axis=0)[None, :]
    dW2 = h.T.dot(dout)
    dh = dout.dot(W2.T)
    # dh *= h * (1 - h)
    # dh[h < 0] *= 0.1
    dh *= selu_backward(a)  # evaluate the SELU derivative at the pre-activation a, not at h
    db1 = dh.sum(axis=0)[None, :]
    dW1 = x.T.dot(dh)
    return dW1, dW2, db1, db2
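For reference, the backward pass above is the chain rule written out for this two-layer net. With softmax outputs and cross-entropy loss, the gradient at the output logits collapses to prob - y, which is why train() below passes prob - y as dout (the 1/N factor from averaging over the batch is effectively folded into the learning rate):

$$\begin{aligned}
a &= x W_1 + b_1, \qquad h = \operatorname{selu}(a), \qquad p = \operatorname{softmax}(h W_2 + b_2),\\
\nabla_{W_2} L &= h^{\top}(p - y), \qquad \nabla_{b_2} L = \textstyle\sum_{n}(p - y),\\
\nabla_{a} L &= \big[(p - y) W_2^{\top}\big] \odot \operatorname{selu}'(a),\\
\nabla_{W_1} L &= x^{\top} \nabla_{a} L, \qquad \nabla_{b_1} L = \textstyle\sum_{n} \nabla_{a} L.
\end{aligned}$$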
def train():
    from keras.datasets import mnist
    from keras.utils.np_utils import to_categorical
    (X_train, y_train), (X_test, y_test) = mnist.load_data()
    y_train, y_test = to_categorical(y_train, 10), to_categorical(y_test, 10)
    X_train, X_test = X_train / 255., X_test / 255.
    X_train, X_test = X_train.reshape(-1, 784), X_test.reshape(-1, 784)
    params_shapes = [[784, 256], [256, 10], [1, 256], [1, 10]]  # W1, W2, b1, b2
    params = [0.01 * np.random.randn(h, w) for h, w in params_shapes]
    grad_cache = [np.zeros(shape) for shape in params_shapes]  # momentum buffers
    lr = 1e-3
    gamma = 0.9
    for i in range(5000):
        X, y = get_data_batch(X_train, y_train, 128)
        prob, cache = net(X, params)
        grad = gradient(prob - y, X, [p - gamma * v for p, v in zip(params, grad_cache)], cache)  # Nesterov
        # grad = gradient(prob - y, X, params, cache)
        for g, gc in zip(grad, grad_cache):
            gc *= gamma
            gc += lr * g
        if i % 100 == 0:
            print('Loss: {:.5f}'.format(loss(prob, y)))
        for w, d in zip(params, grad_cache):
            w -= d
    prob, _ = net(X_test, params)
    print('Validation Accuracy: {:.4f}'.format(np.mean(prob.argmax(axis=1) == y_test.argmax(axis=1))))
    import os
    os.makedirs('saved_networks/np_weights', exist_ok=True)  # make sure the target directory exists
    for i, w in enumerate(params):
        np.save('saved_networks/np_weights/{}.npy'.format(i), w)


if __name__ == '__main__':
    train()
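A small usage sketch, not part of the original gist: reload the weights that train() saves and classify the MNIST test set. The module name mnist_selu is an assumption; adjust it to whatever the training script above is saved as.

import numpy as np
from keras.datasets import mnist
from mnist_selu import net  # hypothetical file name for the training script above

# Weights were saved in the order W1, W2, b1, b2.
params = [np.load('saved_networks/np_weights/{}.npy'.format(i)) for i in range(4)]
(_, _), (X_test, y_test) = mnist.load_data()
X_test = (X_test / 255.).reshape(-1, 784)

prob, _ = net(X_test, params)
pred = prob.argmax(axis=1)
print('Test accuracy: {:.4f}'.format(np.mean(pred == y_test)))
print('First ten predictions:', pred[:10], 'labels:', y_test[:10])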