|
# -*- coding: utf-8 -*- |
|
"""Neural Network class for Assignment 3 |
|
|
|
This module implements the class MLP, a three-layer neural network

(two tanh hidden layers and a softmax output) for solving the two-spiral problem.
|
|
|
__author__ = "Lester James V. Miranda" |
|
__email__ = "lester.miranda@toki.waseda.jp" |
|
""" |
|
import numpy as np |
|
|
|
class MLP(object): |
|
"""This is a three-layer neural network for solving the two-spiral problem |
|
for the Neural Networks Class Spring 2017. The network has one hidden layer, |
|
and has a tanh activation function after the first fully-connected net. Thus, |
|
|
|
input_layer ---- hidden_layer x 2 ---- output_layer |
|
[tanh] [softmax] |
|
|
|
    To use this class, simply initialize the model and train it:

        model = MLP()  # assuming you are using the default parameters

        model.train(X, y)

        pred = model.predict(X)
|
|
|
""" |
|
def __init__(self, n_inputs=2, n_hidden=20, n_classes=2, std=1e-4): |
|
"""Initializes the parameters of the neural network. |
|
Here, we are initializing the weights into small values, whereas |
|
the biases are initialized to zero. |
|
|
|
        Inputs:

        - n_inputs: dimensionality of the input.

        - n_hidden: nb. of nodes in each hidden layer.

        - n_classes: nb. of classes in the output layer.

        - std: standard deviation of the normal distribution used to

          sample the initial weights.
|
|
|
""" |
|
|
|
# Initialize the parameters |
|
self.params = {} |
|
|
|
# First layer weights and biases |
|
self.params['W1'] = std * np.random.randn(n_inputs, n_hidden) |
|
        self.params['b1'] = np.zeros(n_hidden)
|
|
|
# Second layer weights and biases |
|
self.params['W2'] = std * np.random.randn(n_hidden, n_hidden) |
|
        self.params['b2'] = np.zeros(n_hidden)
|
|
|
# Output layer weights and biases |
|
self.params['W3'] = std * np.random.randn(n_hidden, n_classes) |
|
        self.params['b3'] = np.zeros(n_classes)
|
|
|
# Initialize the velocities |
|
self.velocity = {} |
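        # The velocities hold the running momentum term for each parameter;

        # starting them at zero makes the first update plain gradient descent.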
|
|
|
# First layer velocity |
|
self.velocity['W1'] = np.zeros((n_inputs, n_hidden)) |
|
self.velocity['b1'] = np.zeros(n_hidden) |
|
|
|
        # Second layer velocity
|
self.velocity['W2'] = np.zeros((n_hidden, n_hidden)) |
|
self.velocity['b2'] = np.zeros(n_hidden) |
|
|
|
        # Output layer velocity
|
self.velocity['W3'] = np.zeros((n_hidden, n_classes)) |
|
self.velocity['b3'] = np.zeros(n_classes) |
|
|
|
|
|
|
|
def loss(self, X, y=None, reg=0.0): |
|
""" |
|
        Compute the loss and gradients for the three-layer fully-connected

        neural network.
|
|
|
        Inputs:

        - X: input data of shape (n_examples, n_features).

        - y: vector of training labels.

        - reg: regularization strength.



        Returns:

        - If y is None, returns the score (logits) matrix.

        - If y is not None, returns a tuple of:

            loss: computed loss (data loss plus regularization loss).

            grads: dictionary containing the gradients of all parameters.
|
|
|
""" |
|
|
|
W1, b1 = self.params['W1'], self.params['b1'] |
|
W2, b2 = self.params['W2'], self.params['b2'] |
|
W3, b3 = self.params['W3'], self.params['b3'] |
|
N, D = X.shape |
|
|
|
#----------------- Forward propagation ------------------ |
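        # Shapes: X is (N, D); z1 and a1 are (N, n_hidden); z2 and a2 are

        # (N, n_hidden); z3 (the logits) is (N, n_classes).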
|
|
|
z1 = X.dot(W1) + b1 # First layer pre-activation |
|
a1 = np.tanh(z1) # First layer activation (using tanh) |
|
z2 = a1.dot(W2) + b2 # Second layer pre-activation |
|
a2 = np.tanh(z2) # Second layer activation (using tanh) |
|
z3 = a2.dot(W3) + b3 # Third layer pre-activation |
|
logits = z3 # Keep logits |
|
|
|
if y is None: |
|
return logits |
|
|
|
#------------ Perform softmax cross-entropy ------------ |
|
|
|
        # Compute the softmax probabilities

        exp_scores = np.exp(logits - np.max(logits, axis=1, keepdims=True))  # shift logits for numerical stability

        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
|
|
|
        # Compute the cross-entropy loss

        correct_logprobs = -np.log(probs[range(N), y])

        data_loss = np.sum(correct_logprobs) / N
|
reg_loss = 0.5 * reg * (np.sum(W1 * W1) + np.sum(W2 * W2) + np.sum(W3 * W3)) |
|
loss = data_loss + reg_loss |
|
|
|
#------------------- Backpropagation -------------------- |
|
grads = {} |
|
|
|
# Compute the logits gradients |
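        # For softmax with cross-entropy, the gradient w.r.t. the logits has

        # the closed form dL/dz3 = (probs - one_hot(y)) / N, implemented below

        # by subtracting 1 at the true-class entries and dividing by N.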
|
dlogits = probs |
|
        dlogits[range(N), y] -= 1
|
dlogits /= N |
|
|
|
# Propagate the loss back to the output layer |
|
grads['W3'] = np.dot(a2.T, dlogits) |
|
grads['b3'] = np.sum(dlogits, axis=0) |
|
|
|
# Compute the hidden layer 2 gradient |
|
dhidden_2 = np.multiply(self._tanh_deriv(z2), np.dot(dlogits, W3.T)) |
|
|
|
# Propagate the loss back to hidden layer 2 |
|
grads['W2'] = np.dot(a1.T, dhidden_2) |
|
        grads['b2'] = np.sum(dhidden_2, axis=0)
|
|
|
# Compute the hidden layer 1 gradient |
|
dhidden_1 = np.multiply(self._tanh_deriv(z1), np.dot(dhidden_2, W2.T)) |
|
|
|
# Propagate the loss back to hidden layer 1 |
|
grads['W1'] = np.dot(X.T, dhidden_1) |
|
grads['b1'] = np.sum(dhidden_1, axis=0) |
|
|
|
# Accumulate gradients in reg term |
|
grads['W3'] += reg * W3 |
|
grads['W2'] += reg * W2 |
|
grads['W1'] += reg * W1 |
|
|
|
return loss, grads |
|
|
|
    def train(self, X, y, learning_rate=0.5, mu=0.05, num_iters=50000,

              print_step=1000, reg_param=0.0, verbose=1):
|
"""Trains the created neural network model using the parameters stated. |
|
|
|
        Inputs:

        - X: input data of shape (N, D). Each X[i] is a training sample.

          A single sample must still be 2-D, e.g. np.array([[x1, x2]]).

        - y: vector of training labels.

        - learning_rate: learning rate used in the gradient descent updates.

        - mu: momentum coefficient for the velocity updates.

        - num_iters: number of training iterations.

        - print_step: interval (in iterations) between progress reports.

        - reg_param: regularization strength passed to the loss function.

        - verbose: if >= 1, prints a summary after training; if >= 2, also

          prints progress every print_step iterations.



        Returns:

        - A dict with per-iteration 'loss_history' and 'acc_history' lists.
|
""" |
|
|
|
# Define history list |
|
loss_history = [] |
|
train_acc_history = [] |
|
|
|
for i in range(num_iters): |
|
            # Perform forward propagation and compute the loss and gradients.

            loss, grads = self.loss(X, y, reg_param)
|
|
|
            # Append the current loss to the history list.
|
loss_history.append(loss) |
|
|
|
# Add momentum |
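            # Classical momentum update: v <- mu * v - lr * dW, then W <- W + v.

            # The velocity accumulates past gradients so updates keep moving in

            # consistent directions and oscillations are damped.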
|
self.velocity['W1'] = (self.velocity['W1'] * mu) - learning_rate * grads['W1'] |
|
self.velocity['b1'] = (self.velocity['b1'] * mu) - learning_rate * grads['b1'] |
|
self.velocity['W2'] = (self.velocity['W2'] * mu) - learning_rate * grads['W2'] |
|
self.velocity['b2'] = (self.velocity['b2'] * mu) - learning_rate * grads['b2'] |
|
self.velocity['W3'] = (self.velocity['W3'] * mu) - learning_rate * grads['W3'] |
|
self.velocity['b3'] = (self.velocity['b3'] * mu) - learning_rate * grads['b3'] |
|
|
|
# Adjust the neural network parameters |
|
self.params['W1'] += self.velocity['W1'] |
|
self.params['b1'] += self.velocity['b1'] |
|
self.params['W2'] += self.velocity['W2'] |
|
self.params['b2'] += self.velocity['b2'] |
|
self.params['W3'] += self.velocity['W3'] |
|
self.params['b3'] += self.velocity['b3'] |
|
|
|
# Check accuracy |
|
train_acc = (self.predict(X) == y).mean() |
|
train_acc_history.append(train_acc) |
|
|
|
            if (verbose >= 2) and ((i + 1) % print_step == 0):

                print('Iteration %d / %d: loss %f, acc %f' % (i + 1, num_iters, loss, train_acc))
|
|
|
|
|
        if verbose >= 1:

            print('Done! loss: %f, acc: %f' % (loss, train_acc))
|
|
|
return {'loss_history': loss_history, |
|
'acc_history': train_acc_history} |
|
|
|
def predict(self, X): |
|
"""Use the trained weights of the neural network to determine the class. |
|
The way this works is that it performs a feedforward propagation to compute |
|
for the logits, and use the logits to get the max. |
|
|
|
Inputs: |
|
- X: numpy ndarray of shape (N,D) giving N D-dimensional data points to |
|
          classify.
|
|
|
Returns: |
|
- y_pred: numpy ndarray prediction of shape (N,). |
|
""" |
|
z1 = X.dot(self.params['W1']) + self.params['b1'] |
|
a1 = np.tanh(z1) |
|
z2 = a1.dot(self.params['W2']) + self.params['b2'] |
|
a2 = np.tanh(z2) |
|
z3 = a2.dot(self.params['W3']) + self.params['b3'] |
|
logits = z3 |
|
y_pred = np.argmax(logits, axis=1) |
|
|
|
return y_pred |
|
|
|
    def _tanh_deriv(self, x):

        """Helper function to compute the first derivative of tanh,

        d/dx tanh(x) = 1 - tanh(x)**2.



        Input:

        - x: point at which the derivative is evaluated.
|
""" |
|
return 1.0 - np.tanh(x)**2 |
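

if __name__ == '__main__':

    # Minimal usage sketch, not part of the assignment spec: the spiral

    # generator below is an illustrative assumption, not the official

    # two-spiral data loader.

    np.random.seed(0)

    n_points = 100

    theta = np.sqrt(np.random.rand(n_points)) * 4 * np.pi

    # Two interleaved spirals, the second rotated 180 degrees from the first.

    spiral_a = np.column_stack((theta * np.cos(theta), theta * np.sin(theta)))

    X = np.vstack((spiral_a, -spiral_a)) + 0.1 * np.random.randn(2 * n_points, 2)

    X /= np.abs(X).max()  # scale inputs to [-1, 1] to keep tanh from saturating

    y = np.concatenate((np.zeros(n_points, dtype=int), np.ones(n_points, dtype=int)))

    model = MLP()

    history = model.train(X, y, learning_rate=0.1, num_iters=5000, verbose=1)

    print('Final training accuracy: %f' % (model.predict(X) == y).mean())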