MATLAB ANN Engine
%%% Neural Network Engine Example
clearvars;
close all;

%%% 1. Declare total number of input neurons, hidden layer neurons and output
%%% neurons
input_neurons = 2;
hidden_neurons = [4 2];
output_neurons = 1;

%%% 2. Set total number of iterations
N = 10000;

%%% 3. Create synthetic dataset
rng(123);
M = 100;
tol = 0.1;
x1 = bsxfun(@plus, rand(M,2) - 0.5, [-0.5-tol, 0.5+tol]);  % Top-Left quadrant
x2 = bsxfun(@plus, rand(M,2) - 0.5, [0.5+tol, 0.5+tol]);   % Top-Right quadrant
x3 = bsxfun(@plus, rand(M,2) - 0.5, [-0.5-tol, -0.5-tol]); % Bottom-Left quadrant
x4 = bsxfun(@plus, rand(M,2) - 0.5, [0.5+tol, -0.5-tol]);  % Bottom-Right quadrant

% Create final dataset
X = [x1; x2; x3; x4];
Y = [-ones(M,1); ones(2*M,1); -ones(M,1)];
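% (Labeling note: the top-left and bottom-right clusters get class -1 while
% the top-right and bottom-left clusters get class +1, an XOR-style layout
% that no single linear boundary can separate; hence the hidden layers.)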
% Randomly sample and create training and test sets
per = 0.5;
m = numel(Y);
ind = randperm(m);
pt = floor(per*m);
Xtrain = X(ind(1:pt), :);
Ytrain = Y(ind(1:pt));
Xtest = X(ind(pt+1:end),:);
Ytest = Y(ind(pt+1:end));
mTrain = size(Xtrain,1);

%%% 4. Declare NN engine
NN = NeuralNet2([input_neurons, hidden_neurons, output_neurons]);
NN.LearningRate = 0.1;
NN.RegularizationType = 'L2';
NN.RegularizationRate = 0.01;
NN.ActivationFunction = 'Tanh';
NN.BatchSize = 10;
NN.Debug = true;
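% (Aside: with the 'Tanh' activation the network's outputs lie in (-1,1),
% which is why the labels above are coded as -1/+1 and why predictions are
% later thresholded at zero.)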
%%% 5. Find optimal weights
costVal = NN.train(Xtrain, Ytrain, N);

% Quick scatter of the split: training examples as crosses, test examples
% as dots
figure;
gscatter(Xtrain(:,1), Xtrain(:,2), Ytrain, [0 0.4470 0.7410; 0.8500 0.3250 0.0980], ...
    'x');
hold on;
gscatter(Xtest(:,1), Xtest(:,2), Ytest, [0 0.4470 0.7410; 0.8500 0.3250 0.0980], ...
    '.');

%%% 6. Compute predictions for training and testing data
% Threshold the raw network output at zero to obtain -1/+1 class labels
predictTrain = 2*double(NN.sim(Xtrain) >= 0) - 1;
predictTest = 2*double(NN.sim(Xtest) >= 0) - 1;

%%% 7. Compute classification accuracy for training and testing data
fprintf('Classification Accuracy for Training Data: %f\n', ...
    100*sum(predictTrain == Ytrain) / numel(Ytrain));
fprintf('Classification Accuracy for Testing Data: %f\n', ...
    100*sum(predictTest == Ytest) / numel(Ytest));
fprintf('The final cost for the optimal weights is: %f\n', costVal(end));
%%% 8. Plot the cost function per epoch
figure;
plot(1:N, costVal);
title('Cost function vs. epoch');
xlabel('Epoch');
ylabel('Cost Function');
grid;
%%% 9. Plot decision regions
% Use distinct names for the evaluation grid so the original dataset
% matrices X and Y are not overwritten
[Xgrid, Ygrid] = meshgrid(linspace(-1.25,1.25,1000));
Xout = [Xgrid(:) Ygrid(:)];
predictDes = 2*double(NN.sim(Xout) >= 0) - 1;
figure;
% Plot decision regions first
plot(Xout(predictDes==-1,1), Xout(predictDes==-1,2), '.', 'MarkerEdgeColor', ...
    [0.9290 0.6940 0.1250]);
hold on;
plot(Xout(predictDes==1,1), Xout(predictDes==1,2), '.', 'MarkerEdgeColor', ...
    [0.4940 0.1840 0.5560]);
% Now plot the points
plot(Xtrain(Ytrain==-1,1), Xtrain(Ytrain==-1,2), '.', 'MarkerEdgeColor', ...
    [0 0.4470 0.7410], 'MarkerSize', 12);
plot(Xtrain(Ytrain==1,1), Xtrain(Ytrain==1,2), '.', 'MarkerEdgeColor', ...
    [0.8500 0.3250 0.0980], 'MarkerSize', 12);
plot(Xtest(Ytest==-1,1), Xtest(Ytest==-1,2), 'x', 'MarkerEdgeColor', ...
    [0 0.4470 0.7410], 'MarkerSize', 12);
plot(Xtest(Ytest==1,1), Xtest(Ytest==1,2), 'x', 'MarkerEdgeColor', ...
    [0.8500 0.3250 0.0980], 'MarkerSize', 12);
axis tight;
legend('Decision Region - Negative', 'Decision Region - Positive', ...
    'Training Examples - Negative', 'Training Examples - Positive', ...
    'Test Examples - Negative', 'Test Examples - Positive');
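% A final sanity check (a minimal sketch, not part of the plots above):
% score a single hand-picked point with the trained network; anything deep
% in the top-right quadrant should come out positive.
score = NN.sim([0.75 0.75]);       % raw tanh output in (-1,1)
label = 2*double(score >= 0) - 1;  % threshold at zero => -1 or +1
fprintf('Score at (0.75, 0.75): %f (class %d)\n', score, label);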
NeuralNet2.m
classdef NeuralNet2 < handle
    properties (Access = public)
        LearningRate
        ActivationFunction
        RegularizationType
        RegularizationRate
        BatchSize
        Debug
    end

    properties (Access = private)
        inputSize
        hiddenSizes
        outputSize
        weights
    end

    methods
        % Class constructor
        function this = NeuralNet2(layerSizes)
            % default params
            this.LearningRate = 0.003;
            this.ActivationFunction = 'Tanh';
            this.RegularizationType = 'None';
            this.RegularizationRate = 0;
            this.BatchSize = 10;
            this.Debug = false;

            % network structure (fully-connected feed-forward)
            % Obtain input layer neuron size
            this.inputSize = layerSizes(1);
            % Obtain the hidden layer neuron sizes
            this.hiddenSizes = layerSizes(2:end-1);
            % Obtain the output layer neuron size
            this.outputSize = layerSizes(end);
            % Initialize the weight matrices relating the ith layer to the
            % (i+1)th layer; the extra row in each matrix holds the bias
            % weights
            this.weights = cell(1,numel(layerSizes)-1);
            for i=1:numel(layerSizes)-1
                this.weights{i} = zeros(layerSizes(i)+1, layerSizes(i+1));
            end
        end
        % Sanity-check that a dataset is compatible with the network
        % architecture (note: the computed input/output ranges are
        % currently unused)
        function configure(this, X, Y)
            % check correct sizes
            [xrows,xcols] = size(X);
            [yrows,ycols] = size(Y);
            assert(xrows == yrows);
            assert(xcols == this.inputSize);
            assert(ycols == this.outputSize);

            % min/max of inputs/outputs
            inMin = min(X);
            inMax = max(X);
            outMin = min(Y);
            outMax = max(Y);
        end
        % Initialize neural network weights
        function init(this)
            % initialize with random weights
            for i=1:numel(this.weights)
                num = numel(this.weights{i});
                this.weights{i}(:) = rand(num,1) - 0.5; % [-0.5,0.5]
            end
        end
        % Perform training with Stochastic Gradient Descent
        function perf = train(this, X, Y, numIter)
            % Ensure correct sizes
            assert(size(X,1) == size(Y,1))
            % Ensure the regularization rate and batch size are valid
            assert(this.BatchSize >= 1);
            assert(this.RegularizationRate >= 0);
            % Check if we have specified the right regularization type
            regType = this.RegularizationType;
            assert(any(strcmpi(regType, {'l1', 'l2', 'none'})));

            % Initialize cost function array
            perf = zeros(1, numIter);
            % Initialize weights
            init(this);
            % Total number of examples
            N = size(X,1);
            % Total number of applicable layers
            L = numel(this.weights);
            % Get batch size
            B = this.BatchSize;
            % Safely catch if batch size is larger than total number
            % of examples
            if B > N
                B = N;
            end

            % Cell array to store the inputs into each layer's neurons
            sNeuron = cell(1,L);
            % Cell array to store the outputs of each layer; the first
            % entry holds the network input (plus the bias column)
            xNeuron = cell(1,L+1);
            % Cell array for storing the sensitivities
            delta = cell(1,L);

            % For L1 regularization
            % Method used: http://aclweb.org/anthology/P/P09/P09-1054.pdf
            if strcmpi(regType, 'l1')
                % This represents the total L1 penalty that each
                % weight could have received up to current point
                uk = 0;
                % Total penalty for each weight that was received up to
                % current point
                qk = cell(1,L);
                for ii=1:L
                    qk{ii} = zeros(size(this.weights{ii}));
                end
            end
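            % (Sketch of the cumulative-penalty rule from the paper above:
            % uk accumulates the maximum total L1 penalty alpha*lambda/B
            % that any weight could have received so far, qk tracks the
            % penalty each weight has actually absorbed, and after every
            % plain SGD step each weight is pulled toward zero by the
            % outstanding difference, clamped so it never crosses zero.)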
            % Get activation function
            fcn = getActivationFunction(this.ActivationFunction);
            % Print roughly 10 progress updates when debugging
            skipFactor = floor(numIter/10);
            % For each iteration...
            for ii = 1:numIter
                % If the batch size is equal to the total number of examples,
                % don't bother with random selection as this will be a full
                % batch gradient descent
                if N == B
                    ind = 1 : N;
                else
                    % Randomly select examples corresponding to the batch size
                    % if the batch size is not equal to the number of examples
                    ind = randperm(N, B);
                end

                % Select out the training example features and expected outputs
                IN = X(ind, :);
                OUT = Y(ind, :);

                % Initialize input layer (append a column of ones to account
                % for the bias term)
                xNeuron{1} = [IN ones(B,1)];

                %%% Perform forward propagation
                % Make sure you save the inputs and outputs into each neuron
                % at the hidden and output layers
                for jj = 1:L
                    % Compute inputs into next layer
                    sNeuron{jj} = xNeuron{jj} * this.weights{jj};
                    % Compute outputs of this layer
                    if jj == L
                        xNeuron{jj+1} = fcn(sNeuron{jj});
                    else
                        xNeuron{jj+1} = [fcn(sNeuron{jj}) ones(B,1)];
                    end
                end
                %%% Perform backpropagation
                % Get derivative of activation function
                dfcn = getDerivativeActivationFunction(this.ActivationFunction);
                % Compute sensitivities for output layer
                delta{end} = (xNeuron{end} - OUT) .* dfcn(sNeuron{end});
                % Compute the sensitivities for the rest of the layers
                for jj = L-1 : -1 : 1
                    delta{jj} = dfcn(sNeuron{jj}) .* ...
                        (delta{jj+1}*(this.weights{jj+1}(1:end-1,:)).');
                end
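                % (In equation form, for the squared-error cost used here:
                %   delta_L = (xhat - y) .* f'(s_L)
                %   delta_j = f'(s_j) .* (delta_{j+1} * W_{j+1}')
                % where the bias row of W_{j+1} is dropped before the
                % product, since the constant bias input of layer j+1 does
                % not depend on layer j.)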
                %%% Compute weight updates
                alpha = this.LearningRate;
                lambda = this.RegularizationRate;
                for jj = 1 : L
                    % Obtain the outputs and sensitivities for each
                    % affected layer
                    XX = xNeuron{jj};
                    D = delta{jj};
                    % Calculate batch weight update
                    weight_update = (1/B)*sum(bsxfun(@times, permute(XX, [2 3 1]), ...
                        permute(D, [3 2 1])), 3);
                    % Apply L2 regularization if required
                    if strcmpi(regType, 'l2')
                        weight_update(1:end-1,:) = weight_update(1:end-1,:) + ...
                            (lambda/B)*this.weights{jj}(1:end-1,:);
                    end
                    % Compute the final update
                    this.weights{jj} = this.weights{jj} - alpha*weight_update;
                end
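                % (Note: the bsxfun/permute expression above is an explicit
                % per-example sum of outer products; it is numerically
                % equivalent to the more compact (1/B)*(XX.' * D).)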
                % Apply L1 regularization if required
                if strcmpi(regType, 'l1')
                    % Step #1 - Accumulate total L1 penalty that each
                    % weight could have received up to this point
                    uk = uk + (alpha*lambda/B);

                    % Step #2
                    % Using the updated weights, now apply the penalties
                    for jj = 1 : L
                        % 2a - Save previous weights and penalties
                        % Make sure to remove bias terms
                        z = this.weights{jj}(1:end-1,:);
                        q = qk{jj}(1:end-1,:);
                        % 2b - Using the previous weights, find the weights
                        % that are positive and negative
                        w = z;
                        indwp = w > 0;
                        indwn = w < 0;
                        % 2c - Perform the update on each condition
                        % individually
                        w(indwp) = max(0, w(indwp) - (uk + q(indwp)));
                        w(indwn) = min(0, w(indwn) + (uk - q(indwn)));
                        % 2d - Update the actual penalties
                        qk{jj}(1:end-1,:) = q + (w - z);
                        % Don't forget to update the actual weights!
                        this.weights{jj}(1:end-1,:) = w;
                    end
                end
                % Compute cost at this iteration
                perf(ii) = (0.5/B)*sum(sum((xNeuron{end} - OUT).^2));

                % Add in regularization if necessary
                if strcmpi(regType, 'l1')
                    for jj = 1 : L
                        perf(ii) = perf(ii) + ...
                            (lambda/B)*sum(sum(abs(this.weights{jj}(1:end-1,:))));
                    end
                elseif strcmpi(regType, 'l2')
                    for jj = 1 : L
                        perf(ii) = perf(ii) + ...
                            (0.5*lambda/B)*sum(sum((this.weights{jj}(1:end-1,:)).^2));
                    end
                end

                % Debugging output
                if this.Debug
                    if mod(ii,skipFactor) == 1 || ii == numIter
                        fprintf('Iteration #%d - Cost: %4.6e\n', ii, perf(ii));
                    end
                end
            end
        end
        % Perform forward propagation
        % Note that the bias weights occupy the last row of each weight matrix
        % Inputs are in a 2D matrix of N x M
        % N is the number of examples
        % M is the number of features / number of input neurons
        function OUT = sim(this, X)
            % Check if the total number of features matches the
            % total number of input neurons
            assert(size(X,2) == this.inputSize);
            % Get total number of examples
            N = size(X,1);

            %%% Begin algorithm
            % Start with first layer
            IN = X;
            % Get activation function
            fcn = getActivationFunction(this.ActivationFunction);
            % For each layer...
            for ii=1:numel(this.weights)
                % Compute inputs into each neuron and corresponding
                % outputs (append a column of ones for the bias)
                OUT = fcn([IN ones(N,1)] * this.weights{ii});
                % Save for next iteration
                IN = OUT;
            end
        end
    end
end
function fcn = getActivationFunction(activation)
    switch lower(activation)
        case 'linear'
            fcn = @f_linear;
        case 'relu'
            fcn = @f_relu;
        case 'tanh'
            fcn = @f_tanh;
        case 'sigmoid'
            fcn = @f_sigmoid;
        otherwise
            error('Unknown activation function');
    end
end

function fcn = getDerivativeActivationFunction(activation)
    switch lower(activation)
        case 'linear'
            fcn = @fd_linear;
        case 'relu'
            fcn = @fd_relu;
        case 'tanh'
            fcn = @fd_tanh;
        case 'sigmoid'
            fcn = @fd_sigmoid;
        otherwise
            error('Unknown activation function');
    end
end
% activation functions and their derivatives
function y = f_linear(x)
    % See also: purelin
    y = x;
end

function y = fd_linear(x)
    % See also: dpurelin
    y = ones(size(x));
end

function y = f_relu(x)
    % See also: poslin
    y = max(x, 0);
end

function y = fd_relu(x)
    % See also: dposlin
    y = double(x >= 0);
end

function y = f_tanh(x)
    % See also: tansig
    %y = 2 ./ (1 + exp(-2*x)) - 1;
    y = tanh(x);
end

function y = fd_tanh(x)
    % See also: dtansig
    y = f_tanh(x);
    y = 1 - y.^2;
end

function y = f_sigmoid(x)
    % See also: logsig
    y = 1 ./ (1 + exp(-x));
end

function y = fd_sigmoid(x)
    % See also: dlogsig
    y = f_sigmoid(x);
    y = y .* (1 - y);
end
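% Minimal usage sketch (assumes this file is saved as NeuralNet2.m on the
% MATLAB path; X is an N-x-2 feature matrix, Y an N-x-1 label vector in
% {-1, +1}):
%   net = NeuralNet2([2 4 1]);     % 2 inputs, one hidden layer of 4, 1 output
%   net.LearningRate = 0.05;
%   cost = net.train(X, Y, 5000);  % mini-batch SGD for 5000 iterations
%   yhat = 2*double(net.sim(X) >= 0) - 1;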