blandinw/gist:4258990

## gistfile1.matlab
% https://class.coursera.org/ml-2012-002

% Neural-network cost and gradient for a 3-layer network, computed one
% training example at a time, followed by L2 regularization.
%
% Reads from the enclosing scope: X, y, m, num_labels, Theta1, Theta2, lambda,
% and the helper functions sigmoid / sigmoidGradient (defined elsewhere).
% Updates: J, Theta1_grad, Theta2_grad.
% NOTE(review): J, Theta1_grad and Theta2_grad are accumulated into, so they
% are assumed to be zero-initialized before this point -- confirm in the caller.
%
% Only plain `x = x + ...` assignments and single-level indexing are used so
% the script runs in both MATLAB and Octave (`+=` and indexing the result of
% an expression, e.g. Theta2'(2:end,:), are Octave-only syntax).
for i = 1:m
    % one-hot row vector marking the actual class of this example
    actual = zeros(1, num_labels);
    actual(y(i)) = 1;

    % forward pass: activations of the 3 layers (a leading 1 is the bias unit)
    a1 = X(i,:);
    z2 = [1 a1] * Theta1';
    a2 = sigmoid(z2);
    z3 = [1 a2] * Theta2';
    a3 = sigmoid(z3);

    % Part 1 - cost
    % accumulate the log-likelihood of this example; the sign flip and the
    % 1/m factor are applied once, after the loop
    costs = actual.*log(a3) + (1-actual).*log(1-a3);
    J = J + sum(costs);

    % Part 2 - backprop
    % output-layer error as a column vector
    d3 = (a3 - actual)';
    % hidden-layer error; the bias column of Theta2 is dropped because no
    % error is propagated back to the bias unit
    d2 = (Theta2(:,2:end)' * d3) .* sigmoidGradient(z2)';

    % accumulate the per-example gradient contributions
    Theta2_grad = Theta2_grad + d3 * [1 a2];
    Theta1_grad = Theta1_grad + d2 * [1 a1];
end

% add regularization term (sum over every weight, skip bias columns)
Theta1_nobias = Theta1(:,2:end);
Theta2_nobias = Theta2(:,2:end);
regularization = lambda/(2*m) * sum([Theta1_nobias(:) ; Theta2_nobias(:)] .^ 2);
J = (-1/m * J) + regularization;

% average the accumulated gradients and add the regularization gradient
% (bias columns are not regularized)
Theta2_grad = 1/m * Theta2_grad;
Theta2_grad(:,2:end) = Theta2_grad(:,2:end) + (lambda/m)*Theta2_nobias;

Theta1_grad = 1/m * Theta1_grad;
Theta1_grad(:,2:end) = Theta1_grad(:,2:end) + (lambda/m)*Theta1_nobias;
	% https://class.coursera.org/ml-2012-002

	% Neural-network cost and gradient for a 3-layer network, computed one
	% training example at a time, followed by L2 regularization.
	%
	% Reads from the enclosing scope: X, y, m, num_labels, Theta1, Theta2,
	% lambda, and the helper functions sigmoid / sigmoidGradient (defined
	% elsewhere). Updates: J, Theta1_grad, Theta2_grad.
	% NOTE(review): J, Theta1_grad and Theta2_grad are accumulated into, so
	% they are assumed to be zero-initialized before this point -- confirm in
	% the caller.
	%
	% Only plain `x = x + ...` assignments and single-level indexing are used
	% so the script runs in both MATLAB and Octave.
	for i = 1:m
		% one-hot row vector marking the actual class of this example
		actual = zeros(1, num_labels);
		actual(y(i)) = 1;

		% forward pass: activations of the 3 layers (a leading 1 is the bias unit)
		a1 = X(i,:);
		z2 = [1 a1] * Theta1';
		a2 = sigmoid(z2);
		z3 = [1 a2] * Theta2';
		a3 = sigmoid(z3);

		% Part 1 - cost
		% accumulate the log-likelihood of this example (element-wise
		% products); the sign flip and the 1/m factor are applied after the loop
		costs = actual.*log(a3) + (1-actual).*log(1-a3);
		J = J + sum(costs);

		% Part 2 - backprop
		% output-layer error as a column vector
		d3 = (a3 - actual)';
		% hidden-layer error; the bias column of Theta2 is dropped because no
		% error is propagated back to the bias unit
		d2 = (Theta2(:,2:end)' * d3) .* sigmoidGradient(z2)';

		% accumulate the per-example gradient contributions
		Theta2_grad = Theta2_grad + d3 * [1 a2];
		Theta1_grad = Theta1_grad + d2 * [1 a1];
	end

	% add regularization term (sum over every weight, skip bias columns)
	Theta1_nobias = Theta1(:,2:end);
	Theta2_nobias = Theta2(:,2:end);
	regularization = lambda/(2*m) * sum([Theta1_nobias(:) ; Theta2_nobias(:)] .^ 2);
	J = (-1/m * J) + regularization;

	% average the accumulated gradients and add the regularization gradient
	% (bias columns are not regularized)
	Theta2_grad = 1/m * Theta2_grad;
	Theta2_grad(:,2:end) = Theta2_grad(:,2:end) + (lambda/m)*Theta2_nobias;

	Theta1_grad = 1/m * Theta1_grad;
	Theta1_grad(:,2:end) = Theta1_grad(:,2:end) + (lambda/m)*Theta1_nobias;