Instantly share code, notes, and snippets.

junichiro/file0.txt Last active Jan 10, 2017

 y \in \mathbf{R}^K \\ E.g. \begin{bmatrix} 1 \\ 0 \\ 0 \end{bmatrix} , \begin{bmatrix} 0 \\ 1 \\ 0 \end{bmatrix} , \begin{bmatrix} 0 \\ 0 \\ 1 \end{bmatrix}
 J(\theta) = -\frac{1}{m} \Bigl[\sum_{i=1}^m y^{(i)}\log h_\theta(x^{(i)}) + (1 - y^{(i)})\log(1 - h_\theta(x^{(i)}))\Bigr] + \frac{\lambda}{2m}\sum_{j=1}^n\theta_j^2
 J(\Theta) = -\frac{1}{m} \Bigl[\sum_{i=1}^m\sum_{k=1}^K y_k^{(i)}\log(h_\Theta(x^{(i)}))_k + (1 - y_k^{(i)})\log(1 - (h_\Theta(x^{(i)}))_k)\Bigr] + \frac{\lambda}{2m}\sum_{l=1}^{L-1}\sum_{i=1}^{s_l}\sum_{j=1}^{s_{l+1}}(\Theta_{ji}^{(l)})^2
 \begin{align} a^{ (1) } &= x \\ z^{ (2) } &= \Theta^{ (1) }a^{ (1) } \\ a^{ (2) } &= g(z^{ (2) }) \quad \text{(add } a_0^{(2)}\text{)} \\ z^{ (3) } &= \Theta^{ (2) }a^{ (2) } \\ a^{ (3) } &= h_\Theta(x) = g(z^{ (3) }) \end{align}
 \begin{align} \delta_j^{ (3) } &= a_j^{ (3) } - y_j \end{align}
 \begin{align} \delta^{ (3) } &= a^{ (3) } - y \end{align}
 \begin{align} \delta^{(2)} = (\Theta^{ (2) })^T \delta^{ (3) }.*g'(z^{ (2) }) \end{align}
 g'(z^{(l)}) = a^{(l)}.*(1-a^{(l)})
 function [J, grad] = nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lambda)
 % NNCOSTFUNCTION Regularized cross-entropy cost and gradient of a network
 % with one hidden layer (input -> hidden -> output).
 %
 %   nn_params         unrolled weights: the first
 %                     hidden_layer_size * (input_layer_size + 1) entries are
 %                     Theta1, the remainder is Theta2
 %   input_layer_size  number of input features (without the bias unit)
 %   hidden_layer_size number of hidden units (without the bias unit)
 %   num_labels        number of output units / classes
 %   X                 m-by-input_layer_size matrix, one example per row
 %   y                 m labels, used as column indices into the one-hot
 %                     matrix, so they must be integers in 1..num_labels;
 %                     row or column vector
 %   lambda            regularization strength
 %
 %   J                 scalar regularized cost
 %   grad              unrolled gradient [Theta1_grad(:); Theta2_grad(:)]
 %
 % Relies on sigmoid() and sigmoidGradient(), which are defined outside this
 % file; both are expected to operate element-wise (TODO confirm).

   % Recover the two weight matrices from the unrolled parameter vector.
   theta1_count = hidden_layer_size * (input_layer_size + 1);
   Theta1 = reshape(nn_params(1:theta1_count), ...
                    hidden_layer_size, input_layer_size + 1);
   Theta2 = reshape(nn_params((theta1_count + 1):end), ...
                    num_labels, hidden_layer_size + 1);

   m = size(X, 1);

   % One-hot encode the labels: row i of Y is the y(i)-th row of the identity.
   % y(:) makes this independent of whether y is a row or a column vector
   % (the earlier `1:size(y)` loop only used the first dimension of y).
   label_basis = eye(num_labels);
   Y = label_basis(y(:), :);

   % Forward propagation; a bias column of ones is prepended to a1 and a2.
   a1 = [ones(m, 1) X];
   z2 = a1 * Theta1';
   a2 = [ones(m, 1) sigmoid(z2)];
   z3 = a2 * Theta2';
   a3 = sigmoid(z3);
   hx = a3;

   % Cross-entropy cost summed over all examples and output units.
   J = -1 / m * sum(sum(Y .* log(hx) + (1 - Y) .* log(1 - hx)));

   % L2 penalty; the first column of each Theta (bias weights) is excluded.
   J = J + lambda / 2 / m * (sum(sum(Theta1(:, 2:end) .^ 2)) + ...
                             sum(sum(Theta2(:, 2:end) .^ 2)));

   % Backpropagation. z2 is padded with a dummy bias column so its shape
   % matches d3 * Theta2; that column is discarded right afterwards.
   d3 = a3 - Y;
   d2 = (d3 * Theta2) .* sigmoidGradient([ones(size(z2, 1), 1) z2]);
   d2 = d2(:, 2:end);

   % Regularization gradient terms, with a zero column for the bias weights.
   reg_scale = lambda / m;
   r2 = reg_scale * [zeros(size(Theta2, 1), 1) Theta2(:, 2:end)];
   r1 = reg_scale * [zeros(size(Theta1, 1), 1) Theta1(:, 2:end)];

   Theta2_grad = 1 / m * (d3' * a2) + r2;
   Theta1_grad = 1 / m * (d2' * a1) + r1;

   % Unroll both gradients into a single column vector.
   grad = [Theta1_grad(:); Theta2_grad(:)];
 end