Starter code for 4.3, with a gradient-checking test.
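The gradient check compares the analytic gradient against a central-difference approximation, numgrad(p) = (J(w + e*e_p) - J(w - e*e_p)) / (2*e) with e = 1e-4, where e_p is the p-th standard basis vector; the assert in the descent loop below requires the two to agree to within 1e-5 in norm.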
function softmax_grad_demo_hw()
% softmax_grad_demo_hw - when complete, reproduces Figure 4.3 from Chapter 4
% of the text

    %%% load data
    [X,y] = load_data();

    %%% run gradient descent
    w = softmax_gradient_descent(X,y);

    %%% plot everything, pts and lines %%%
    plot_all(X',y,w);

    %%%%%%%%%%%%%%%%% functions %%%%%%%%%%%%%%%
    function numgrad = computeNumericalGradient(J, theta)
        % http://deeplearning.stanford.edu/wiki/index.php/UFLDL_Tutorial
        % numgrad = computeNumericalGradient(J, theta)
        % theta: a vector of parameters
        % J: a function that outputs a real number; calling y = J(theta)
        % returns the function value at theta

        % Initialize numgrad with zeros
        numgrad = zeros(size(theta));
        perturb = zeros(size(theta));
        e = 1e-4;
        for p = 1:numel(theta)
            % Set perturbation vector
            perturb(p) = e;
            loss1 = J(theta - perturb);
            loss2 = J(theta + perturb);
            % Compute numerical gradient via central difference
            numgrad(p) = (loss2 - loss1) / (2*e);
            perturb(p) = 0;
        end
    end

    function L = loss(X, y, w)
        % softmax/logistic cost; assumes X is in compact notation
        L = sum(log(1 + exp(-y.*(X'*w))));
    end
    %%% gradient descent function for softmax cost/logistic regression %%%
    function w = softmax_gradient_descent(X,y)
        %%% initialize w0 and set step length %%%
        X = [ones(size(X,1),1) X]';   % use compact notation
        w = randn(3,1);               % random initial point
        alpha = 10^-2;                % fixed step length for all iterations

        % Initializations
        iter = 1;
        max_its = 30000;
        grad = 1;
        while norm(grad) > 10^-12 && iter < max_its
            % compute gradient
            grad = % YOUR CODE GOES HERE

            % gradient checking
            % comment out the next two lines once this test passes
            costFunc = @(theta) loss(X,y,theta);
            assert(norm(grad - computeNumericalGradient(costFunc, w)) < 1e-5);

            w = w - alpha*grad;

            % update iteration count
            iter = iter + 1;
        end
    end
    %%% plots everything %%%
    function plot_all(X,y,w)
        red  = [1 0 .4];
        blue = [0 .4 1];

        % plot points
        ind = find(y == 1);
        scatter(X(1,ind),X(2,ind),'Linewidth',2,'Markeredgecolor',blue,'markerFacecolor','none');
        hold on
        ind = find(y == -1);
        scatter(X(1,ind),X(2,ind),'Linewidth',2,'Markeredgecolor',red,'markerFacecolor','none');
        hold on

        % plot separator
        s = 0:0.01:1;
        plot(s,(-w(1)-w(2)*s)/w(3),'m','linewidth',2);

        % clean up plot and add info labels
        set(gcf,'color','w');
        axis square
        box off
        axis([0 1 0 1])
        xlabel('x_1','Fontsize',14)
        ylabel('x_2 ','Fontsize',14)
        set(get(gca,'YLabel'),'Rotation',0)
    end

    %%% loads data %%%
    function [X,y] = load_data()
        data = csvread('imbalanced_2class.csv');
        X = data(:,1:end-1);
        y = data(:,end);
    end
end
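For reference, here is one possible way to fill in the grad = % YOUR CODE GOES HERE step, derived from the loss function defined above. This is a sketch, not necessarily the intended solution:

% differentiating L(w) = sum_p log(1 + exp(-y_p*x_p'*w)) gives
% grad = -sum_p y_p*x_p / (1 + exp(y_p*x_p'*w)), or, in the compact
% notation used here (X is 3 x P after the transpose, y is P x 1):
grad = -X*(y./(1 + exp(y.*(X'*w))));

Any implementation placed in the loop can be verified against computeNumericalGradient via the assert already present; a correct analytic gradient should agree with the central-difference estimate well within the 1e-5 tolerance.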