Created
June 23, 2010 13:50
-
-
Save mutolisp/449933 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
% Q1: Fit a maximum likelihood model to cetacean entanglement
% problem in homework 14 and use AIC to determine the most
% appropriate model (i.e. compare models:
% 1. Full model (with beta0 beta1 beta2 beta3)
% 2. Model without beta1,
% 3. Model without beta2,
% 4. Model without beta3
% 5. Full model adding interaction term: x*y.
clear;clc;
%% 0. data loading and preprocessing
% Columns used below (per the original notes): col 2 = sea state,
% col 3 = number of pingers, col 4 = entanglement indicator.
cetacean=load('cetaceancorrect.dat.txt');
% Number of observations. The original used length(cetacean)(:,1),
% which indexes a SCALAR (length returns one number) and only happens
% to work in Octave; size(...,1) states the intent directly.
n=size(cetacean,1);
% Intercept column sized from the data rather than hard-coded to 1000.
x0=ones(n,1);
% x: sea state recoded to a binary rough/calm flag --
% calm seas (value 0 to 3) -> 0, rough seas (value >= 4) -> 1.
x=double(cetacean(:,2)>=4);
% y: number of pingers (n x 1)
y=cetacean(:,3);
% z: entanglement indicator as a ROW vector (1 x n); the likelihood
% below is written as z*M*B, which requires the row orientation.
z=cetacean(:,4)';
%% Model selection (Q1)
% Negative log-likelihood of a logistic regression: for a design
% matrix M (n x p), coefficient vector B (p x 1) and 0/1 row-vector
% response z (1 x n),
%   -log L = sum(log(1 + exp(M*B))) - z*M*B.
% One function serves every candidate model -- the argument names are
% only positional placeholders. NOTE: the original also defined
% model2..model5 with the B/M argument order SWAPPED ('B2','M2','z');
% they were never called, and would have bound the design matrix to
% the coefficient slot if they ever were, so they are removed.
model1=inline('sum( log( 1 + exp(M1*B1) ) ) - z*M1*B1 ','M1','B1','z');
% model 1: full model, beta0+beta1+beta2+beta3 (design 1000x4)
M1=[x0,x,y,y.^2];
[m1b1, logLik1]=fminsearch(@(B1) model1(M1,B1,z), zeros(4,1));
% model 2: beta0+beta2+beta3 (drop the sea-state term)
M2=[x0,y,y.^2];
[m2b2, logLik2]=fminsearch(@(B2) model1(M2,B2,z), zeros(3,1));
% model 3: beta0+beta1+beta3 (drop the linear pinger term)
M3=[x0,x,y.^2];
[m3b3, logLik3]=fminsearch(@(B3) model1(M3,B3,z), zeros(3,1));
% model 4: beta0+beta1+beta2 (drop the quadratic pinger term)
M4=[x0,x,y];
[m4b4, logLik4]=fminsearch(@(B4) model1(M4,B4,z), zeros(3,1));
% model 5: full model plus the interaction term x.*y (design 1000x5)
M5=[x0,x,y,y.^2,x.*y];
[m5b5, logLik5]=fminsearch(@(B5) model1(M5,B5,z), zeros(5,1));
% logLik1..logLik5 are fminsearch's minimum values, i.e. the
% minimized NEGATIVE log-likelihoods (positive numbers).
%% Plot the probability of entanglement, pi, as a function of number
% of pingers. Report -log(likelihood) and AIC for each model, and
% pick the best model by AIC.
%% 1.1 fitted probabilities via the logistic link
% pi = exp(eta)/(1+exp(eta)) = 1/(1+exp(-eta)) with eta = M*beta;
% the 1/(1+exp(-eta)) form cannot overflow for large positive eta.
X1=M1*m1b1;
X2=M2*m2b2;
X3=M3*m3b3;
X4=M4*m4b4;
X5=M5*m5b5;
pi1=1./(1+exp(-X1));
pi2=1./(1+exp(-X2));
pi3=1./(1+exp(-X3));
pi4=1./(1+exp(-X4));
pi5=1./(1+exp(-X5));
PI=[pi1, pi2, pi3, pi4, pi5];
% one panel per model: fitted probability against number of pingers
figure(1)
for i=1:5
subplot(5,1,i)
plot(y,PI(:,i),'r.')
ylabel('\pi')
end
xlabel('Number of pingers')
print('../figures/fig1.pdf', '-dpdf')
%% 1.2 report -log(L)
% fminsearch minimized the negative log-likelihood, so its second
% output already IS -log(L). The original negated it AGAIN, printing
% (negative) log-likelihoods under the label "-log(L)".
nlogL=[logLik1, logLik2, logLik3, logLik4, logLik5]
%% 1.3 AIC = 2P - 2*log(L) = 2P + 2*(-log(L)),
% where P is the number of fitted parameters in each model
P=[length(m1b1), length(m2b2), length(m3b3), length(m4b4), length(m5b5)];
AIC=2*P+2*nlogL
% the model with the minimum AIC value is the best model
[best_aic, model_id] = min(AIC);
%% 2. Use cross-validation to choose the most appropriate model.
% Fit each model on 80% of the data and evaluate the likelihood of
% the held-out 20%, rotating through 5 non-overlapping folds. Fold i
% holds out rows (i-1)*200+1 : i*200; the remaining 800 rows are the
% training set. (The original fitted the 80% but never evaluated the
% held-out 20%, which the exercise explicitly requires.)
%
% Outputs:
%   cm1b1..cm5b5 - parameter estimates, one column per fold
%   ctrainLogL   - 5x5 training log-likelihoods (models x folds)
%   ctestLogL    - 5x5 held-out log-likelihoods (models x folds)
%   cAIC         - AIC of each training fit (models x folds)
%   cvLogL       - CV estimate of log-likelihood per model
% set groups for training (rows retained for fitting in each fold)
data=[x,y,z'];
g1=data(201:end,:);
g2=data([1:200,401:end],:);
g3=data([1:400,601:end],:);
g4=data([1:600,801:end],:);
g5=data(1:800,:);
% combine groups into a 3-dimensional array (800 x 3 x 5)
g_all=cat(3,g1,g2,g3,g4,g5);
%% Model selection
cx0=ones(800,1);  % intercept column for the 800 training rows
tx0=ones(200,1);  % intercept column for the 200 held-out rows
for i=1:5
% training predictors/response for fold i
gx=g_all(:,1,i);
gy=g_all(:,2,i);
gy2=gy.^2;
gz=g_all(:,3,i)';
% held-out 20% for fold i
test=(i-1)*200+1 : i*200;
tx=x(test); ty=y(test); ty2=ty.^2; tz=z(test);
% model 1: beta0+beta1+beta2+beta3
CM1=[cx0,gx,gy,gy2];
[cm1b1(:,i), clogLik1(1,i)]=fminsearch(@(B) model1(CM1,B,gz), zeros(4,1));
tlogLik1(1,i)=-model1([tx0,tx,ty,ty2],cm1b1(:,i),tz);
% model 2: beta0+beta2+beta3
CM2=[cx0,gy,gy2];
[cm2b2(:,i), clogLik2(1,i)]=fminsearch(@(B) model1(CM2,B,gz), zeros(3,1));
tlogLik2(1,i)=-model1([tx0,ty,ty2],cm2b2(:,i),tz);
% model 3: beta0+beta1+beta3
CM3=[cx0,gx,gy2];
[cm3b3(:,i), clogLik3(1,i)]=fminsearch(@(B) model1(CM3,B,gz), zeros(3,1));
tlogLik3(1,i)=-model1([tx0,tx,ty2],cm3b3(:,i),tz);
% model 4: beta0+beta1+beta2
CM4=[cx0,gx,gy];
[cm4b4(:,i), clogLik4(1,i)]=fminsearch(@(B) model1(CM4,B,gz), zeros(3,1));
tlogLik4(1,i)=-model1([tx0,tx,ty],cm4b4(:,i),tz);
% model 5: full model plus interaction x.*y
CM5=[cx0,gx,gy,gy2,gx.*gy];
[cm5b5(:,i), clogLik5(1,i)]=fminsearch(@(B) model1(CM5,B,gz), zeros(5,1));
tlogLik5(1,i)=-model1([tx0,tx,ty,ty2,tx.*ty],cm5b5(:,i),tz);
end
% fminsearch returns the minimized NEGATIVE log-likelihood, so negate
% once to obtain training log-likelihoods (models x folds)
ctrainLogL=-[clogLik1;clogLik2;clogLik3;clogLik4;clogLik5];
% held-out (20%) log-likelihoods (models x folds)
ctestLogL=[tlogLik1;tlogLik2;tlogLik3;tlogLik4;tlogLik5];
% parameter counts per model as a COLUMN so they vary down the rows
% (models) of cAIC -- the original used repmat(cP,[5,1]), which varied
% them across the columns (folds) and paired every model with the
% wrong parameter count
cP=[size(cm1b1,1);size(cm2b2,1);size(cm3b3,1);size(cm4b4,1);size(cm5b5,1)];
cAIC=-2*ctrainLogL+2*repmat(cP,[1,5])
% cross-validation estimate of the log-likelihood of each model:
% total held-out log-likelihood over the 5 folds; best model maximizes it
cvLogL=sum(ctestLogL,2)
[cbest_cv, cv_model_id]=max(cvLogL)
% training-based AIC comparison, averaged over folds; smallest wins
[cbest_aic, cmodel_id]=min(mean(cAIC,2))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment