Created
June 23, 2010 13:50
-
-
Save mutolisp/449933 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
% Q1: Fit a maximum likelihood model to cetacean entanglement
% problem in homework 14 and use AIC to determine the most
% appropriate model (i.e. compare models:
% 1. Full model (with beta0 beta1 beta2 beta3)
% 2. Model without beta1,
% 3. Model without beta2,
% 4. Model without beta3
% 5. Full model adding interaction term: x*y.
clear;clc;
%% 0. data loading and preprocessing
% Columns used below (per the original notes): col 2 = sea state,
% col 3 = number of pingers, col 4 = entanglement indicator.
cetacean=load('cetaceancorrect.dat.txt');
% Number of observations. The original used length(cetacean)(:,1),
% which indexes a SCALAR (length returns one number) and only happens
% to work in Octave; size(...,1) states the intent directly.
n=size(cetacean,1);
% Intercept column sized from the data rather than hard-coded to 1000.
x0=ones(n,1);
% x: sea state recoded to a binary rough/calm flag --
% calm seas (value 0 to 3) -> 0, rough seas (value >= 4) -> 1.
x=double(cetacean(:,2)>=4);
% y: number of pingers (n x 1)
y=cetacean(:,3);
% z: entanglement indicator as a ROW vector (1 x n); the likelihood
% below is written as z*M*B, which requires the row orientation.
z=cetacean(:,4)';
%% Model selection (Q1)
% Negative log-likelihood of a logistic regression: for a design
% matrix M (n x p), coefficient vector B (p x 1) and 0/1 row-vector
% response z (1 x n),
%   -log L = sum(log(1 + exp(M*B))) - z*M*B.
% One function serves every candidate model -- the argument names are
% only positional placeholders. NOTE: the original also defined
% model2..model5 with the B/M argument order SWAPPED ('B2','M2','z');
% they were never called, and would have bound the design matrix to
% the coefficient slot if they ever were, so they are removed.
model1=inline('sum( log( 1 + exp(M1*B1) ) ) - z*M1*B1 ','M1','B1','z');
% model 1: full model, beta0+beta1+beta2+beta3 (design 1000x4)
M1=[x0,x,y,y.^2];
[m1b1, logLik1]=fminsearch(@(B1) model1(M1,B1,z), zeros(4,1));
% model 2: beta0+beta2+beta3 (drop the sea-state term)
M2=[x0,y,y.^2];
[m2b2, logLik2]=fminsearch(@(B2) model1(M2,B2,z), zeros(3,1));
% model 3: beta0+beta1+beta3 (drop the linear pinger term)
M3=[x0,x,y.^2];
[m3b3, logLik3]=fminsearch(@(B3) model1(M3,B3,z), zeros(3,1));
% model 4: beta0+beta1+beta2 (drop the quadratic pinger term)
M4=[x0,x,y];
[m4b4, logLik4]=fminsearch(@(B4) model1(M4,B4,z), zeros(3,1));
% model 5: full model plus the interaction term x.*y (design 1000x5)
M5=[x0,x,y,y.^2,x.*y];
[m5b5, logLik5]=fminsearch(@(B5) model1(M5,B5,z), zeros(5,1));
% logLik1..logLik5 are fminsearch's minimum values, i.e. the
% minimized NEGATIVE log-likelihoods (positive numbers).
%% Plot the probability of entanglement, pi, as a function of number
% of pingers. Report -log(likelihood) and AIC for each model, and
% pick the best model by AIC.
%% 1.1 fitted probabilities via the logistic link
% pi = exp(eta)/(1+exp(eta)) = 1/(1+exp(-eta)) with eta = M*beta;
% the 1/(1+exp(-eta)) form cannot overflow for large positive eta.
X1=M1*m1b1;
X2=M2*m2b2;
X3=M3*m3b3;
X4=M4*m4b4;
X5=M5*m5b5;
pi1=1./(1+exp(-X1));
pi2=1./(1+exp(-X2));
pi3=1./(1+exp(-X3));
pi4=1./(1+exp(-X4));
pi5=1./(1+exp(-X5));
PI=[pi1, pi2, pi3, pi4, pi5];
% one panel per model: fitted probability against number of pingers
figure(1)
for i=1:5
subplot(5,1,i)
plot(y,PI(:,i),'r.')
ylabel('\pi')
end
xlabel('Number of pingers')
print('../figures/fig1.pdf', '-dpdf')
%% 1.2 report -log(L)
% fminsearch minimized the negative log-likelihood, so its second
% output already IS -log(L). The original negated it AGAIN, printing
% (negative) log-likelihoods under the label "-log(L)".
nlogL=[logLik1, logLik2, logLik3, logLik4, logLik5]
%% 1.3 AIC = 2P - 2*log(L) = 2P + 2*(-log(L)),
% where P is the number of fitted parameters in each model
P=[length(m1b1), length(m2b2), length(m3b3), length(m4b4), length(m5b5)];
AIC=2*P+2*nlogL
% the model with the minimum AIC value is the best model
[best_aic, model_id] = min(AIC);
%% 2. Use cross-validation to choose the most appropriate model.
% Fit each model on 80% of the data and evaluate the likelihood of
% the held-out 20%, rotating through 5 non-overlapping folds. Fold i
% holds out rows (i-1)*200+1 : i*200; the remaining 800 rows are the
% training set. (The original fitted the 80% but never evaluated the
% held-out 20%, which the exercise explicitly requires.)
%
% Outputs:
%   cm1b1..cm5b5 - parameter estimates, one column per fold
%   ctrainLogL   - 5x5 training log-likelihoods (models x folds)
%   ctestLogL    - 5x5 held-out log-likelihoods (models x folds)
%   cAIC         - AIC of each training fit (models x folds)
%   cvLogL       - CV estimate of log-likelihood per model
% set groups for training (rows retained for fitting in each fold)
data=[x,y,z'];
g1=data(201:end,:);
g2=data([1:200,401:end],:);
g3=data([1:400,601:end],:);
g4=data([1:600,801:end],:);
g5=data(1:800,:);
% combine groups into a 3-dimensional array (800 x 3 x 5)
g_all=cat(3,g1,g2,g3,g4,g5);
%% Model selection
cx0=ones(800,1);  % intercept column for the 800 training rows
tx0=ones(200,1);  % intercept column for the 200 held-out rows
for i=1:5
% training predictors/response for fold i
gx=g_all(:,1,i);
gy=g_all(:,2,i);
gy2=gy.^2;
gz=g_all(:,3,i)';
% held-out 20% for fold i
test=(i-1)*200+1 : i*200;
tx=x(test); ty=y(test); ty2=ty.^2; tz=z(test);
% model 1: beta0+beta1+beta2+beta3
CM1=[cx0,gx,gy,gy2];
[cm1b1(:,i), clogLik1(1,i)]=fminsearch(@(B) model1(CM1,B,gz), zeros(4,1));
tlogLik1(1,i)=-model1([tx0,tx,ty,ty2],cm1b1(:,i),tz);
% model 2: beta0+beta2+beta3
CM2=[cx0,gy,gy2];
[cm2b2(:,i), clogLik2(1,i)]=fminsearch(@(B) model1(CM2,B,gz), zeros(3,1));
tlogLik2(1,i)=-model1([tx0,ty,ty2],cm2b2(:,i),tz);
% model 3: beta0+beta1+beta3
CM3=[cx0,gx,gy2];
[cm3b3(:,i), clogLik3(1,i)]=fminsearch(@(B) model1(CM3,B,gz), zeros(3,1));
tlogLik3(1,i)=-model1([tx0,tx,ty2],cm3b3(:,i),tz);
% model 4: beta0+beta1+beta2
CM4=[cx0,gx,gy];
[cm4b4(:,i), clogLik4(1,i)]=fminsearch(@(B) model1(CM4,B,gz), zeros(3,1));
tlogLik4(1,i)=-model1([tx0,tx,ty],cm4b4(:,i),tz);
% model 5: full model plus interaction x.*y
CM5=[cx0,gx,gy,gy2,gx.*gy];
[cm5b5(:,i), clogLik5(1,i)]=fminsearch(@(B) model1(CM5,B,gz), zeros(5,1));
tlogLik5(1,i)=-model1([tx0,tx,ty,ty2,tx.*ty],cm5b5(:,i),tz);
end
% fminsearch returns the minimized NEGATIVE log-likelihood, so negate
% once to obtain training log-likelihoods (models x folds)
ctrainLogL=-[clogLik1;clogLik2;clogLik3;clogLik4;clogLik5];
% held-out (20%) log-likelihoods (models x folds)
ctestLogL=[tlogLik1;tlogLik2;tlogLik3;tlogLik4;tlogLik5];
% parameter counts per model as a COLUMN so they vary down the rows
% (models) of cAIC -- the original used repmat(cP,[5,1]), which varied
% them across the columns (folds) and paired every model with the
% wrong parameter count
cP=[size(cm1b1,1);size(cm2b2,1);size(cm3b3,1);size(cm4b4,1);size(cm5b5,1)];
cAIC=-2*ctrainLogL+2*repmat(cP,[1,5])
% cross-validation estimate of the log-likelihood of each model:
% total held-out log-likelihood over the 5 folds; best model maximizes it
cvLogL=sum(ctestLogL,2)
[cbest_cv, cv_model_id]=max(cvLogL)
% training-based AIC comparison, averaged over folds; smallest wins
[cbest_aic, cmodel_id]=min(mean(cAIC,2))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment