Skip to content

Instantly share code, notes, and snippets.

@jphall663
Last active August 29, 2015 14:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jphall663/661334961ca41b29adfb to your computer and use it in GitHub Desktop.
Save jphall663/661334961ca41b29adfb to your computer and use it in GitHub Desktop.
PROC DISCRIM vs. the MBR node in Enterprise Miner
*** start from a fresh ODS HTML destination so earlier results are discarded;
ods html close;
ods html;
*** log settings - show macro variable resolution and detailed performance info;
options symbolgen fullstimer;
*** number of points generated for each cluster;
%let N = 5000;
*** spread of each cluster, larger values cause the clusters to overlap;
%let SPREAD = 1.5;
*** tuning parameters;
%let K = 11;          /* k= value passed to PROC DISCRIM and to both PROC PMBR runs */
%let BUCKETS = 100;   /* PROC PMBR RDTREE method only: higher for less approximation */
%let EPSILON = 10;    /* PROC PMBR RDTREE method only: lower for less approximation */
*** generate sample data with obvious classes ********************************;
*** seven clusters labelled id 0 to 6, each drawn around its own center;
*** within a cluster, even-numbered draws go to train and odd-numbered to test;
data clusters_train clusters_test;
  /* Cluster centers indexed by class label id.                            */
  /* Temporary arrays are not written to the output data sets, so the      */
  /* outputs still contain only id, x and y.                               */
  /* To add a cluster, extend both lists and the upper bound of the id loop. */
  array center_x[0:6] _temporary_ (20 15 10 5 0 15 0);
  array center_y[0:6] _temporary_ (20 15 10 5 0 0 15);
  drop n;
  do id=0 to 6;
    do n=1 to &N;
      /* x is drawn before y for every point; both calls pass the seed 12345 */
      x=2*rannor(12345)*&SPREAD.+center_x[id];
      y=2*rannor(12345)*&SPREAD.+center_y[id];
      /* alternate rows between the two outputs for an even train/test split */
      if mod(n, 2)=0 then output clusters_train;
      else output clusters_test;
    end;
  end;
run;
*** scatter plot of the generated test points, grouped by their true cluster id;
title 'Generated Test Data';
proc sgplot data=clusters_test;
  scatter x=x y=y / group=id;
run;
*** classify with PROC DISCRIM ***********************************************;
*** nonparametric method, k taken from the K macro variable;
*** fit on clusters_train, then clusters_test is scored into score_discrim;
proc discrim data=clusters_train testdata=clusters_test testout=score_discrim
             method=npar k=&K;
  class id;
  var x y;
run;
*** order rows by predicted class for plot color consistency;
proc sort data=score_discrim;
  by _INTO_;
run;
*** set the prediction to missing wherever it disagrees with the true class;
data score_discrim;
  set score_discrim;
  if _INTO_ ne id then call missing(_INTO_);
run;
*** count of test points left without a label, reported as misclassification;
title 'PROC DISCRIM Misclassification';
proc sql;
  select count(*)
    from score_discrim
    where _INTO_ = .;
quit;
*** plot the test data grouped by the PROC DISCRIM prediction;
title 'Test Data Classified with PROC DISCRIM';
proc sgplot data=score_discrim;
  scatter x=x y=y / group=_INTO_;
run;
*** build the DMDB catalog that PROC PMBR requires ***************************;
*** the catalog is written to work._cat and is read by both PMBR runs below;
proc dmdb data=clusters_train dmdbcat=work._cat;
  class id;
  var x y;
  target id;
run;
*** classify with PROC PMBR SCAN method **************************************;
*** results should be similar to PROC DISCRIM;
*** clusters_test is scored into score_pmbr_scan;
proc pmbr data=clusters_train dmdbcat=_cat method=scan k=&K;
  score data=clusters_test out=score_pmbr_scan;
  target id;
  var x y;
run;
*** order rows by predicted class for plot color consistency;
proc sort data=score_pmbr_scan;
  by I_id;
run;
*** blank the prediction wherever it disagrees with the true class;
*** id is converted to text first because the comparison is done on strings;
data score_pmbr_scan;
  set score_pmbr_scan;
  if strip(I_id) ne strip(put(id, best.)) then call missing(I_id);
run;
*** count of test points left without a label, reported as misclassification;
title 'PROC PMBR SCAN Misclassification';
proc sql;
  select count(*)
    from score_pmbr_scan
    where I_id = '';
quit;
*** plot the test data grouped by the PROC PMBR SCAN prediction;
title 'Test Data Classified with PROC PMBR SCAN method';
proc sgplot data=score_pmbr_scan;
  scatter x=x y=y / group=I_id;
run;
*** PROC PMBR RDTREE METHOD **************************************************;
*** results can differ from PROC DISCRIM;
*** this method is more efficient but calculates distances approximately;
*** adjust the BUCKETS and EPSILON macro variables to tune the approximation;
*** clusters_test is scored into score_pmbr_rdtree;
proc pmbr data=clusters_train dmdbcat=_cat method=rdtree k=&K
          epsilon=&EPSILON buckets=&BUCKETS;
  score data=clusters_test out=score_pmbr_rdtree;
  target id;
  var x y;
run;
*** order rows by predicted class for plot color consistency;
proc sort data=score_pmbr_rdtree;
  by I_id;
run;
*** blank the prediction wherever it disagrees with the true class;
*** id is converted to text first because the comparison is done on strings;
data score_pmbr_rdtree;
  set score_pmbr_rdtree;
  if strip(I_id) ne strip(put(id, best.)) then call missing(I_id);
run;
*** count of test points left without a label, reported as misclassification;
title 'PROC PMBR RDTREE Misclassification';
proc sql;
  select count(*)
    from score_pmbr_rdtree
    where I_id = '';
quit;
*** plot the test data grouped by the PROC PMBR RDTREE prediction;
*** the second title line records the tuning values used for this run;
title 'Test Data Classified with PROC PMBR RDTREE method';
title2 "BUCKETS=&BUCKETS EPSILON=&EPSILON";
proc sgplot data=score_pmbr_rdtree;
  scatter x=x y=y / group=I_id;
run;
*** clear the titles so they do not carry over to later output;
title;
title2;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment