jphall663/discrim_v_pmbr.sas

## discrim_v_pmbr.sas
*** restart the ODS HTML destination so results from earlier runs are cleared;
ods html close;
ods html;

*** SYMBOLGEN shows the resolution of macro variables in the log;
*** FULLSTIMER requests detailed performance info for every step;
options symbolgen fullstimer;

*** N is the number of points generated per cluster;
%let N=5000;

*** SPREAD scales the width of each cluster and causes overlap;
%let SPREAD=1.5;

*** tuning parameters;
/* K is used by all classifiers in this script */
%let K=11;
/* BUCKETS applies to the PROC PMBR RDTREE method, higher for less approximation */
%let BUCKETS=100;
/* EPSILON applies to the PROC PMBR RDTREE method, lower for less approximation */
%let EPSILON=10;

*** generate sample data with obvious classes ********************************;
/* Simulates seven overlapping Gaussian clusters labelled id=0..6.
   For each cluster, &N points are drawn around the center (cx[id], cy[id]);
   each coordinate is a standard normal draw scaled by 2*&SPREAD.
   Even-numbered draws go to clusters_train and odd-numbered draws go to
   clusters_test, so both data sets receive half of every cluster.
   x is always drawn before y and clusters are processed in id order, so the
   seeded RANNOR stream is consumed in a fixed, reproducible sequence. */
data clusters_train clusters_test;
      drop n;

      /* cluster centers, indexed directly by id; temporary arrays add no
         variables to the output data sets */
      array cx[0:6] _temporary_ (20 15 10 5 0 15 0);
      array cy[0:6] _temporary_ (20 15 10 5 0 0 15);

      do id=0 to 6;
            do n=1 to &N;
                  x=2*rannor(12345)*&SPREAD.+cx[id];
                  y=2*rannor(12345)*&SPREAD.+cy[id];
                  /* alternate rows between the training and test sets */
                  if mod(n, 2)=0 then output clusters_train;
                  else output clusters_test;
            end;
      end;
run;

*** scatter plot of the generated test data, grouped by the true cluster id;
title 'Generated Test Data';
proc sgplot data=clusters_test;
	scatter x=x y=y / group=id;
run;

*** classify with PROC DISCRIM ***********************************************;
/* Nonparametric discriminant analysis (METHOD=NPAR with K=&K) trained on
   clusters_train; clusters_test is scored into score_discrim, where _INTO_
   holds the predicted class. */
proc discrim data=clusters_train testdata=clusters_test testout=score_discrim
             method=npar k=&K;
	class id;
	var x y;
run;

*** order rows by predicted class, intended to keep plot colors consistent;
proc sort data=score_discrim;
	by _INTO_;
run;

*** blank out the predicted label wherever it disagrees with the true id;
data score_discrim;
	set score_discrim;
	if not (_INTO_ = id) then _INTO_ = .;
run;

*** misclassification count, i.e. rows whose label was blanked above;
title 'PROC DISCRIM Misclassification';
proc sql;
	select count(*)
		from score_discrim
		where _INTO_ = .;
quit;

*** plot test data classified with PROC DISCRIM;
*** misclassified points form the missing-value group;
title 'Test Data Classified with PROC DISCRIM';
proc sgplot data=score_discrim;
	scatter x=x y=y / group=_INTO_;
run;

*** build the DMDB catalog that PROC PMBR requires ***************************;
/* The catalog is written to work._cat and describes the training data:
   id is the class target, x and y are the interval inputs. */
proc dmdb data=clusters_train dmdbcat=work._cat;
	class id;
	var x y;
	target id;
run;

*** classify with PROC PMBR SCAN method **************************************;
*** results should be similar to PROC DISCRIM;
/* Trains on clusters_train using the _cat DMDB catalog and scores
   clusters_test into score_pmbr_scan; I_id holds the predicted class as
   character text. */
proc pmbr data=clusters_train dmdbcat=_cat method=scan k=&K;
	score data=clusters_test out=score_pmbr_scan;
	target id;
	var x y;
run;

*** order rows by predicted class, intended to keep plot colors consistent;
proc sort data=score_pmbr_scan;
	by I_id;
run;

*** blank out the predicted label wherever it disagrees with the true id;
/* id is numeric and I_id is character, so id is formatted with BEST. and
   both sides are stripped before they are compared */
data score_pmbr_scan;
	set score_pmbr_scan;
	if strip(put(id, best.)) ne strip(I_id) then I_id='';
run;

*** misclassification count, i.e. rows whose label was blanked above;
title 'PROC PMBR SCAN Misclassification';
proc sql;
	select count(*)
		from score_pmbr_scan
		where I_id='';
quit;

*** plot test data classified with PROC PMBR SCAN method;
title 'Test Data Classified with PROC PMBR SCAN method';
proc sgplot data=score_pmbr_scan;
	scatter x=x y=y / group=I_id;
run;

*** PROC PMBR RDTREE METHOD **************************************************;
*** results can differ from PROC DISCRIM;
*** more efficient, but distances are calculated approximately;
*** the BUCKETS and EPSILON macro variables tune the approximation;
proc pmbr data=clusters_train dmdbcat=_cat method=rdtree k=&K
          epsilon=&EPSILON buckets=&BUCKETS;
	score data=clusters_test out=score_pmbr_rdtree;
	target id;
	var x y;
run;

*** order rows by predicted class, intended to keep plot colors consistent;
proc sort data=score_pmbr_rdtree;
	by I_id;
run;

*** blank out the predicted label wherever it disagrees with the true id;
/* id is numeric and I_id is character, so id is formatted with BEST. and
   both sides are stripped before they are compared */
data score_pmbr_rdtree;
	set score_pmbr_rdtree;
	if strip(put(id, best.)) ne strip(I_id) then I_id='';
run;

*** misclassification count, i.e. rows whose label was blanked above;
title 'PROC PMBR RDTREE Misclassification';
proc sql;
	select count(*)
		from score_pmbr_rdtree
		where I_id='';
quit;

*** plot test data classified with PROC PMBR RDTREE method;
*** the second title line records the tuning values used for this run;
title 'Test Data Classified with PROC PMBR RDTREE method';
title2 "BUCKETS=&BUCKETS EPSILON=&EPSILON";
proc sgplot data=score_pmbr_rdtree;
	scatter x=x y=y / group=I_id;
run;

*** clear the titles so they do not carry over into later output;
title;
title2;
	*** restart the ODS HTML destination so results from earlier runs are cleared;
	ods html close;
	ods html;

	*** SYMBOLGEN shows the resolution of macro variables in the log;
	*** FULLSTIMER requests detailed performance info for every step;
	options symbolgen fullstimer;

	*** N is the number of points generated per cluster;
	%let N=5000;

	*** SPREAD scales the width of each cluster and causes overlap;
	%let SPREAD=1.5;

	*** tuning parameters;
	/* K is used by all classifiers in this script */
	%let K=11;
	/* BUCKETS applies to the PROC PMBR RDTREE method, higher for less approximation */
	%let BUCKETS=100;
	/* EPSILON applies to the PROC PMBR RDTREE method, lower for less approximation */
	%let EPSILON=10;

	*** generate sample data with obvious classes ****************************;
	/* Simulates seven overlapping Gaussian clusters labelled id=0..6.
	   Every cluster draws &N points; each coordinate is a standard normal
	   draw scaled by 2*&SPREAD and shifted to the cluster center.
	   Even-numbered draws go to clusters_train, odd-numbered draws go to
	   clusters_test.
	   The multiplication operators in the x= and y= assignments are required:
	   without them the expressions are not valid SAS. */
	data clusters_train clusters_test;
		drop n;

		/* cluster 0, centered at (20, 20) */
		id=0;
		do n=1 to &N;
			x=2*rannor(12345)*&SPREAD.+20;
			y=2*rannor(12345)*&SPREAD.+20;
			if mod(n, 2)=0 then output clusters_train;
			else output clusters_test;
		end;

		/* cluster 1, centered at (15, 15) */
		id=id + 1;
		do n=1 to &N;
			x=2*rannor(12345)*&SPREAD.+15;
			y=2*rannor(12345)*&SPREAD.+15;
			if mod(n, 2)=0 then output clusters_train;
			else output clusters_test;
		end;

		/* cluster 2, centered at (10, 10) */
		id=id + 1;
		do n=1 to &N;
			x=2*rannor(12345)*&SPREAD.+10;
			y=2*rannor(12345)*&SPREAD.+10;
			if mod(n, 2)=0 then output clusters_train;
			else output clusters_test;
		end;

		/* cluster 3, centered at (5, 5) */
		id=id + 1;
		do n=1 to &N;
			x=2*rannor(12345)*&SPREAD.+5;
			y=2*rannor(12345)*&SPREAD.+5;
			if mod(n, 2)=0 then output clusters_train;
			else output clusters_test;
		end;

		/* cluster 4, centered at the origin */
		id=id + 1;
		do n=1 to &N;
			x=2*rannor(12345)*&SPREAD.;
			y=2*rannor(12345)*&SPREAD.;
			if mod(n, 2)=0 then output clusters_train;
			else output clusters_test;
		end;

		/* cluster 5, centered at (15, 0) */
		id=id + 1;
		do n=1 to &N;
			x=2*rannor(12345)*&SPREAD.+15;
			y=2*rannor(12345)*&SPREAD.;
			if mod(n, 2)=0 then output clusters_train;
			else output clusters_test;
		end;

		/* cluster 6, centered at (0, 15) */
		id=id + 1;
		do n=1 to &N;
			x=2*rannor(12345)*&SPREAD.;
			y=2*rannor(12345)*&SPREAD.+15;
			if mod(n, 2)=0 then output clusters_train;
			else output clusters_test;
		end;
	run;

	*** scatter plot of the generated test data, grouped by the true cluster id;
	title 'Generated Test Data';
	proc sgplot data=clusters_test;
		scatter x=x y=y / group=id;
	run;

	*** classify with PROC DISCRIM *******************************************;
	/* Nonparametric discriminant analysis (METHOD=NPAR with K=&K) trained on
	   clusters_train; clusters_test is scored into score_discrim, where
	   _INTO_ holds the predicted class. */
	proc discrim data=clusters_train testdata=clusters_test testout=score_discrim
	             method=npar k=&K;
		class id;
		var x y;
	run;

	*** order rows by predicted class, intended to keep plot colors consistent;
	proc sort data=score_discrim;
		by _INTO_;
	run;

	*** blank out the predicted label wherever it disagrees with the true id;
	data score_discrim;
		set score_discrim;
		if not (_INTO_ = id) then _INTO_ = .;
	run;

	*** misclassification count, i.e. rows whose label was blanked above;
	title 'PROC DISCRIM Misclassification';
	proc sql;
		select count(*)
			from score_discrim
			where _INTO_ = .;
	quit;

	*** plot test data classified with PROC DISCRIM;
	*** misclassified points form the missing-value group;
	title 'Test Data Classified with PROC DISCRIM';
	proc sgplot data=score_discrim;
		scatter x=x y=y / group=_INTO_;
	run;

	*** build the DMDB catalog that PROC PMBR requires ***********************;
	/* The catalog is written to work._cat and describes the training data:
	   id is the class target, x and y are the interval inputs. */
	proc dmdb data=clusters_train dmdbcat=work._cat;
		class id;
		var x y;
		target id;
	run;

	*** classify with PROC PMBR SCAN method **********************************;
	*** results should be similar to PROC DISCRIM;
	/* Trains on clusters_train using the _cat DMDB catalog and scores
	   clusters_test into score_pmbr_scan; I_id holds the predicted class as
	   character text. */
	proc pmbr data=clusters_train dmdbcat=_cat method=scan k=&K;
		score data=clusters_test out=score_pmbr_scan;
		target id;
		var x y;
	run;

	*** order rows by predicted class, intended to keep plot colors consistent;
	proc sort data=score_pmbr_scan;
		by I_id;
	run;

	*** blank out the predicted label wherever it disagrees with the true id;
	/* id is numeric and I_id is character, so id is formatted with BEST. and
	   both sides are stripped before they are compared */
	data score_pmbr_scan;
		set score_pmbr_scan;
		if strip(put(id, best.)) ne strip(I_id) then I_id='';
	run;

	*** misclassification count, i.e. rows whose label was blanked above;
	title 'PROC PMBR SCAN Misclassification';
	proc sql;
		select count(*)
			from score_pmbr_scan
			where I_id='';
	quit;

	*** plot test data classified with PROC PMBR SCAN method;
	title 'Test Data Classified with PROC PMBR SCAN method';
	proc sgplot data=score_pmbr_scan;
		scatter x=x y=y / group=I_id;
	run;

	*** PROC PMBR RDTREE METHOD **********************************************;
	*** results can differ from PROC DISCRIM;
	*** more efficient, but distances are calculated approximately;
	*** the BUCKETS and EPSILON macro variables tune the approximation;
	proc pmbr data=clusters_train dmdbcat=_cat method=rdtree k=&K
	          epsilon=&EPSILON buckets=&BUCKETS;
		score data=clusters_test out=score_pmbr_rdtree;
		target id;
		var x y;
	run;

	*** order rows by predicted class, intended to keep plot colors consistent;
	proc sort data=score_pmbr_rdtree;
		by I_id;
	run;

	*** blank out the predicted label wherever it disagrees with the true id;
	/* id is numeric and I_id is character, so id is formatted with BEST. and
	   both sides are stripped before they are compared */
	data score_pmbr_rdtree;
		set score_pmbr_rdtree;
		if strip(put(id, best.)) ne strip(I_id) then I_id='';
	run;

	*** misclassification count, i.e. rows whose label was blanked above;
	title 'PROC PMBR RDTREE Misclassification';
	proc sql;
		select count(*)
			from score_pmbr_rdtree
			where I_id='';
	quit;

	*** plot test data classified with PROC PMBR RDTREE method;
	*** the second title line records the tuning values used for this run;
	title 'Test Data Classified with PROC PMBR RDTREE method';
	title2 "BUCKETS=&BUCKETS EPSILON=&EPSILON";
	proc sgplot data=score_pmbr_rdtree;
		scatter x=x y=y / group=I_id;
	run;

	*** clear the titles so they do not carry over into later output;
	title;
	title2;