Last active
March 7, 2019 03:25
-
-
Save aidancbrady/dccfe27f22c287a72ca5e128d14c9c13 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.File; | |
import java.lang.reflect.Field; | |
import java.text.DecimalFormat; | |
import java.util.Arrays; | |
import weka.attributeSelection.AttributeSelection; | |
import weka.attributeSelection.InfoGainAttributeEval; | |
import weka.attributeSelection.Ranker; | |
import weka.classifiers.Classifier; | |
import weka.classifiers.Evaluation; | |
import weka.classifiers.functions.MultilayerPerceptron; | |
import weka.clusterers.EM; | |
import weka.clusterers.SimpleKMeans; | |
import weka.core.Instance; | |
import weka.core.Instances; | |
import weka.core.converters.ArffSaver; | |
import weka.core.converters.ConverterUtils.DataSource; | |
import weka.filters.Filter; | |
import weka.filters.unsupervised.attribute.AddCluster; | |
import weka.filters.unsupervised.attribute.IndependentComponents; | |
import weka.filters.unsupervised.attribute.PrincipalComponents; | |
import weka.filters.unsupervised.attribute.RandomProjection; | |
import weka.filters.unsupervised.instance.Resample; | |
public class UnsupervisedLearning | |
{ | |
private static final DecimalFormat format = new DecimalFormat("#.00"); | |
private static Instances cancer = getData("/Documents/Georgia Tech/Spring 2019/cs4641/Datasets/breast-cancer.arff"); | |
private static Instances phishing = getData("/Documents/Georgia Tech/Spring 2019/cs4641/Datasets/phishing-websites.arff"); | |
private static SimpleKMeans kMeans = new SimpleKMeans(); | |
private static EM em = new EM(); | |
public static void main(String[] args) throws Exception { | |
//runClusterCountTest(cancer); | |
//runClusterCountTest(phishing); | |
//runTimeTest(); | |
runPCATest(); | |
//runICATest(); | |
//runRPTest(); | |
//runIGTest(); | |
//runNNTest(); | |
//runNNClusteringTest(cancer, 3, 4, new String[] {"-L", "0.3", "-M", "0.2", "-N", "300", "-V", "0", "-S", "0", "-E", "20", "-H", "a,1"}); | |
//runNNClusteringTest(phishing, 4, 15, new String[] {"-L", "0.3", "-M", "0.2", "-N", "200", "-V", "0", "-S", "0", "-E", "20", "-H", "a,5"}); | |
} | |
public static void runClusterCountTest(Instances data) throws Exception { | |
System.out.println("== K MEANS 1-10 CLUSTER TEST"); | |
for(int i = 1; i <= 10; i++) { | |
kMeans.setNumClusters(i); | |
kMeans.buildClusterer(data); | |
System.out.println(kMeans.getSquaredError()); | |
} | |
System.out.println(); | |
System.out.println("== EM 1-10 CLUSTER TEST"); | |
for(int i = 1; i <= 10; i++) { | |
em.setNumClusters(i); | |
em.buildClusterer(data); | |
System.out.println(getLogLikelihood(data, em)); | |
} | |
//em.setNumClusters(-1); | |
//em.buildClusterer(cancer); | |
//System.out.println("SELECTED: " + em.numberOfClusters()); | |
} | |
public static void runTimeTest() throws Exception { | |
kMeans.setNumClusters(4); | |
em.setNumClusters(4); | |
startRecording("Cancer KM"); | |
kMeans.setNumClusters(3); | |
kMeans.buildClusterer(cancer); | |
stopRecording(); | |
startRecording("Phishing KM"); | |
kMeans.setNumClusters(4); | |
kMeans.buildClusterer(phishing); | |
stopRecording(); | |
startRecording("Cancer EM"); | |
em.setNumClusters(3); | |
em.buildClusterer(cancer); | |
stopRecording(); | |
startRecording("Phishing EM"); | |
em.setNumClusters(4); | |
em.buildClusterer(phishing); | |
stopRecording(); | |
} | |
public static void runPCATest() throws Exception { | |
/*for(int i = 1; i <= 9; i+= 2)*/ { | |
System.out.println("Cancer test " + 4); | |
PrincipalComponents pca = new PrincipalComponents(); | |
pca.setMaximumAttributes(4); | |
pca.setInputFormat(cancer); | |
Instances ret = Filter.useFilter(cancer, pca); | |
runClusterCountTest(ret); | |
} | |
/*for(int i = 6; i <= 30; i+= 6)*/ { | |
System.out.println("Phishing test " + 15); | |
PrincipalComponents pca = new PrincipalComponents(); | |
pca.setMaximumAttributes(15); | |
pca.setInputFormat(phishing); | |
Instances ret = Filter.useFilter(phishing, pca); | |
runClusterCountTest(ret); | |
} | |
} | |
public static void runICATest() throws Exception { | |
//cancer.setClassIndex(cancer.numAttributes()-1); | |
//phishing.setClassIndex(phishing.numAttributes()-1); | |
System.out.println("Cancer test"); | |
{ | |
IndependentComponents ica = new IndependentComponents(); | |
ica.setInputFormat(cancer); | |
ica.setOutputNumAtts(4); | |
Instances ret = Filter.useFilter(cancer, ica); | |
writeToFile(ret, getHomeDirectory() + "/Documents/Georgia Tech/Spring 2019/cs4641/Assignment 3/Submission/breast-cancer-ica.arff"); | |
runClusterCountTest(ret); | |
} | |
System.out.println("Phishing test"); | |
{ | |
IndependentComponents ica = new IndependentComponents(); | |
ica.setInputFormat(phishing); | |
ica.setOutputNumAtts(15); | |
Instances ret = Filter.useFilter(phishing, ica); | |
writeToFile(ret, getHomeDirectory() + "/Documents/Georgia Tech/Spring 2019/cs4641/Assignment 3/Submission/phishing-websites-ica.arff"); | |
runClusterCountTest(ret); | |
} | |
} | |
public static void runRPTest() throws Exception { | |
System.out.println("Cancer test"); | |
{ | |
RandomProjection rp = new RandomProjection(); | |
rp.setInputFormat(cancer); | |
rp.setNumberOfAttributes(4); | |
Instances ret = Filter.useFilter(cancer, rp); | |
runClusterCountTest(ret); | |
} | |
System.out.println("Phishing test"); | |
{ | |
RandomProjection rp = new RandomProjection(); | |
rp.setInputFormat(phishing); | |
rp.setNumberOfAttributes(10); | |
Instances ret = Filter.useFilter(phishing, rp); | |
runClusterCountTest(ret); | |
} | |
} | |
public static void runIGTest() throws Exception { | |
System.out.println("Cancer test"); | |
{ | |
Instances ret = processIG(cancer); | |
ret.setClassIndex(-1); | |
runClusterCountTest(ret); | |
} | |
System.out.println("Phishing test"); | |
{ | |
Instances ret = processIG(phishing); | |
ret.setClassIndex(-1); | |
runClusterCountTest(ret); | |
} | |
} | |
public static void runNNTest() throws Exception { | |
cancer.setClassIndex(cancer.numAttributes()-1); | |
phishing.setClassIndex(phishing.numAttributes()-1); | |
/*{ | |
PrincipalComponents pca = new PrincipalComponents(); | |
pca.setMaximumAttributes(4); | |
pca.setInputFormat(cancer); | |
Instances ret = Filter.useFilter(cancer, pca); | |
System.out.println("-- CANCER PCA TESTING"); | |
MultilayerPerceptron nn = new MultilayerPerceptron(); | |
nn.setOptions(new String[] {"-L", "0.3", "-M", "0.2", "-N", "300", "-V", "0", "-S", "0", "-E", "20", "-H", "a,1"}); | |
buildLearningCurve(ret, nn, 0.7); | |
} | |
{ | |
PrincipalComponents pca = new PrincipalComponents(); | |
pca.setMaximumAttributes(15); | |
pca.setInputFormat(phishing); | |
Instances ret = Filter.useFilter(phishing, pca); | |
System.out.println("-- PHISHING PCA TESTING"); | |
MultilayerPerceptron nn = new MultilayerPerceptron(); | |
nn.setOptions(new String[] {"-L", "0.3", "-M", "0.2", "-N", "200", "-V", "0", "-S", "0", "-E", "20", "-H", "a,5"}); | |
buildLearningCurve(ret, nn, 0.7); | |
}*/ | |
/*{ | |
IndependentComponents pca = new IndependentComponents(); | |
pca.setOutputNumAtts(4); | |
pca.setInputFormat(cancer); | |
Instances ret = Filter.useFilter(cancer, pca); | |
System.out.println("-- CANCER ICA TESTING"); | |
MultilayerPerceptron nn = new MultilayerPerceptron(); | |
nn.setOptions(new String[] {"-L", "0.3", "-M", "0.2", "-N", "300", "-V", "0", "-S", "0", "-E", "20", "-H", "a,1"}); | |
buildLearningCurve(ret, nn, 0.7); | |
} | |
{ | |
IndependentComponents pca = new IndependentComponents(); | |
pca.setOutputNumAtts(15); | |
pca.setInputFormat(phishing); | |
Instances ret = Filter.useFilter(phishing, pca); | |
System.out.println("-- PHISHING ICA TESTING"); | |
MultilayerPerceptron nn = new MultilayerPerceptron(); | |
nn.setOptions(new String[] {"-L", "0.3", "-M", "0.2", "-N", "200", "-V", "0", "-S", "0", "-E", "20", "-H", "a,5"}); | |
buildLearningCurve(ret, nn, 0.7); | |
}*/ | |
{ | |
RandomProjection pca = new RandomProjection(); | |
pca.setNumberOfAttributes(4); | |
pca.setInputFormat(cancer); | |
Instances ret = Filter.useFilter(cancer, pca); | |
System.out.println("-- CANCER RP TESTING"); | |
MultilayerPerceptron nn = new MultilayerPerceptron(); | |
nn.setOptions(new String[] {"-L", "0.3", "-M", "0.2", "-N", "300", "-V", "0", "-S", "0", "-E", "20", "-H", "a,1"}); | |
buildLearningCurve(ret, nn, 0.7, false); | |
} | |
{ | |
RandomProjection pca = new RandomProjection(); | |
pca.setNumberOfAttributes(15); | |
pca.setInputFormat(phishing); | |
Instances ret = Filter.useFilter(phishing, pca); | |
System.out.println("-- PHISHING RP TESTING"); | |
MultilayerPerceptron nn = new MultilayerPerceptron(); | |
nn.setOptions(new String[] {"-L", "0.3", "-M", "0.2", "-N", "200", "-V", "0", "-S", "0", "-E", "20", "-H", "a,5"}); | |
buildLearningCurve(ret, nn, 0.7, false); | |
} | |
/*{ | |
Instances ret = processIG(cancer); | |
System.out.println("-- CANCER IG TESTING"); | |
MultilayerPerceptron nn = new MultilayerPerceptron(); | |
nn.setOptions(new String[] {"-L", "0.3", "-M", "0.2", "-N", "300", "-V", "0", "-S", "0", "-E", "20", "-H", "a,1"}); | |
buildLearningCurve(ret, nn, 0.7); | |
} | |
{ | |
Instances ret = processIG(phishing); | |
System.out.println("-- PHISHING IG TESTING"); | |
MultilayerPerceptron nn = new MultilayerPerceptron(); | |
nn.setOptions(new String[] {"-L", "0.3", "-M", "0.2", "-N", "200", "-V", "0", "-S", "0", "-E", "20", "-H", "a,5"}); | |
buildLearningCurve(ret, nn, 0.7); | |
}*/ | |
/*{ | |
System.out.println("-- CANCER NONE TESTING"); | |
MultilayerPerceptron nn = new MultilayerPerceptron(); | |
nn.setOptions(new String[] {"-L", "0.3", "-M", "0.2", "-N", "300", "-V", "0", "-S", "0", "-E", "20", "-H", "a,1"}); | |
buildLearningCurve(cancer, nn, 0.7); | |
} | |
{ | |
System.out.println("-- PHISHING NONE TESTING"); | |
MultilayerPerceptron nn = new MultilayerPerceptron(); | |
nn.setOptions(new String[] {"-L", "0.3", "-M", "0.2", "-N", "200", "-V", "0", "-S", "0", "-E", "20", "-H", "a,5"}); | |
buildLearningCurve(phishing, nn, 0.7); | |
}*/ | |
} | |
public static void runNNClusteringTest(Instances data, int numClusters, int attrbs, String[] nnOptions) throws Exception { | |
cancer.setClassIndex(cancer.numAttributes()-1); | |
phishing.setClassIndex(phishing.numAttributes()-1); | |
Instances KM_PCA = applyClustering(processPCA(data, attrbs), false, numClusters); | |
Instances KM_ICA = applyClustering(processICA(data, attrbs), false, numClusters); | |
Instances KM_RP = applyClustering(processRP(data, attrbs), false, numClusters); | |
Instances KM_IG = applyClustering(processIG(data), false, numClusters); | |
Instances EM_PCA = applyClustering(processPCA(data, attrbs), true, numClusters); | |
Instances EM_ICA = applyClustering(processICA(data, attrbs), true, numClusters); | |
Instances EM_RP = applyClustering(processRP(data, attrbs), true, numClusters); | |
Instances EM_IG = applyClustering(processIG(data), true, numClusters); | |
MultilayerPerceptron nn = new MultilayerPerceptron(); | |
nn.setOptions(nnOptions); | |
startRecording("K-Means PCA Accuracy:"); | |
buildLearningCurve(KM_PCA, nn, 0.7, true);stopRecording(); | |
startRecording("K-Means ICA Accuracy:"); | |
buildLearningCurve(KM_ICA, nn, 0.7, true);stopRecording(); | |
startRecording("K-Means RP Accuracy:"); | |
buildLearningCurve(KM_RP, nn, 0.7, true);stopRecording(); | |
startRecording("K-Means IG Accuracy:"); | |
buildLearningCurve(KM_IG, nn, 0.7, true);stopRecording(); | |
startRecording("EM PCA Accuracy:"); | |
buildLearningCurve(EM_PCA, nn, 0.7, true);stopRecording(); | |
startRecording("EM ICA Accuracy:"); | |
buildLearningCurve(EM_ICA, nn, 0.7, true);stopRecording(); | |
startRecording("EM RP Accuracy:"); | |
buildLearningCurve(EM_RP, nn, 0.7, true);stopRecording(); | |
startRecording("EM IG Accuracy:"); | |
buildLearningCurve(EM_IG, nn, 0.7, true);stopRecording(); | |
startRecording("Baseline Accuracy:"); | |
buildLearningCurve(data, nn, 0.7, true);stopRecording(); | |
} | |
public static Instances processPCA(Instances data, int attrbs) throws Exception { | |
PrincipalComponents pca = new PrincipalComponents(); | |
pca.setMaximumAttributes(attrbs); | |
pca.setInputFormat(data); | |
return Filter.useFilter(data, pca); | |
} | |
public static Instances processICA(Instances data, int attrbs) throws Exception { | |
PrincipalComponents pca = new PrincipalComponents(); | |
pca.setMaximumAttributes(attrbs); | |
pca.setInputFormat(data); | |
return Filter.useFilter(data, pca); | |
} | |
public static Instances processRP(Instances data, int attrbs) throws Exception { | |
RandomProjection pca = new RandomProjection(); | |
pca.setNumberOfAttributes(attrbs); | |
pca.setInputFormat(data); | |
return Filter.useFilter(data, pca); | |
} | |
public static Instances applyClustering(Instances data, boolean em, int numClusters) throws Exception { | |
AddCluster addCluster = new AddCluster(); | |
addCluster.setInputFormat(data); | |
if(em) { | |
addCluster.setOptions(new String[] {"-W", "weka.clusterers.EM -N " + numClusters, "-I", ""}); | |
} else { | |
addCluster.setOptions(new String[] {"-W", "weka.clusterers.SimpleKMeans -N " + numClusters, "-I", ""}); | |
} | |
return Filter.useFilter(data, addCluster); | |
} | |
public static void writeToFile(Instances data, String path) throws Exception { | |
ArffSaver saver = new ArffSaver(); | |
saver.setInstances(data); | |
saver.setFile(new File(path)); | |
saver.writeBatch(); | |
} | |
private static long timestamp = 0; | |
public static void startRecording(String s) { | |
timestamp = System.currentTimeMillis(); | |
System.out.println("Recording time for: " + s); | |
} | |
public static void stopRecording() { | |
long diff = System.currentTimeMillis()-timestamp; | |
System.out.println("Time elapsed: " + diff); | |
} | |
public static Instances getData(String path) { | |
try { | |
DataSource source = new DataSource(getHomeDirectory() + path); | |
Instances data = source.getDataSet(); | |
return data; | |
} catch(Exception e) { | |
e.printStackTrace(); | |
return null; | |
} | |
} | |
public static String getHomeDirectory() { | |
return System.getProperty("user.home"); | |
} | |
public static double getLogLikelihood(Instances data, EM model) throws Exception { | |
double loglk = 0.0, sOW = 0.0; | |
for(int l = 0; l < data.numInstances(); l++) { | |
Instance in = data.instance(l); | |
loglk += in.weight() * model.logDensityForInstance(in); | |
sOW += in.weight(); | |
} | |
if(sOW <= 0) return 0; | |
return loglk / sOW; | |
} | |
public static Instances processIG(Instances data) throws Exception { | |
InfoGainAttributeEval ig = new InfoGainAttributeEval(); | |
Ranker search = new Ranker(); | |
search.setOptions(new String[] { "-T", "0.001" }); | |
AttributeSelection attSelect = new AttributeSelection(); | |
attSelect.setEvaluator(ig); | |
attSelect.setSearch(search); | |
attSelect.SelectAttributes(data); | |
for(int i = 0; i < data.numAttributes()-1; i++) { | |
double[] rankData = attSelect.rankedAttributes()[i]; | |
//System.out.println(data.attribute((int)rankData[0]).name() + " " + Arrays.toString(rankData)); | |
} | |
data = attSelect.reduceDimensionality(data); | |
return data; | |
} | |
public static double[] getEigenvalues(PrincipalComponents pca) throws Exception { | |
Field ret = pca.getClass().getDeclaredField("m_Eigenvalues"); | |
ret.setAccessible(true); | |
return (double[])ret.get(pca); | |
} | |
public static int countKaiserEigenvalues(double[] array) { | |
int count = 0; | |
for(double val : array) { | |
if(val >= 1) count++; | |
} | |
return count; | |
} | |
public static void buildLearningCurve(Instances data, Classifier classifier, double split, boolean skip) throws Exception { | |
for(int i = 0; i < 10; i++) { | |
if(skip) i = 9; | |
Resample filter = new Resample(); | |
double datasetMult = (1-split) + (i+1)*(split/10D); | |
String[] options = new String[] {"-S", "1", "-Z", Double.toString(datasetMult*100)}; | |
filter.setOptions(options); | |
filter.setInputFormat(data); | |
Instances newData = Filter.useFilter(data, filter); | |
double testSplit = ((i+1)*(split/10D))/datasetMult; | |
int trainSize = (int)Math.round(newData.numInstances() * testSplit); | |
Instances trainData = new Instances(newData, 0, trainSize); | |
Instances testData = new Instances(newData, trainSize, newData.numInstances() - trainSize); | |
classifier.buildClassifier(trainData); | |
Evaluation eval = new Evaluation(trainData); | |
eval.evaluateModel(classifier, testData); | |
System.out.println(format.format(eval.pctCorrect())); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment