/*
 * Source: GitHub gist aidancbrady/dccfe27f22c287a72ca5e128d14c9c13 by @aidancbrady
 * (last active March 7, 2019 03:25).
 */
import java.io.File;
import java.lang.reflect.Field;
import java.text.DecimalFormat;
import java.util.Arrays;
import weka.attributeSelection.AttributeSelection;
import weka.attributeSelection.InfoGainAttributeEval;
import weka.attributeSelection.Ranker;
import weka.classifiers.Classifier;
import weka.classifiers.Evaluation;
import weka.classifiers.functions.MultilayerPerceptron;
import weka.clusterers.EM;
import weka.clusterers.SimpleKMeans;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ArffSaver;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.AddCluster;
import weka.filters.unsupervised.attribute.IndependentComponents;
import weka.filters.unsupervised.attribute.PrincipalComponents;
import weka.filters.unsupervised.attribute.RandomProjection;
import weka.filters.unsupervised.instance.Resample;
public class UnsupervisedLearning
{
// Two-decimal formatter used when printing accuracy percentages in buildLearningCurve.
private static final DecimalFormat format = new DecimalFormat("#.00");
// Data sets loaded once during class initialisation. The paths are appended to the
// user's home directory by getData, which returns null if loading fails.
private static Instances cancer = getData("/Documents/Georgia Tech/Spring 2019/cs4641/Datasets/breast-cancer.arff");
private static Instances phishing = getData("/Documents/Georgia Tech/Spring 2019/cs4641/Datasets/phishing-websites.arff");
// Shared clusterer instances; the experiments reconfigure and rebuild them in place.
private static SimpleKMeans kMeans = new SimpleKMeans();
private static EM em = new EM();
/**
 * Entry point. Experiments are toggled by commenting/uncommenting the calls below;
 * as written only {@code runPCATest()} runs.
 *
 * @param args unused
 * @throws Exception propagated from whichever experiment is enabled
 */
public static void main(String[] args) throws Exception {
//runClusterCountTest(cancer);
//runClusterCountTest(phishing);
//runTimeTest();
runPCATest();
//runICATest();
//runRPTest();
//runIGTest();
//runNNTest();
// The two calls below pass (data, cluster count, attribute count, MultilayerPerceptron options).
//runNNClusteringTest(cancer, 3, 4, new String[] {"-L", "0.3", "-M", "0.2", "-N", "300", "-V", "0", "-S", "0", "-E", "20", "-H", "a,1"});
//runNNClusteringTest(phishing, 4, 15, new String[] {"-L", "0.3", "-M", "0.2", "-N", "200", "-V", "0", "-S", "0", "-E", "20", "-H", "a,5"});
}
/**
 * Sweeps the cluster count from 1 to 10 for both shared clusterers and prints one
 * quality figure per setting: the squared error for k-means, then the weighted
 * average log-density (see {@code getLogLikelihood}) for EM.
 *
 * Side effect: the shared {@code kMeans} and {@code em} instances are rebuilt on
 * {@code data} and are left configured with 10 clusters.
 *
 * @param data instances to cluster
 * @throws Exception if a clusterer fails to build or evaluate
 */
public static void runClusterCountTest(Instances data) throws Exception {
    final int maxClusters = 10;

    System.out.println("== K MEANS 1-10 CLUSTER TEST");
    int k = 1;
    while(k <= maxClusters) {
        kMeans.setNumClusters(k);
        kMeans.buildClusterer(data);
        double squaredError = kMeans.getSquaredError();
        System.out.println(squaredError);
        k++;
    }

    System.out.println();

    System.out.println("== EM 1-10 CLUSTER TEST");
    int components = 1;
    while(components <= maxClusters) {
        em.setNumClusters(components);
        em.buildClusterer(data);
        double avgLogDensity = getLogLikelihood(data, em);
        System.out.println(avgLogDensity);
        components++;
    }

    // Disabled: let EM choose the number of clusters itself (-1) and report it.
    //em.setNumClusters(-1);
    //em.buildClusterer(cancer);
    //System.out.println("SELECTED: " + em.numberOfClusters());
}
/**
 * Times how long each shared clusterer takes to build on each data set, using
 * 3 clusters for the cancer data and 4 for the phishing data. Results are printed
 * through {@code startRecording}/{@code stopRecording}.
 *
 * The measured interval starts before {@code setNumClusters}, so it covers the
 * configuration call as well as the build.
 *
 * @throws Exception if a clusterer fails to build
 */
public static void runTimeTest() throws Exception {
    // The former up-front setNumClusters(4) calls were removed: each value was
    // overwritten below before any clusterer was built.
    startRecording("Cancer KM");
    kMeans.setNumClusters(3);
    kMeans.buildClusterer(cancer);
    stopRecording();

    startRecording("Phishing KM");
    kMeans.setNumClusters(4);
    kMeans.buildClusterer(phishing);
    stopRecording();

    startRecording("Cancer EM");
    em.setNumClusters(3);
    em.buildClusterer(cancer);
    stopRecording();

    startRecording("Phishing EM");
    em.setNumClusters(4);
    em.buildClusterer(phishing);
    stopRecording();
}
/**
 * Runs the cluster-count sweep on PCA-reduced versions of both data sets:
 * at most 4 attributes for the cancer data and at most 15 for the phishing data.
 *
 * @throws Exception if filtering or clustering fails
 */
public static void runPCATest() throws Exception {
    final int cancerComponents = 4;
    final int phishingComponents = 15;

    // Disabled sweep header kept for reference: for(int i = 1; i <= 9; i+= 2)
    System.out.println("Cancer test " + cancerComponents);
    Instances reducedCancer = processPCA(cancer, cancerComponents);
    runClusterCountTest(reducedCancer);

    // Disabled sweep header kept for reference: for(int i = 6; i <= 30; i+= 6)
    System.out.println("Phishing test " + phishingComponents);
    Instances reducedPhishing = processPCA(phishing, phishingComponents);
    runClusterCountTest(reducedPhishing);
}
/**
 * Reduces both data sets with the IndependentComponents filter (4 output attributes
 * for cancer, 15 for phishing), saves each result as an ARFF file under the user's
 * home directory, and runs the cluster-count sweep on it.
 *
 * @throws Exception if filtering, writing or clustering fails
 */
public static void runICATest() throws Exception {
    //cancer.setClassIndex(cancer.numAttributes()-1);
    //phishing.setClassIndex(phishing.numAttributes()-1);
    System.out.println("Cancer test");
    {
        IndependentComponents ica = new IndependentComponents();
        // Configure first, then call setInputFormat: Weka's filter guidance is that
        // setInputFormat is the last call before the filter is applied, so that the
        // option is guaranteed to be taken into account.
        ica.setOutputNumAtts(4);
        ica.setInputFormat(cancer);
        Instances ret = Filter.useFilter(cancer, ica);
        writeToFile(ret, getHomeDirectory() + "/Documents/Georgia Tech/Spring 2019/cs4641/Assignment 3/Submission/breast-cancer-ica.arff");
        runClusterCountTest(ret);
    }
    System.out.println("Phishing test");
    {
        IndependentComponents ica = new IndependentComponents();
        ica.setOutputNumAtts(15);
        ica.setInputFormat(phishing);
        Instances ret = Filter.useFilter(phishing, ica);
        writeToFile(ret, getHomeDirectory() + "/Documents/Georgia Tech/Spring 2019/cs4641/Assignment 3/Submission/phishing-websites-ica.arff");
        runClusterCountTest(ret);
    }
}
/**
 * Runs the cluster-count sweep on randomly projected versions of both data sets:
 * 4 attributes for the cancer data and 10 for the phishing data.
 *
 * The projection is delegated to {@code processRP}, which sets the number of
 * attributes before calling {@code setInputFormat}. The previous inline code set
 * it afterwards, so the filter could be initialised with its default dimension
 * instead of the requested one.
 *
 * @throws Exception if filtering or clustering fails
 */
public static void runRPTest() throws Exception {
    System.out.println("Cancer test");
    runClusterCountTest(processRP(cancer, 4));

    System.out.println("Phishing test");
    runClusterCountTest(processRP(phishing, 10));
}
/**
 * Runs the cluster-count sweep on both data sets after information-gain attribute
 * selection ({@code processIG}). The class index of each reduced set is cleared
 * (set to -1) before clustering.
 *
 * @throws Exception if attribute selection or clustering fails
 */
public static void runIGTest() throws Exception {
    String[] headings = {"Cancer test", "Phishing test"};
    Instances[] sources = {cancer, phishing};

    for(int idx = 0; idx < sources.length; idx++) {
        System.out.println(headings[idx]);
        Instances reduced = processIG(sources[idx]);
        reduced.setClassIndex(-1);
        runClusterCountTest(reduced);
    }
}
/**
 * Trains a MultilayerPerceptron on randomly projected (RP) versions of both data
 * sets and prints a ten-point learning curve for each (split 0.7, no skipping).
 * The last attribute of each data set is marked as the class first.
 *
 * Only the RP runs are enabled. The file previously carried commented-out variants
 * of the same experiment with identical network options:
 *   - PCA: setMaximumAttributes(4) on cancer, 15 on phishing
 *   - ICA: setOutputNumAtts(4) on cancer, 15 on phishing
 *   - IG:  processIG(cancer) / processIG(phishing)
 *   - NONE: the unfiltered cancer / phishing data
 * Those variants called buildLearningCurve(data, nn, 0.7) with three arguments;
 * no such overload is defined in the visible source, so they need the fourth
 * (skip) argument before they can be re-enabled.
 *
 * @throws Exception if filtering, training or evaluation fails
 */
public static void runNNTest() throws Exception {
    cancer.setClassIndex(cancer.numAttributes()-1);
    phishing.setClassIndex(phishing.numAttributes()-1);

    // Cancer: 4 projected attributes, 300 training epochs, hidden layers "a,1".
    Instances cancerProjected = processRP(cancer, 4);
    System.out.println("-- CANCER RP TESTING");
    MultilayerPerceptron cancerNet = new MultilayerPerceptron();
    cancerNet.setOptions(new String[] {"-L", "0.3", "-M", "0.2", "-N", "300", "-V", "0", "-S", "0", "-E", "20", "-H", "a,1"});
    buildLearningCurve(cancerProjected, cancerNet, 0.7, false);

    // Phishing: 15 projected attributes, 200 training epochs, hidden layers "a,5".
    Instances phishingProjected = processRP(phishing, 15);
    System.out.println("-- PHISHING RP TESTING");
    MultilayerPerceptron phishingNet = new MultilayerPerceptron();
    phishingNet.setOptions(new String[] {"-L", "0.3", "-M", "0.2", "-N", "200", "-V", "0", "-S", "0", "-E", "20", "-H", "a,5"});
    buildLearningCurve(phishingProjected, phishingNet, 0.7, false);
}
/**
 * Compares neural-network accuracy on cluster-augmented data. For each of the four
 * reductions (PCA, ICA, RP, IG) the data is reduced, a cluster-assignment attribute
 * is added with k-means and with EM, and a single learning-curve point is printed
 * and timed for each of the eight variants, followed by the unmodified data as a
 * baseline.
 *
 * All variants are prepared before any training starts, and one MultilayerPerceptron
 * instance is reused for every run.
 *
 * @param data        data set to evaluate
 * @param numClusters cluster count passed to the clusterer
 * @param attrbs      attribute count passed to the PCA/ICA/RP reductions
 * @param nnOptions   option array handed to {@code MultilayerPerceptron.setOptions}
 * @throws Exception if any filter, clusterer or classifier fails
 */
public static void runNNClusteringTest(Instances data, int numClusters, int attrbs, String[] nnOptions) throws Exception {
    cancer.setClassIndex(cancer.numAttributes()-1);
    phishing.setClassIndex(phishing.numAttributes()-1);

    String[] labels = {
        "K-Means PCA Accuracy:",
        "K-Means ICA Accuracy:",
        "K-Means RP Accuracy:",
        "K-Means IG Accuracy:",
        "EM PCA Accuracy:",
        "EM ICA Accuracy:",
        "EM RP Accuracy:",
        "EM IG Accuracy:",
        "Baseline Accuracy:"
    };
    // Array initialisers evaluate left to right, so the data sets are built in the
    // same order as the labels above.
    Instances[] variants = {
        applyClustering(processPCA(data, attrbs), false, numClusters),
        applyClustering(processICA(data, attrbs), false, numClusters),
        applyClustering(processRP(data, attrbs), false, numClusters),
        applyClustering(processIG(data), false, numClusters),
        applyClustering(processPCA(data, attrbs), true, numClusters),
        applyClustering(processICA(data, attrbs), true, numClusters),
        applyClustering(processRP(data, attrbs), true, numClusters),
        applyClustering(processIG(data), true, numClusters),
        data
    };

    MultilayerPerceptron nn = new MultilayerPerceptron();
    nn.setOptions(nnOptions);

    for(int idx = 0; idx < variants.length; idx++) {
        startRecording(labels[idx]);
        buildLearningCurve(variants[idx], nn, 0.7, true);
        stopRecording();
    }
}
/**
 * Applies a PrincipalComponents filter to the data.
 *
 * @param data   instances to transform
 * @param attrbs value passed to {@code setMaximumAttributes}
 * @return the filtered copy of {@code data}
 * @throws Exception if the filter cannot be configured or applied
 */
public static Instances processPCA(Instances data, int attrbs) throws Exception {
    PrincipalComponents filter = new PrincipalComponents();
    filter.setMaximumAttributes(attrbs);
    // Input format is set last, after the filter is fully configured.
    filter.setInputFormat(data);
    Instances transformed = Filter.useFilter(data, filter);
    return transformed;
}
/**
 * Applies an IndependentComponents filter to the data.
 *
 * This method previously built a PrincipalComponents filter (a copy of
 * {@code processPCA}), so every "ICA" result was really a PCA result. It now uses
 * the same filter and setter as {@code runICATest}.
 *
 * @param data   instances to transform
 * @param attrbs value passed to {@code setOutputNumAtts}
 * @return the filtered copy of {@code data}
 * @throws Exception if the filter cannot be configured or applied
 */
public static Instances processICA(Instances data, int attrbs) throws Exception {
    IndependentComponents ica = new IndependentComponents();
    ica.setOutputNumAtts(attrbs);
    // Input format is set last, after the filter is fully configured.
    ica.setInputFormat(data);
    return Filter.useFilter(data, ica);
}
/**
 * Applies a RandomProjection filter to the data.
 *
 * @param data   instances to transform
 * @param attrbs value passed to {@code setNumberOfAttributes}
 * @return the filtered copy of {@code data}
 * @throws Exception if the filter cannot be configured or applied
 */
public static Instances processRP(Instances data, int attrbs) throws Exception {
    RandomProjection projection = new RandomProjection();
    projection.setNumberOfAttributes(attrbs);
    // Input format is set last, after the filter is fully configured.
    projection.setInputFormat(data);
    Instances projected = Filter.useFilter(data, projection);
    return projected;
}
/**
 * Runs the data through an AddCluster filter configured with either EM or
 * SimpleKMeans and returns the filtered instances.
 *
 * The filter options are now set before {@code setInputFormat}, following Weka's
 * guidance that {@code setInputFormat} is the last call before a filter is applied.
 * The two near-identical option arrays were merged into one.
 *
 * @param data        instances to cluster
 * @param em          {@code true} to use weka.clusterers.EM, {@code false} for
 *                    weka.clusterers.SimpleKMeans (this parameter shadows the static
 *                    {@code em} field inside the method)
 * @param numClusters value passed to the clusterer's {@code -N} option
 * @return the output of the AddCluster filter
 * @throws Exception if the filter cannot be configured or applied
 */
public static Instances applyClustering(Instances data, boolean em, int numClusters) throws Exception {
    String clustererClass = em ? "weka.clusterers.EM" : "weka.clusterers.SimpleKMeans";

    AddCluster addCluster = new AddCluster();
    // "-W" carries the clusterer specification; "-I" is passed an empty value.
    addCluster.setOptions(new String[] {"-W", clustererClass + " -N " + numClusters, "-I", ""});
    addCluster.setInputFormat(data);
    return Filter.useFilter(data, addCluster);
}
/**
 * Writes the instances to the given path using Weka's ArffSaver in batch mode.
 *
 * @param data instances to save
 * @param path destination file path
 * @throws Exception if the saver cannot open or write the file
 */
public static void writeToFile(Instances data, String path) throws Exception {
    File target = new File(path);
    ArffSaver arffWriter = new ArffSaver();
    arffWriter.setInstances(data);
    arffWriter.setFile(target);
    arffWriter.writeBatch();
}
// Start of the current measurement, as returned by System.nanoTime().
private static long timestamp = 0;

/**
 * Starts a timing measurement and prints its label.
 *
 * Uses {@code System.nanoTime()}, the JDK's clock for measuring elapsed time;
 * {@code System.currentTimeMillis()} follows the wall clock and can jump when the
 * system time is adjusted.
 *
 * @param s label printed for the measurement
 */
public static void startRecording(String s) {
    timestamp = System.nanoTime();
    System.out.println("Recording time for: " + s);
}

/**
 * Prints the time elapsed since the last {@code startRecording} call, in whole
 * milliseconds (same unit as before).
 */
public static void stopRecording() {
    long diff = (System.nanoTime() - timestamp) / 1000000L;
    System.out.println("Time elapsed: " + diff);
}
/**
 * Loads a data set from a path relative to the user's home directory.
 *
 * @param path path appended to {@code getHomeDirectory()}
 * @return the loaded instances, or {@code null} if loading threw (the stack trace
 *         is printed in that case)
 */
public static Instances getData(String path) {
    Instances loaded = null;
    try {
        loaded = new DataSource(getHomeDirectory() + path).getDataSet();
    } catch(Exception e) {
        // Best effort: report the failure and fall through to return null.
        e.printStackTrace();
    }
    return loaded;
}
/**
 * Returns the value of the {@code user.home} system property.
 *
 * @return the current user's home directory as reported by the JVM
 */
public static String getHomeDirectory() {
    final String home = System.getProperty("user.home");
    return home;
}
/**
 * Computes the weight-averaged log-density of the data under the given EM model:
 * the sum of {@code weight * logDensityForInstance} over all instances, divided by
 * the sum of the weights.
 *
 * @param data  instances to score
 * @param model built EM clusterer
 * @return the weighted average, or 0 when the total weight is not positive
 * @throws Exception if the model cannot score an instance
 */
public static double getLogLikelihood(Instances data, EM model) throws Exception {
    double weightedLogDensity = 0.0;
    double totalWeight = 0.0;

    for(int idx = 0; idx < data.numInstances(); idx++) {
        Instance current = data.instance(idx);
        double w = current.weight();
        weightedLogDensity += w * model.logDensityForInstance(current);
        totalWeight += w;
    }

    return (totalWeight <= 0) ? 0 : weightedLogDensity / totalWeight;
}
/**
 * Reduces the data by information-gain attribute selection: an
 * InfoGainAttributeEval evaluator combined with a Ranker search configured with
 * {@code -T 0.001}.
 *
 * A leftover debugging loop was removed. It fetched every ranked attribute into an
 * unused local, and its only consumer was a commented-out print. The parameter is
 * no longer reassigned.
 *
 * @param data instances to reduce
 * @return the result of {@code AttributeSelection.reduceDimensionality(data)}
 * @throws Exception if attribute selection fails
 */
public static Instances processIG(Instances data) throws Exception {
    Ranker search = new Ranker();
    search.setOptions(new String[] { "-T", "0.001" });

    AttributeSelection attSelect = new AttributeSelection();
    attSelect.setEvaluator(new InfoGainAttributeEval());
    attSelect.setSearch(search);
    attSelect.SelectAttributes(data);

    // To inspect the ranking while debugging, iterate attSelect.rankedAttributes()
    // and print data.attribute((int)row[0]).name() with Arrays.toString(row).
    return attSelect.reduceDimensionality(data);
}
/**
 * Reads the private {@code m_Eigenvalues} field of a PrincipalComponents filter via
 * reflection.
 *
 * The field is looked up on {@code PrincipalComponents.class} rather than on the
 * runtime class of the argument. {@code getDeclaredField} only sees fields declared
 * by the exact class it is called on, so the previous lookup failed with
 * NoSuchFieldException for any subclass instance.
 *
 * @param pca filter to inspect
 * @return the value of the field, which may be {@code null} (presumably until the
 *         filter has been built; verify against the Weka version in use)
 * @throws Exception if the field does not exist or cannot be accessed
 */
public static double[] getEigenvalues(PrincipalComponents pca) throws Exception {
    Field eigenvalues = PrincipalComponents.class.getDeclaredField("m_Eigenvalues");
    eigenvalues.setAccessible(true);
    return (double[])eigenvalues.get(pca);
}
/**
 * Counts how many entries of the array are greater than or equal to 1.
 * NaN entries are never counted because the comparison is false for them.
 *
 * @param array values to inspect
 * @return number of entries {@code >= 1}
 */
public static int countKaiserEigenvalues(double[] array) {
    int atLeastOne = 0;
    for(int idx = 0; idx < array.length; idx++) {
        atLeastOne += (array[idx] >= 1) ? 1 : 0;
    }
    return atLeastOne;
}
/**
 * Prints learning-curve points (percentage correct, two decimals) for a classifier.
 *
 * For step s in 1..10 the data is resampled (Resample, seed 1) to a size of
 * {@code (1 - split) + s * split / 10} times the original. The first
 * {@code s * split / 10} share of the original size is used for training and the
 * remainder of the sample for testing, so the training portion grows with each step.
 *
 * @param data       instances to sample from
 * @param classifier classifier rebuilt at every step
 * @param split      fraction of the data used for training at the final step
 * @param skip       when {@code true}, only the final step (s = 10) is run
 * @throws Exception if sampling, training or evaluation fails
 */
public static void buildLearningCurve(Instances data, Classifier classifier, double split, boolean skip) throws Exception {
    final int firstStep = skip ? 9 : 0;

    for(int step = firstStep; step < 10; step++) {
        // Training share and total sample size, both as fractions of the full data set.
        double trainPortion = (step+1)*(split/10D);
        double datasetMult = (1-split) + trainPortion;

        Resample sampler = new Resample();
        sampler.setOptions(new String[] {"-S", "1", "-Z", Double.toString(datasetMult*100)});
        sampler.setInputFormat(data);
        Instances sampled = Filter.useFilter(data, sampler);

        // Leading part of the sample trains the model, the rest evaluates it.
        double trainFraction = trainPortion/datasetMult;
        int total = sampled.numInstances();
        int trainSize = (int)Math.round(total * trainFraction);
        Instances trainData = new Instances(sampled, 0, trainSize);
        Instances testData = new Instances(sampled, trainSize, total - trainSize);

        classifier.buildClassifier(trainData);
        Evaluation eval = new Evaluation(trainData);
        eval.evaluateModel(classifier, testData);
        System.out.println(format.format(eval.pctCorrect()));
    }
}
}
// Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment