Skip to content

Instantly share code, notes, and snippets.

@alfredfrancis
Created August 30, 2017 19:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alfredfrancis/54ee66341f4f0ef1f9a88de18b3b76ca to your computer and use it in GitHub Desktop.
Save alfredfrancis/54ee66341f4f0ef1f9a88de18b3b76ca to your computer and use it in GitHub Desktop.
Java code for spam email/SMS classification using Weka machine learning
import weka.classifiers.Evaluation;
import weka.classifiers.bayes.NaiveBayes;
import weka.classifiers.bayes.NaiveBayesMultinomial;
import weka.classifiers.meta.FilteredClassifier;
import weka.classifiers.Evaluation;
import weka.core.Instances;
import weka.core.Instance;
import weka.core.converters.ConverterUtils.DataSource;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.converters.ArffSaver;
import weka.classifiers.meta.FilteredClassifier;
import weka.filters.unsupervised.attribute.StringToWordVector;
import java.io.File;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
/**
 * Spam/ham text classifier built on Weka's {@link FilteredClassifier}.
 *
 * <p>Typical usage, mirroring {@link #main(String[])}:
 * {@link #prepare()} loads the data sets, {@link #transform()} configures the
 * filter and base classifier, {@link #fit()} trains, {@link #evaluate()} reports
 * on the test set and {@link #classify(String)} labels a single message.
 *
 * <p>Every data set created by this class shares the same two attribute objects:
 * a nominal {@code label} attribute (values {@code spam}, {@code ham}) at index 0
 * and a string {@code text} attribute at index 1.
 */
public class WekaClassifier {

    /** Index of the nominal class attribute ("label") in the feature vector. */
    private static final int LABEL_INDEX = 0;

    /** Index of the string attribute ("text") in the feature vector. */
    private static final int TEXT_INDEX = 1;

    private final FilteredClassifier classifier;
    private final ArrayList<Attribute> fvWekaAttributes;
    private Instances trainData;
    private Instances testData;

    /**
     * Creates an untrained classifier and declares the two-attribute feature
     * vector (class label first, message text last).
     */
    WekaClassifier() {
        classifier = new FilteredClassifier();

        // Declare the text attribute (a null value list makes it a string attribute).
        Attribute textAttribute = new Attribute("text", (List<String>) null);

        // Declare the label attribute along with its permitted values.
        ArrayList<String> classAttributeValues = new ArrayList<String>();
        classAttributeValues.add("spam");
        classAttributeValues.add("ham");
        Attribute classAttribute = new Attribute("label", classAttributeValues);

        // Declare the feature vector; order must match LABEL_INDEX / TEXT_INDEX.
        fvWekaAttributes = new ArrayList<Attribute>();
        fvWekaAttributes.add(classAttribute);
        fvWekaAttributes.add(textAttribute);
    }

    /**
     * Reads a labelled text file into a Weka data set.
     *
     * <p>Each line is split at the first run of whitespace into a label and the
     * message text. Lines whose label or text is empty are skipped silently.
     * Lines with no text part, or with a label other than the declared class
     * values, are reported as {@code "invalid row"} on standard output and skipped.
     *
     * @param filename path of the file to read
     * @return a data set named "Rel" whose class index is the label attribute
     * @throws IOException if the file cannot be opened or read; previously this
     *         was swallowed and an empty data set was returned, which let
     *         training proceed on no data
     */
    public Instances load(String filename) throws IOException {
        // Empty data set named "Rel" with an initial capacity of 10.
        Instances dataset = new Instances("Rel", fvWekaAttributes, 10);
        dataset.setClassIndex(LABEL_INDEX);

        Attribute labelAttribute = fvWekaAttributes.get(LABEL_INDEX);
        Attribute textAttribute = fvWekaAttributes.get(TEXT_INDEX);

        try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
            for (String line; (line = br.readLine()) != null; ) {
                // Split at the first run of whitespace: [label, rest of line].
                String[] parts = line.split("\\s+", 2);

                // Blank lines and lines starting with whitespace yield an empty label.
                if (parts[0].isEmpty()) {
                    continue;
                }
                // A label with nothing after it; checked explicitly instead of
                // relying on ArrayIndexOutOfBoundsException for control flow.
                if (parts.length < 2) {
                    System.out.println("invalid row");
                    continue;
                }
                if (parts[1].isEmpty()) {
                    continue;
                }
                // setValue would throw IllegalArgumentException for a label that
                // is not one of the declared nominal values; skip such rows.
                if (labelAttribute.indexOfValue(parts[0]) == -1) {
                    System.out.println("invalid row");
                    continue;
                }

                DenseInstance row = new DenseInstance(2);
                row.setValue(labelAttribute, parts[0]);
                row.setValue(textAttribute, parts[1]);
                dataset.add(row);
            }
        }
        return dataset;
    }

    /**
     * Loads the training and test sets from the default locations
     * {@code data/train.txt} and {@code data/test.txt}.
     *
     * @throws Exception if either file cannot be read
     */
    public void prepare() throws Exception {
        prepare("data/train.txt", "data/test.txt");
    }

    /**
     * Loads the training and test sets from the given files.
     *
     * @param trainFile path of the labelled training file
     * @param testFile path of the labelled test file
     * @throws Exception if either file cannot be read
     */
    public void prepare(String trainFile, String testFile) throws Exception {
        trainData = load(trainFile);
        testData = load(testFile);
    }

    /**
     * Configures the filtered classifier: a {@link StringToWordVector} filter
     * applied to the last attribute (the text), feeding a {@link NaiveBayes}
     * base classifier.
     */
    public void transform() {
        StringToWordVector filter = new StringToWordVector();
        filter.setAttributeIndices("last");
        classifier.setFilter(filter);
        classifier.setClassifier(new NaiveBayes());
    }

    /**
     * Trains the classifier on the loaded training data.
     *
     * @throws IllegalStateException if {@link #prepare()} has not been called
     * @throws Exception if Weka fails to build the classifier
     */
    public void fit() throws Exception {
        if (trainData == null) {
            throw new IllegalStateException("No training data loaded; call prepare() first");
        }
        classifier.buildClassifier(trainData);
    }

    /**
     * Classifies a single message and prints the predicted label.
     *
     * @param text the message to classify
     * @return the predicted class value ({@code "spam"} or {@code "ham"})
     * @throws Exception if the classifier cannot classify the instance
     */
    public String classify(String text) throws Exception {
        // A header-only data set gives the instance its attribute information.
        Instances newDataset = new Instances("testdata", fvWekaAttributes, 1);
        newDataset.setClassIndex(LABEL_INDEX);

        DenseInstance newInstance = new DenseInstance(2);
        newInstance.setDataset(newDataset);
        newInstance.setValue(fvWekaAttributes.get(TEXT_INDEX), text);

        double pred = classifier.classifyInstance(newInstance);

        // The class attribute object is shared by every data set this class
        // creates, so it can be read here without depending on trainData.
        String label = newDataset.classAttribute().value((int) pred);
        System.out.println("===== Classified instance =====");
        System.out.println("Class predicted: " + label);
        return label;
    }

    /**
     * Evaluates the trained classifier on the test set and prints the summary.
     *
     * <p>The {@link Evaluation} is initialised from the training data, which is
     * what Weka expects for header and class-prior information; the model is
     * then scored against the test data.
     *
     * @return Weka's evaluation summary string
     * @throws IllegalStateException if {@link #prepare()} has not been called
     * @throws Exception if the evaluation fails
     */
    public String evaluate() throws Exception {
        if (trainData == null || testData == null) {
            throw new IllegalStateException("No data loaded; call prepare() first");
        }
        Evaluation eval = new Evaluation(trainData);
        eval.evaluateModel(classifier, testData);
        String summary = eval.toSummaryString();
        System.out.println(summary);
        return summary;
    }

    /**
     * Writes a data set to an ARFF file.
     *
     * @param dataset the instances to save
     * @param filename path of the ARFF file to write
     * @throws IOException if the file cannot be written; previously this was
     *         swallowed after printing the stack trace
     */
    public void saveArff(Instances dataset, String filename) throws IOException {
        ArffSaver arffSaverInstance = new ArffSaver();
        arffSaverInstance.setInstances(dataset);
        arffSaverInstance.setFile(new File(filename));
        arffSaverInstance.writeBatch();
    }

    /**
     * Demo entry point: loads the default data, trains, evaluates on the test
     * set and classifies one sample message.
     *
     * @param args unused
     * @throws Exception if loading, training, evaluation or classification fails
     */
    public static void main(String[] args) throws Exception {
        WekaClassifier wt = new WekaClassifier();
        wt.prepare();
        wt.transform();
        wt.fit();
        wt.evaluate();
        wt.classify("goldviking (29/M) is inviting you to be his friend. Reply YES-762 or NO-762 See him: www.SMS.ac/u/goldviking STOP? Send STOP FRND to 62468");
    }
}
@capiyowebmaster
Copy link

Hello good work

@JeBehen
Copy link

JeBehen commented Jul 17, 2023

Nice Work!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment