Created
January 8, 2014 18:45
-
-
Save charlieda/8322085 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.*;
import java.lang.Math;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Pattern;
public class filter { | |
public static void main(String[] args) { | |
/* usage information */ | |
if(args.length != 1) { | |
System.out.println("Usage: java filter testfile"); | |
System.out.printf(" (You provided %d argument(s))\n\n", args.length); | |
return; | |
} | |
// load classifier | |
NaiveBayesClassifier c; | |
try { | |
FileInputStream fileIn = new FileInputStream("classifier.ser"); | |
ObjectInputStream in = new ObjectInputStream(fileIn); | |
c = (NaiveBayesClassifier) in.readObject(); | |
in.close(); | |
fileIn.close(); | |
} catch(IOException i) { | |
i.printStackTrace(); | |
return; | |
} catch(ClassNotFoundException cnf) { | |
System.out.println("NaiveBayesClassifier class not found"); | |
cnf.printStackTrace(); | |
return; | |
} | |
// classify given file | |
//Change classifier to expect just an array of words and calculate LRs on the fly | |
//Classifier should classify based on | |
if(c.getLikelihoodRatio( getWords( readFile(args[0]) ) ) > 0 ) { | |
System.out.print("ham\n"); | |
} else { | |
System.out.print("spam\n"); | |
} | |
} | |
private static String readFile(String filename) { | |
File messageFile = new File( filename ); | |
StringBuilder message = new StringBuilder((int)messageFile.length()); | |
Scanner scanner; | |
try { | |
scanner = new Scanner(messageFile); | |
} catch (FileNotFoundException e) { | |
System.err.printf("Error: Couldn't read message file '%s'\nExiting...\n", filename); | |
return ""; | |
} | |
String lineSeparator = System.getProperty("line.separator"); | |
try { | |
while(scanner.hasNextLine()) { | |
message.append(scanner.nextLine() + lineSeparator); | |
} | |
} finally { | |
scanner.close(); | |
} | |
return message.toString(); | |
} | |
/** | |
* @return a list of words in the text | |
*/ | |
public static ArrayList<String> getWords(String text) { | |
ArrayList<String> toReturn = new ArrayList<String>(); | |
for(String w : text.split("<[\\/]?[A-z0-9]*>|[\\s]")) { | |
w = w.trim().replaceAll("[,.;:!]$", ""); | |
toReturn.add(w); | |
} | |
return toReturn; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment