Skip to content

Instantly share code, notes, and snippets.

@karlicoss
Created March 25, 2012 14:58
Show Gist options
  • Save karlicoss/2196847 to your computer and use it in GitHub Desktop.
Save karlicoss/2196847 to your computer and use it in GitHub Desktop.
Spamlord
// CS124 HW1 SpamLord
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.*;
/**
 * Extracts (possibly obfuscated) e-mail addresses and phone numbers from text
 * files and scores the result against a tab-separated gold file.
 *
 * <p>E-mail addresses are reported as {@code name@domain}, phone numbers as
 * {@code ddd-ddd-dddd}; both are matched line by line with regular expressions.
 */
public class SpamLord {

    /**
     * One contact item found in a file. It has three data members:
     * <ul>
     *   <li>{@code fileName} - the name of the file in which the item was found</li>
     *   <li>{@code type} - the kind of item: either "e" (e-mail) or "p" (phone)</li>
     *   <li>{@code value} - the string representation of the e-mail or phone
     *       number, stored in lower case</li>
     * </ul>
     * The remaining methods exist so that instances behave correctly as
     * elements of a {@link java.util.Set} and can be sorted.
     *
     * <p>Instances created with the no-argument constructor have {@code null}
     * fields, so {@code equals}, {@code hashCode} and {@code compareTo} throw
     * {@link NullPointerException} when called on them.
     */
    class Contact implements Comparable<Contact> {
        private String fileName;
        private String type;
        private String value;

        public Contact() {}

        /**
         * @param fileName name of the file the item was found in
         * @param type     "e" or "p"
         * @param value    the e-mail address or phone number; lower-cased on construction
         */
        public Contact(String fileName, String type, String value) {
            this.fileName = fileName;
            this.type = type;
            // Locale.ROOT keeps the result independent of the JVM's default
            // locale (e.g. the Turkish dotless i).
            this.value = value.toLowerCase(Locale.ROOT);
        }

        public String getFileName() {
            return fileName;
        }

        public String getType() {
            return type;
        }

        public String getValue() {
            return value;
        }

        /** Two contacts are equal when file name, type and value are all equal. */
        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            // Also rejects null, so foreign objects never cause an exception.
            if (!(o instanceof Contact)) {
                return false;
            }
            Contact c = (Contact) o;
            return fileName.equals(c.fileName) && type.equals(c.type) && value.equals(c.value);
        }

        @Override
        public int hashCode() {
            return 31 * fileName.hashCode() + 17 * type.hashCode() + value.hashCode();
        }

        /** Orders by file name, then type, then value. */
        @Override
        public int compareTo(Contact c) {
            int fileNameCmp = fileName.compareTo(c.fileName);
            if (fileNameCmp != 0) {
                return fileNameCmp;
            }
            int typeCmp = type.compareTo(c.type);
            if (typeCmp != 0) {
                return typeCmp;
            }
            return value.compareTo(c.value);
        }

        /** @return the three fields separated by tabs (the gold-file layout) */
        @Override
        public String toString() {
            return fileName + "\t" + type + "\t" + value;
        }
    }

    /** Regex fragments accepted in place of the "." between domain labels. */
    private static final String[] DOT_CANDIDATES = {
        "\\.",
        ";",
        " dot ",
        "\\(dot\\)",
        " d-o-t ",
        " d o t ",
        " dom ",
        "dt",
        "&#59;",
        "&#46;",
        " "
    };

    /** Regex fragments accepted in place of the "@" sign. */
    private static final String[] AT_CANDIDATES = {
        "@",
        " at ",
        "\\(at\\)",
        " a-t ",
        " a t ",
        "&#64;",
        "&#x40;",
        " where "
    };

    /** Top-level domains an address may end in; order is the regex alternation order. */
    private static final String[] TOP_LEVEL_DOMAINS = {
        "com", "gov", "edu", "mil", "tv", "info",
        "xxx", "travel", "org", "ac", "ad", "ae", "aero", "af", "ag", "ai",
        "al", "an", "ao", "aq", "ar", "arpa", "as", "asia", "at",
        "au", "aw", "ax", "az", "ba", "bb", "bd", "be", "bf", "bg",
        "bh", "bi", "biz", "bj", "bm", "bn", "bo", "br", "bs", "bt",
        "bv", "bw", "by", "bz", "ca", "cat", "cc", "cd", "cf", "cg",
        "ch", "ci", "ck", "cl", "cm", "cn", "co", "coop", "cr",
        "cu", "cv", "cw", "cx", "cy", "cz", "de", "dj", "dk", "dm",
        "do", "dz", "ec", "ee", "eg", "er", "es", "et", "eu",
        "fi", "fj", "fk", "fm", "fo", "fr", "ga", "gb", "gd", "ge",
        "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gp", "gq",
        "gr", "gs", "gt", "gu", "gw", "gy", "hk", "hm", "hn", "hr",
        "ht", "hu", "id", "ie", "il", "im", "int", "io",
        "iq", "ir", "it", "je", "jm", "jo", "jobs", "jp", "ke",
        "kg", "kh", "ki", "km", "kn", "kp", "kr", "kw", "ky", "kz",
        "la", "lb", "lc", "li", "lk", "lr", "ls", "lt", "lu", "lv",
        "ly", "ma", "mc", "md", "me", "mg", "mh", "mk", "ml",
        "mm", "mn", "mo", "mobi", "mp", "mq", "mr", "ms", "mt", "mu",
        "museum", "mv", "mw", "mx", "mz", "na", "name", "nc",
        "ne", "net", "nf", "ng", "ni", "nl", "np", "nr", "nu",
        "nz", "om", "pa", "pe", "pf", "pg", "ph", "pk", "pl",
        "pm", "pn", "pr", "pro", "ps", "pt", "pw", "py", "qa", "re",
        "ro", "rs", "ru", "rw", "sa", "sb", "sc", "sd", "se", "sg",
        "sh", "si", "sj", "sk", "sl", "sm", "sn", "so", "sr", "st",
        "su", "sv", "sx", "sy", "sz", "tc", "td", "tel", "tf", "tg",
        "th", "tj", "tk", "tl", "tm", "tn", "to", "tp", "tr",
        "tt", "tw", "tz", "ua", "ug", "uk", "us", "uy", "uz",
        "va", "vc", "ve", "vg", "vi", "vn", "vu", "wf", "ws",
        "ye", "yt", "za", "zm", "zw"
    };

    /** Regex fragments accepted between the three digit groups of a phone number. */
    private static final String[] DELIM_CANDIDATES = {
        " ",
        "-",
        "&#45;",
        "&#32;",
        "\\)"
    };

    // TODO: exclude \n (input is currently fed line by line via readLine(),
    // so a matched line never contains a line terminator anyway).
    private static final String WHITESPACE_SEP = "\\s";

    private static final String DOT_ALTERNATIVE = alternation(DOT_CANDIDATES);

    private static final String DELIM_ALTERNATIVE = alternation(DELIM_CANDIDATES);

    /** A run of one or more dot substitutes; collapsed to a single "." in matched domains. */
    private static final Pattern DOT_RUN_PATTERN = Pattern.compile(DOT_ALTERNATIVE + "+");

    private static final Pattern WHITESPACE_PATTERN = Pattern.compile(WHITESPACE_SEP);

    /** Group 1 is the local part, group 2 the (still obfuscated) domain. */
    private static final Pattern EMAIL_PATTERN = buildEmailPattern();

    /** Group 1 is the whole number, groups 2-4 are its three digit blocks. */
    private static final Pattern TEL_PATTERN = buildTelPattern();

    /** Non-capturing alternation of the phone delimiters; kept as a package-visible field. */
    String delimAlternative = DELIM_ALTERNATIVE;

    /** Joins regex fragments into one non-capturing group: {@code (?:a|b|c)}. */
    private static String alternation(String[] candidates) {
        StringBuilder group = new StringBuilder("(?:");
        for (int i = 0; i < candidates.length; i++) {
            if (i > 0) {
                group.append('|');
            }
            group.append(candidates[i]);
        }
        return group.append(')').toString();
    }

    /** Assembles the case-insensitive e-mail pattern from the candidate tables. */
    private static Pattern buildEmailPattern() {
        String atAlternative = alternation(AT_CANDIDATES);
        String tlDomain = alternation(TOP_LEVEL_DOMAINS);
        String domain = "\\w{2,}";
        String name = "\\w+(?:\\.\\w+)?";
        String contactName = "(" + name + ")";
        // One to three "label + dot substitute" pairs followed by a known TLD.
        // The trailing lookahead only requires that the TLD is not followed by
        // another word character, so an address at the very end of a line
        // matches as well (a consuming [^\w] would need one more character).
        String domainName = "("
                + "(?:" + domain
                + "(?:" + DOT_ALTERNATIVE + "|" + WHITESPACE_SEP + DOT_ALTERNATIVE + ")"
                + "){1,3}"
                + tlDomain + ")" + "(?!\\w)";
        return Pattern.compile("(?i)" + contactName
                + WHITESPACE_SEP + "?" + atAlternative + WHITESPACE_SEP + "?"
                + domainName);
    }

    /** Assembles the phone pattern: three digit blocks of 3, 3 and 4 digits with delimiters. */
    private static Pattern buildTelPattern() {
        String prefix = "[^\\d]?(\\(?(\\d{3})\\)?";
        String suffix1 = "(\\d{3})";
        String suffix2 = "(\\d{4}))";
        String suffix3 = "[^\\d]?";
        return Pattern.compile(prefix + DELIM_ALTERNATIVE + suffix1 + DELIM_ALTERNATIVE + suffix2 + suffix3);
    }

    /**
     * Returns the contacts found in the input, in order of appearance; for
     * each line the e-mail addresses precede the phone numbers.
     *
     * <p>The interface (arguments and return value) must stay unchanged
     * because it is called directly by the submission script. The reader is
     * always closed before returning. On an {@link IOException} the stack
     * trace is printed and the JVM exits with status 1.
     *
     * @param fileName name recorded in every returned contact
     * @param input    source text, read line by line
     * @return the contacts found; never {@code null}
     */
    public List<Contact> processFile(String fileName, BufferedReader input) {
        List<Contact> contacts = new ArrayList<Contact>();
        try {
            try {
                for (String line = input.readLine(); line != null; line = input.readLine()) {
                    collectEmails(fileName, line, contacts);
                    collectPhones(fileName, line, contacts);
                }
            } finally {
                // Close even when matching throws, so the file handle is not leaked.
                input.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(1);
        }
        return contacts;
    }

    /** Appends every e-mail address found in {@code line} to {@code contacts}. */
    private void collectEmails(String fileName, String line, List<Contact> contacts) {
        Matcher m = EMAIL_PATTERN.matcher(line);
        while (m.find()) {
            String name = m.group(1).toLowerCase(Locale.ROOT);
            // Post-processing filter: local parts containing "server" are dropped.
            if (name.contains("server")) {
                continue;
            }
            // Normalise the domain: every run of dot substitutes becomes ".",
            // then any remaining whitespace is removed.
            String domain = DOT_RUN_PATTERN.matcher(m.group(2)).replaceAll(".");
            domain = WHITESPACE_PATTERN.matcher(domain).replaceAll("");
            domain = domain.toLowerCase(Locale.ROOT);
            contacts.add(new Contact(fileName, "e", name + "@" + domain));
        }
    }

    /** Appends every phone number found in {@code line} to {@code contacts}. */
    private void collectPhones(String fileName, String line, List<Contact> contacts) {
        Matcher m = TEL_PATTERN.matcher(line);
        while (m.find()) {
            if (!containsListedDelimiter(m.group(1))) {
                continue;
            }
            // Groups 2-4 are pure digit blocks, so they need no further cleaning.
            contacts.add(new Contact(fileName, "p", m.group(2) + "-" + m.group(3) + "-" + m.group(4)));
        }
    }

    /**
     * Tells whether {@code text} literally contains one of the delimiter
     * table entries. The ")" entry is stored regex-escaped and therefore never
     * matches literally, so a number whose only delimiters are ")" is rejected.
     */
    private static boolean containsListedDelimiter(String text) {
        for (String delim : DELIM_CANDIDATES) {
            if (text.contains(delim)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Processes every file in a directory whose name does not start with ".".
     * The interface must stay unchanged because it is called directly by the
     * submission program.
     *
     * @param dirName path of the directory to scan
     * @return all contacts found in the directory's files
     * @throws IllegalArgumentException if {@code dirName} cannot be listed as a directory
     */
    public List<Contact> processDir(String dirName) {
        File[] entries = new File(dirName).listFiles();
        if (entries == null) {
            // listFiles() yields null for a non-directory or on an I/O error.
            throw new IllegalArgumentException("Cannot list directory: " + dirName);
        }
        List<Contact> contacts = new ArrayList<Contact>();
        for (File f : entries) {
            if (f.getName().startsWith(".")) {
                continue;
            }
            try {
                BufferedReader input = new BufferedReader(new FileReader(f));
                contacts.addAll(processFile(f.getName(), input));
            } catch (IOException e) {
                e.printStackTrace();
                System.exit(1);
            }
        }
        return contacts;
    }

    /**
     * Reads a tab-separated gold file (file name, type, value per line) and
     * returns its contacts. Blank lines are skipped.
     *
     * @throws IllegalArgumentException if a non-blank line has fewer than three fields
     */
    private List<Contact> loadGold(String goldPath) {
        List<Contact> gold = new ArrayList<Contact>();
        try {
            BufferedReader input = new BufferedReader(new FileReader(goldPath));
            try {
                for (String line = input.readLine(); line != null; line = input.readLine()) {
                    if (line.trim().length() == 0) {
                        continue;
                    }
                    String[] toks = line.split("\t");
                    if (toks.length < 3) {
                        throw new IllegalArgumentException(
                                "Malformed gold line in " + goldPath + ": " + line);
                    }
                    gold.add(new Contact(toks[0], toks[1], toks[2]));
                }
            } finally {
                input.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(1);
        }
        return gold;
    }

    /**
     * Utility that turns a Set into a sorted list, for convenience when
     * looking at the output.
     */
    private List<Contact> asSortedList(Set<Contact> set) {
        List<Contact> list = Arrays.asList(set.toArray(new Contact[0]));
        Collections.sort(list);
        return list;
    }

    /**
     * Prints the intersection and the two differences of the guessed and gold
     * contacts, i.e. the true positives, false positives and false negatives,
     * followed by a one-line summary.
     */
    private void score(List<Contact> guesses, List<Contact> gold) {
        Set<Contact> guessSet = new HashSet<Contact>(guesses);
        Set<Contact> goldSet = new HashSet<Contact>(gold);
        System.out.println("guess_set.size()=" + guessSet.size() + "\tgold_set.size()=" + goldSet.size());

        Set<Contact> tp = new HashSet<Contact>(guessSet);
        tp.retainAll(goldSet);
        printSection("True Positives", tp);

        Set<Contact> fp = new HashSet<Contact>(guessSet);
        fp.removeAll(goldSet);
        printSection("False Positives", fp);

        Set<Contact> fn = new HashSet<Contact>(goldSet);
        fn.removeAll(guessSet);
        printSection("False Negatives", fn);

        System.out.println("Summary: tp=" + tp.size() + "\tfp=" + fp.size() + "\tfn=" + fn.size());
    }

    /** Prints a titled, sorted listing of one result category. */
    private void printSection(String title, Set<Contact> members) {
        List<Contact> sorted = asSortedList(members);
        System.out.println(title + " (" + sorted.size() + ")\t###############################");
        for (Contact contact : sorted) {
            System.out.println(contact);
        }
    }

    /**
     * Takes a directory and a file with the gold contacts, extracts the
     * contacts from each file in the directory and compares them to the
     * contacts listed in the gold file.
     */
    public static void main(String[] args) {
        if (args.length != 2) {
            System.err.println("usage:\tSpamLord <data_dir> <gold_file>");
            System.exit(0);
        }
        SpamLord vader = new SpamLord();
        List<Contact> guesses = vader.processDir(args[0]);
        List<Contact> gold = vader.loadGold(args[1]);
        vader.score(guesses, gold);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment