public
Last active

  • Download Gist
gistfile1.java
Java
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
package org.mbte.groovypp.examples.wordcount;
 
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
 
/**
 * Word-count micro-benchmark: counts case-insensitive word occurrences in every
 * file found one level below a root directory (root/group/file) and writes the
 * totals twice, once sorted alphabetically and once by decreasing count.
 *
 * <p>Optimization of the original JavaWordCount found here:
 * http://code.google.com/p/groovypptest/source/browse/trunk/WordCount/src/org/mbte/groovypp/examples/wordcount/JavaWordCount.java
 * <ol>
 * <li>Collect the counts with a hash-based map (a {@link LinkedHashMap}) rather than a
 *     sorted, tree-based map; sorting happens once, at output time.</li>
 * <li>Word count needs to be case-insensitive (which is actually an optimization).</li>
 * <li>Buffered writers.</li>
 * </ol>
 *
 * <p>Files are read and written with the platform default charset.
 */
public class JavaWordCount {

    /** Corpus location used when no directory is given on the command line. */
    private static final String DEFAULT_ROOT_DIR =
            "/Users/arabung/temp/groovypptest-read-only/WordCount/20_newsgroups";

    /** Output file for the alphabetically sorted counts. */
    private static final String ALPHABETICAL_OUTPUT = "./counts-alphabetical-java.txt";

    /** Output file for the counts sorted from most to least frequent. */
    private static final String DECREASING_OUTPUT = "./counts-decreasing-java.txt";

    /** Number of timed passes over the corpus. */
    private static final int RUNS = 10;

    /** A word is a maximal run of regex word characters; compiled once and reused. */
    private static final Pattern WORD_PATTERN = Pattern.compile("\\w+");

    /** Orders entries by decreasing count; compareTo avoids the overflow risk of subtraction. */
    private static final Comparator<Map.Entry<String, Integer>> SORT_BY_COUNT =
            new Comparator<Map.Entry<String, Integer>>() {
                @Override
                public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                    return o2.getValue().compareTo(o1.getValue());
                }
            };

    /** Orders entries by word, ascending. */
    private static final Comparator<Map.Entry<String, Integer>> SORT_BY_ELEMENT =
            new Comparator<Map.Entry<String, Integer>>() {
                @Override
                public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                    return o1.getKey().compareTo(o2.getKey());
                }
            };

    /**
     * Runs the count-and-write cycle {@link #RUNS} times, printing the elapsed
     * milliseconds of each pass to standard output.
     *
     * @param args optional; {@code args[0]} is the corpus root directory. When absent,
     *             {@link #DEFAULT_ROOT_DIR} is used.
     * @throws IOException if a directory cannot be listed, or a file cannot be read or written
     */
    public static void main(String[] args) throws IOException {
        File rootDir = new File((args != null && args.length > 0) ? args[0] : DEFAULT_ROOT_DIR);

        for (int i = 0; i < RUNS; ++i) {
            long timeStart = System.currentTimeMillis();

            CountingSet counter = countWords(rootDir);
            writeSorted(counter, SORT_BY_ELEMENT, ALPHABETICAL_OUTPUT);
            writeSorted(counter, SORT_BY_COUNT, DECREASING_OUTPUT);

            System.out.println("Finished in " + (System.currentTimeMillis() - timeStart) + " ms");
        }
    }

    /** Counts the words of every regular file located in a direct sub-directory of rootDir. */
    private static CountingSet countWords(File rootDir) throws IOException {
        CountingSet counter = new CountingSet();
        for (File groupDirectory : listChildren(rootDir)) {
            if (!groupDirectory.isDirectory()) {
                continue; // stray files at the top level are not part of any group
            }
            for (File f : listChildren(groupDirectory)) {
                if (f.isFile()) {
                    countWordsInFile(f, counter);
                }
            }
        }
        return counter;
    }

    /** Lists a directory, turning the null that File.listFiles() returns on failure into an IOException. */
    private static File[] listChildren(File dir) throws IOException {
        File[] children = dir.listFiles();
        if (children == null) {
            throw new IOException("Not a readable directory: " + dir);
        }
        return children;
    }

    /** Adds every lower-cased word of the given file to counter; the reader is always closed. */
    private static void countWordsInFile(File file, CountingSet counter) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(file));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher matcher = WORD_PATTERN.matcher(line);
                while (matcher.find()) {
                    // Locale.ROOT keeps the case folding independent of the JVM's default locale.
                    counter.add(matcher.group().toLowerCase(Locale.ROOT));
                }
            }
        } finally {
            reader.close();
        }
    }

    /** Writes the counts to path in the given order; the writer is always closed. */
    private static void writeSorted(CountingSet counter, Comparator<Map.Entry<String, Integer>> order, String path)
            throws IOException {
        // FileWriter (unlike PrintWriter) lets write failures surface as IOException.
        Writer writer = new BufferedWriter(new FileWriter(path));
        try {
            sortAndDisplay(counter.entrySet(), order, writer);
        } finally {
            writer.close();
        }
    }

    /**
     * Copies the entries into a list, sorts it with comp and writes it out.
     * The sort is stable, so entries that compare equal keep the iteration order of set.
     */
    private static void sortAndDisplay(Set<Map.Entry<String, Integer>> set, Comparator<Map.Entry<String, Integer>> comp, Writer writer) throws IOException {
        List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(set);
        Collections.sort(list, comp);
        display(list, writer);
    }

    /** Writes one "word : count" line per entry, in iteration order. */
    private static void display(Iterable<java.util.Map.Entry<String, Integer>> list, Writer writer) throws IOException {
        for (Map.Entry<String, Integer> entry : list) {
            writer.write(entry.getKey() + " : " + entry.getValue() + "\n");
        }
    }

    /** Insertion-ordered map from word to the number of times it has been added. */
    private static class CountingSet extends LinkedHashMap<String, Integer> {
        private static final long serialVersionUID = 1L;

        /** Records one more occurrence of s. */
        void add(String s) {
            Integer current = get(s);
            put(s, (current == null) ? Integer.valueOf(1) : Integer.valueOf(current + 1));
        }
    }
}

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.