Created
January 14, 2010 20:14
-
-
Save adamrabung/277457 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.mbte.groovypp.examples.wordcount; | |
import java.io.BufferedReader; | |
import java.io.BufferedWriter; | |
import java.io.File; | |
import java.io.FileReader; | |
import java.io.IOException; | |
import java.io.PrintWriter; | |
import java.io.Writer; | |
import java.util.ArrayList; | |
import java.util.Collections; | |
import java.util.Comparator; | |
import java.util.LinkedHashMap; | |
import java.util.List; | |
import java.util.Map; | |
import java.util.Set; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
/** | |
* Optimization of the original JavaWordCount found here: | |
* http://code.google.com/p/groovypptest/source/browse/trunk/WordCount/src/org/mbte/groovypp/examples/wordcount/JavaWordCount.java | |
* 1. Collect the counts w/ a backing hash map (LinkedHashMap), rather than a TreeMap
* 2. Word count needs to be case-insensitive (which is actually an optimization) | |
* 3. Buffered writers | |
*/ | |
/**
 * Counts case-insensitive word occurrences ({@code \w+} matches) in every regular file
 * that sits exactly one directory level below a root directory, then writes the totals
 * to two text files in the current working directory: one sorted alphabetically and one
 * sorted by decreasing count.
 *
 * <p>Usage: {@code java JavaWordCount [rootDir]}. When no argument is given the
 * historical hard-coded corpus location is used.
 */
public class JavaWordCount {
    /** Corpus location used when no directory is passed on the command line. */
    private static final String DEFAULT_ROOT_DIR =
            "/Users/arabung/temp/groovypptest-read-only/WordCount/20_newsgroups";

    /** Number of times the whole count-and-write cycle is repeated and timed. */
    private static final int RUNS = 10;

    /** A word is a maximal run of regex word characters; compiled once and reused. */
    private static final Pattern WORD_PATTERN = Pattern.compile("\\w+");

    /** Orders entries by decreasing count. */
    private static final Comparator<Map.Entry<String, Integer>> SORT_BY_COUNT =
            new Comparator<Map.Entry<String, Integer>>() {
                public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                    // compareTo instead of subtraction: same ordering, no overflow to reason about.
                    return o2.getValue().compareTo(o1.getValue());
                }
            };

    /** Orders entries by word, using natural String ordering. */
    private static final Comparator<Map.Entry<String, Integer>> SORT_BY_ELEMENT =
            new Comparator<Map.Entry<String, Integer>>() {
                public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                    return o1.getKey().compareTo(o2.getKey());
                }
            };

    /**
     * Runs the word count {@link #RUNS} times, printing the elapsed milliseconds of each run
     * (the repetition presumably exists for benchmarking; each run overwrites the same files).
     *
     * @param args optional; {@code args[0]} is the root directory whose sub-directories
     *             contain the files to scan. Defaults to {@link #DEFAULT_ROOT_DIR}.
     * @throws IOException if a directory cannot be listed, a file cannot be read,
     *                     or an output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        File rootDir = new File((args != null && args.length > 0) ? args[0] : DEFAULT_ROOT_DIR);
        for (int i = 0; i < RUNS; ++i) {
            long timeStart = System.currentTimeMillis();
            CountingSet counter = countWords(rootDir);
            writeSorted(counter, SORT_BY_ELEMENT, "./counts-alphabetical-java.txt");
            writeSorted(counter, SORT_BY_COUNT, "./counts-decreasing-java.txt");
            System.out.println("Finished in " + (System.currentTimeMillis() - timeStart) + " ms");
        }
    }

    /** Tallies the words of every regular file found in the immediate sub-directories of rootDir. */
    private static CountingSet countWords(File rootDir) throws IOException {
        CountingSet counter = new CountingSet();
        for (File groupDirectory : listChildren(rootDir)) {
            if (groupDirectory.isDirectory()) {
                for (File f : listChildren(groupDirectory)) {
                    if (f.isFile()) {
                        countFile(f, counter);
                    }
                }
            }
        }
        return counter;
    }

    /** Adds every lower-cased word of one file to counter; the reader is closed even on failure. */
    private static void countFile(File file, CountingSet counter) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(file));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher matcher = WORD_PATTERN.matcher(line);
                while (matcher.find()) {
                    // Fixed locale so folding does not vary with the JVM default
                    // (e.g. the Turkish dotless-i mapping of 'I').
                    counter.add(matcher.group().toLowerCase(java.util.Locale.ENGLISH));
                }
            }
        } finally {
            reader.close();
        }
    }

    /** Writes the counter's entries, ordered by comp, to the file at path. */
    private static void writeSorted(CountingSet counter, Comparator<Map.Entry<String, Integer>> comp, String path)
            throws IOException {
        PrintWriter out = new PrintWriter(path);
        Writer writer = new BufferedWriter(out);
        try {
            sortAndDisplay(counter.entrySet(), comp, writer);
            writer.flush();
            // PrintWriter swallows IOExceptions; surface a failed write instead of losing it.
            if (out.checkError()) {
                throw new IOException("Failed writing " + path);
            }
        } finally {
            writer.close();
        }
    }

    /** Lists a directory, turning the null that File.listFiles() returns on failure into an IOException. */
    private static File[] listChildren(File dir) throws IOException {
        File[] children = dir.listFiles();
        if (children == null) {
            throw new IOException("Cannot list directory: " + dir);
        }
        return children;
    }

    /**
     * Copies the entries into a list, sorts the copy with comp and writes it out.
     * The sort is stable, so entries that compare equal keep the set's iteration order.
     */
    private static void sortAndDisplay(Set<Map.Entry<String, Integer>> set, Comparator<Map.Entry<String, Integer>> comp, Writer writer) throws IOException {
        List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(set);
        Collections.sort(list, comp);
        display(list, writer);
    }

    /** Writes one {@code "word : count"} line per entry, each terminated by a newline character. */
    private static void display(Iterable<java.util.Map.Entry<String, Integer>> list, Writer writer) throws IOException {
        for (Map.Entry<String, Integer> entry : list) {
            writer.write(entry.getKey() + " : " + entry.getValue() + "\n");
        }
    }

    /** Insertion-ordered map from word to the number of times it has been added. */
    private static class CountingSet extends LinkedHashMap<String, Integer> {
        private static final long serialVersionUID = 1L;

        /** Records one more occurrence of s, starting at 1 the first time it is seen. */
        void add(String s) {
            Integer current = get(s);
            put(s, Integer.valueOf(current == null ? 1 : current + 1));
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment