Skip to content

Instantly share code, notes, and snippets.

@grf
Created November 11, 2011 04:13
Show Gist options
  • Save grf/1357174 to your computer and use it in GitHub Desktop.
Save grf/1357174 to your computer and use it in GitHub Desktop.
Simple word counter for Cloud Computing and Storage Program 2
// G R Fischer for Fall 2011 Cloud Computing and Storage
// WordCount - Programming assignment 2, part 1 of 2
// 2011-11-10
// Derived from many different example WordCount programs.
package org.myorg;
import java.io.IOException;
import java.util.*;
import java.util.regex.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
public class WordCount {
// The map task breaks a text up into individual words, emitting the intermediate
// key, value pairs:
//
// word, 1
//
// Some cleanup is done over the emitted words: they are lower-cased and odd characters
// are removed. Trailing or leading apostrophes are particularly troublesome with
// Shakespearean texts.
public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
// cleanup regexp to try to get Shakespeare's spelling consistent: but leaves internal apostrophes, dashes
private Pattern pattern = Pattern.compile("^[^0-9a-z]+|[^0-9a-z]+$|[^0-9a-z'-]");
private Matcher matcher = pattern.matcher(""); // we'll reset as we use
public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
String line = value.toString();
StringTokenizer tokenizer = new StringTokenizer(line);
while (tokenizer.hasMoreTokens()) {
matcher.reset(tokenizer.nextToken().toLowerCase());
word.set(matcher.replaceAll(""));
output.collect(word, one);
}
}
}
// Reduce, given input as a single word with a list of occurrence counts, e.g.
//
// foo, (1, 1, 2)
//
// folds to summed occurrences
//
// foo, 4
//
// Because we use this class as a combiner as well as a reducer, the reduction phase
// may receive counts greater than 1
public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
int sum = 0;
while (values.hasNext()) {
sum += values.next().get();
}
output.collect(key, new IntWritable(sum));
}
}
// Boilerplate job setup
/**
 * Configures the word-count job, runs it, and prints how long the run took.
 *
 * @param args {@code args[0]} is the input path and {@code args[1]} the output path;
 *             any further arguments are ignored
 * @throws Exception propagated from job configuration or execution
 */
public static void main(String[] args) throws Exception {
    // Fail with a usage message instead of an ArrayIndexOutOfBoundsException further down.
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <output path>");
        System.exit(2);
    }

    JobConf conf = new JobConf(WordCount.class);
    conf.setJobName("wordcount");

    // don't compress the output (was default on EC2 hadoop instance I used)
    conf.setBoolean("mapred.output.compress", false);

    // hardcoded instances; no particular reason for these numbers
    conf.setNumMapTasks(3);
    conf.setNumReduceTasks(2);

    // Setup intermediate key/value domain
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    // Note, use reducer as combiner
    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    // input/output plain text
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    // input/output directories
    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    // System.nanoTime() is meant for measuring elapsed time and is unaffected by changes
    // to the system clock, unlike differencing two Date instances.
    long startNanos = System.nanoTime();
    JobClient.runJob(conf);
    long elapsedMillis = (System.nanoTime() - startNanos) / 1000000L;
    System.out.println(String.format("WordCount took %d milliseconds to run", elapsedMillis));
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment