grf/WordCount.java

## WordCount.java
// G R Fischer for Fall 2011 Cloud Computing and Storage
// WordCount - Programming assignment 2, part 1 of 2
// 2011-11-10

// Derived from many different example WordCount programs.

package org.myorg;

import java.io.IOException;
import java.util.*;
import java.util.regex.*;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

/**
 * Hadoop word-count job written against the classic {@code org.apache.hadoop.mapred} API.
 *
 * <p>The mapper splits each input line into tokens, normalises every token and emits
 * {@code (word, 1)}. The reducer, which is also installed as the combiner, sums the
 * counts it receives for each word.
 */
public class WordCount {

    /**
     * Map task: breaks a line of text into individual words and emits the intermediate
     * key/value pair
     *
     * <pre>
     *     word, 1
     * </pre>
     *
     * for each of them. Words are lower-cased and odd characters are removed first;
     * trailing or leading apostrophes are particularly troublesome with Shakespearean
     * texts. Tokens that have nothing left after the cleanup are skipped, so no
     * empty-string key is ever emitted.
     */
    public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);

        // Cleanup regexp to try to get Shakespeare's spelling consistent. It removes
        //   - a leading run of characters outside [0-9a-z],
        //   - a trailing run of characters outside [0-9a-z],
        //   - any other character that is not [0-9a-z], an apostrophe or a dash,
        // which leaves internal apostrophes and dashes in place. Compiled once for the class.
        private static final Pattern CLEANUP = Pattern.compile("^[^0-9a-z]+|[^0-9a-z]+$|[^0-9a-z'-]");

        // Output key object, refilled for every emitted word instead of allocating a new Text.
        private final Text word = new Text();

        // Matcher bound to an empty input; it is reset onto each token as we go.
        private final Matcher matcher = CLEANUP.matcher("");

        /**
         * Emits {@code (word, 1)} for every non-empty cleaned-up token of one input line.
         *
         * @param key      key supplied by the input format; not used here
         * @param value    the line of text to split into words
         * @param output   collector receiving the {@code (word, 1)} pairs
         * @param reporter progress reporter; not used here
         * @throws IOException if the collector fails to accept a pair
         */
        public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            // StringTokenizer with no explicit delimiters splits on whitespace.
            StringTokenizer tokenizer = new StringTokenizer(value.toString());
            while (tokenizer.hasMoreTokens()) {
                // Locale.ROOT keeps the lower-casing independent of the JVM default locale
                // (e.g. a Turkish locale would map 'I' to a dotless i, which the ASCII-only
                // cleanup pattern would then strip out of the word).
                String token = tokenizer.nextToken().toLowerCase(Locale.ROOT);
                String cleaned = matcher.reset(token).replaceAll("");
                if (cleaned.isEmpty()) {
                    // The token was nothing but punctuation (e.g. "--"); it is not a word.
                    continue;
                }
                word.set(cleaned);
                output.collect(word, ONE);
            }
        }
    }


    /**
     * Reduce task: given a single word with a list of occurrence counts, e.g.
     *
     * <pre>
     *     foo, (1, 1, 2)
     * </pre>
     *
     * folds the list into the summed occurrences
     *
     * <pre>
     *     foo, 4
     * </pre>
     *
     * Because this class is used as the combiner as well as the reducer, the reduction
     * phase may receive counts greater than 1.
     */
    public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {

        /**
         * Sums all counts received for {@code key} and emits {@code (key, sum)}.
         *
         * @param key      the word being counted
         * @param values   the partial counts for that word
         * @param output   collector receiving the {@code (word, total)} pair
         * @param reporter progress reporter; not used here
         * @throws IOException if the collector fails to accept the pair
         */
        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            // NOTE: the total is an int; a count above Integer.MAX_VALUE would wrap around.
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }

    /**
     * Boilerplate setup: configures the job, runs it and prints the elapsed wall-clock time.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path;
     *             any further arguments are ignored
     * @throws Exception if configuring or running the job fails
     */
    public static void main(String[] args) throws Exception {
        // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: WordCount <input path> <output path>");
            System.exit(2);
        }

        JobConf conf = new JobConf(WordCount.class);

        conf.setJobName("wordcount");

        // don't compress the output (was default on EC2 hadoop instance I used)

        conf.setBoolean("mapred.output.compress", false);

        // hardcoded instances; no particular reason for these numbers

        conf.setNumMapTasks(3);
        conf.setNumReduceTasks(2);

        // Setup intermediate key/value domain

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        // Note, use reducer as combiner

        conf.setMapperClass(Map.class);
        conf.setCombinerClass(Reduce.class);
        conf.setReducerClass(Reduce.class);

        // input/output plain text

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);

        // input/output directories

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        // Time the job from submission until runJob returns.
        long startMillis = System.currentTimeMillis();
        JobClient.runJob(conf);
        long elapsedMillis = System.currentTimeMillis() - startMillis;

        System.out.println(String.format("WordCount took %d milliseconds to run", elapsedMillis));
    }
}
	// G R Fischer for Fall 2011 Cloud Computing and Storage
	// WordCount - Programming assignment 2, part 1 of 2
	// 2011-11-10

	// Derived from many different example WordCount programs.

	package org.myorg;

	import java.io.IOException;
	import java.util.*;
	import java.util.regex.*;

	import org.apache.hadoop.fs.Path;
	import org.apache.hadoop.conf.*;
	import org.apache.hadoop.io.*;
	import org.apache.hadoop.mapred.*;
	import org.apache.hadoop.util.*;

	public class WordCount {

	// The map task breaks a text up into individual words, emitting the intermediate
	// key, value pairs:
	//
	// word, 1
	//
	// Some cleanup is done over the emitted words: they are lower-cased and odd characters
	// are removed. Trailing or leading apostrophes are particularly troublesome with
	// Shakespearean texts.


	public static class Map extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable> {
	private final static IntWritable one = new IntWritable(1);
	private Text word = new Text();

	// cleanup regexp to try to get Shakespeare's spelling consistent: but leaves internal apostrophes, dashes

	private Pattern pattern = Pattern.compile("^[^0-9a-z]+\|[^0-9a-z]+$\|[^0-9a-z'-]");
	private Matcher matcher = pattern.matcher(""); // we'll reset as we use

	public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
	String line = value.toString();
	StringTokenizer tokenizer = new StringTokenizer(line);
	while (tokenizer.hasMoreTokens()) {
	matcher.reset(tokenizer.nextToken().toLowerCase());
	word.set(matcher.replaceAll(""));
	output.collect(word, one);
	}
	}
	}


	// Reduce, given input as single word with a list of occurences, e.g.
	//
	// foo, (1, 1, 2)
	//
	// folds to summed occurrences
	//
	// foo, 4
	//
	// Because we use this class as a combiner as well as a reducer, the reduction phase
	// may get occurences > 1

	public static class Reduce extends MapReduceBase implements Reducer<Text, IntWritable, Text, IntWritable> {
	public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
	int sum = 0;
	while (values.hasNext()) {
	sum += values.next().get();
	}
	output.collect(key, new IntWritable(sum));
	}
	}

	// Boiler plate setup

	public static void main(String[] args) throws Exception {
	JobConf conf = new JobConf(WordCount.class);

	conf.setJobName("wordcount");

	// don't compress the output (was default on EC2 hadoop instance I used)

	conf.setBoolean("mapred.output.compress", false);

	// hardcoded instances; no particular reason for these numbers

	conf.setNumMapTasks(3);
	conf.setNumReduceTasks(2);

	// Setup intermediate key/value domain

	conf.setOutputKeyClass(Text.class);
	conf.setOutputValueClass(IntWritable.class);

	// Note, use reducer as combiner

	conf.setMapperClass(Map.class);
	conf.setCombinerClass(Reduce.class);
	conf.setReducerClass(Reduce.class);

	// input/output plain text

	conf.setInputFormat(TextInputFormat.class);
	conf.setOutputFormat(TextOutputFormat.class);

	// input/output directories

	FileInputFormat.setInputPaths(conf, new Path(args[0]));
	FileOutputFormat.setOutputPath(conf, new Path(args[1]));

	Date start = new Date();
	JobClient.runJob(conf);
	Date now = new Date();

	System.out.println(String.format("WordCount took %d milliseconds to run", now.getTime() - start.getTime()));


	}
	}