mjg123/WordCounter.java

## hadoop-job.xml
<!-- Assembly descriptor with id "job": produces a jar (no base directory) that holds the
     runtime dependencies as packed jars under lib/ plus the project's own artifact unpacked. -->
<assembly>
    <id>job</id>
    <formats>
        <format>jar</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <dependencySets>
        <!-- Runtime-scope dependencies, kept as jars (unpack=false) under lib/;
             the project's own artifact is excluded from this set. -->
        <dependencySet>
            <unpack>false</unpack>
            <scope>runtime</scope>
            <outputDirectory>lib</outputDirectory>
            <excludes>
                <exclude>${groupId}:${artifactId}</exclude>
            </excludes>
        </dependencySet>
        <!-- Only the project's own artifact, unpacked (unpack=true) into the assembly. -->
        <dependencySet>
            <unpack>true</unpack>
            <includes>
                <include>${groupId}:${artifactId}</include>
            </includes>
        </dependencySet>
    </dependencySets>
</assembly>

## pom-snippet.xml
        <!-- Hadoop core API, version 1.0.3. Scope "provided": needed to compile, but expected to be
             supplied by the environment at run time (presumably the Hadoop cluster; confirm). -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-core</artifactId>
            <version>1.0.3</version>
            <scope>provided</scope>
        </dependency>

<!-- and... -->


    <!-- Runs maven-assembly-plugin's "single" goal during the package phase, using the
         hadoop-job.xml descriptor, and records net.mjg123.WordCounter as the manifest main class. -->
    <build>
        <plugins>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>2.2.1</version>
                <configuration>
                    <descriptors>
                        <!-- Path of the assembly descriptor, relative to the project base directory. -->
                        <descriptor>src/main/assembly/hadoop-job.xml</descriptor>
                    </descriptors>
                    <archive>
                        <manifest>
                            <mainClass>net.mjg123.WordCounter</mainClass>
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <!-- Binds the assembly to the package phase so it is built as part of packaging. -->
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

## WordCounter.java
package net.mjg123;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;
import java.util.StringTokenizer;

/**
 * Hadoop MapReduce job that counts how often each word occurs in a text input.
 *
 * <p>The mapper emits {@code (word, 1)} for every whitespace-separated token after
 * stripping leading and trailing non-word characters; the reducer sums the ones per word.
 */
public class WordCounter {

    /** Input path used when none is given on the command line. */
    private static final String DEFAULT_INPUT_PATH = "/user/hduser/gutenberg/ulysses.txt";

    /** Output path used when none is given on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "/user/hduser/goutput";

    /**
     * Splits each input line into whitespace-separated tokens and emits {@code (word, 1)}
     * for every token that still has content after punctuation stripping.
     */
    public static class MjgMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        // Compiled once instead of on every token; fully qualified so no new import is required.
        private static final java.util.regex.Pattern TRAILING_NON_WORD =
                java.util.regex.Pattern.compile("\\W+$");
        private static final java.util.regex.Pattern LEADING_NON_WORD =
                java.util.regex.Pattern.compile("^\\W+");

        // Reused across calls to avoid allocating a Writable per emitted record.
        private final IntWritable one = new IntWritable(1);
        private final Text word = new Text();

        /**
         * Emits one {@code (word, 1)} pair per non-empty cleaned token of the line.
         *
         * @param key     key supplied by the input format (not used)
         * @param value   one line of input text
         * @param context sink for the emitted pairs
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if writing to the context is interrupted
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            StringTokenizer tokenizer = new StringTokenizer(value.toString());

            while (tokenizer.hasMoreTokens()) {
                // Exactly one nextToken() per iteration: a second call here would drop the
                // cleaned word, skip every other token and throw NoSuchElementException on
                // lines with an odd number of tokens.
                String thisWord = stripNonWordEdges(tokenizer.nextToken());

                // Tokens made up only of punctuation (e.g. "--") have nothing left to count.
                if (thisWord.isEmpty()) {
                    continue;
                }

                word.set(thisWord);
                context.write(word, one);
            }
        }

        /** Removes trailing, then leading, runs of non-word characters from {@code token}. */
        private static String stripNonWordEdges(String token) {
            String withoutTail = TRAILING_NON_WORD.matcher(token).replaceAll("");
            return LEADING_NON_WORD.matcher(withoutTail).replaceAll("");
        }
    }

    /** Sums the counts emitted for each word and writes {@code (word, total)}. */
    public static class MjgReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        /**
         * Adds up all values received for {@code key} and emits the total.
         *
         * @param key     the word
         * @param values  the partial counts for that word
         * @param context sink for the {@code (word, total)} pair
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if writing to the context is interrupted
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable i : values) {
                sum += i.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    /**
     * Configures and runs the job, exiting with status 0 on success and 1 on failure.
     *
     * @param args optional {@code [inputPath [outputPath]]}; a missing argument falls back
     *             to the corresponding default path
     * @throws Exception if the job cannot be set up or run
     */
    public static void main(String[] args) throws Exception {

        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH;

        Configuration conf = new Configuration();

        Job job = new Job(conf, "dictionary");
        job.setJarByClass(WordCounter.class);
        job.setMapperClass(MjgMapper.class);
        job.setReducerClass(MjgReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

}
	<!-- Assembly descriptor with id "job": produces a jar (no base directory) that holds the
	     runtime dependencies as packed jars under lib/ plus the project's own artifact unpacked. -->
	<assembly>
	<id>job</id>
	<formats>
	<format>jar</format>
	</formats>
	<includeBaseDirectory>false</includeBaseDirectory>
	<dependencySets>
	<!-- Runtime-scope dependencies, kept as jars (unpack=false) under lib/;
	     the project's own artifact is excluded from this set. -->
	<dependencySet>
	<unpack>false</unpack>
	<scope>runtime</scope>
	<outputDirectory>lib</outputDirectory>
	<excludes>
	<exclude>${groupId}:${artifactId}</exclude>
	</excludes>
	</dependencySet>
	<!-- Only the project's own artifact, unpacked (unpack=true) into the assembly. -->
	<dependencySet>
	<unpack>true</unpack>
	<includes>
	<include>${groupId}:${artifactId}</include>
	</includes>
	</dependencySet>
	</dependencySets>
	</assembly>
	<!-- Hadoop core API, version 1.0.3. Scope "provided": needed to compile, but expected to be
	     supplied by the environment at run time (presumably the Hadoop cluster; confirm). -->
	<dependency>
	<groupId>org.apache.hadoop</groupId>
	<artifactId>hadoop-core</artifactId>
	<version>1.0.3</version>
	<scope>provided</scope>
	</dependency>

	<!-- and... -->



	<!-- Runs maven-assembly-plugin's "single" goal during the package phase, using the
	     hadoop-job.xml descriptor, and records net.mjg123.WordCounter as the manifest main class. -->
	<build>
	<plugins>
	<plugin>
	<artifactId>maven-assembly-plugin</artifactId>
	<version>2.2.1</version>
	<configuration>
	<descriptors>
	<!-- Path of the assembly descriptor, relative to the project base directory. -->
	<descriptor>src/main/assembly/hadoop-job.xml</descriptor>
	</descriptors>
	<archive>
	<manifest>
	<mainClass>net.mjg123.WordCounter</mainClass>
	</manifest>
	</archive>
	</configuration>
	<executions>
	<!-- Binds the assembly to the package phase so it is built as part of packaging. -->
	<execution>
	<id>make-assembly</id>
	<phase>package</phase>
	<goals>
	<goal>single</goal>
	</goals>
	</execution>
	</executions>
	</plugin>
	</plugins>
	</build>
	package net.mjg123;

	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.fs.Path;
	import org.apache.hadoop.io.IntWritable;
	import org.apache.hadoop.io.LongWritable;
	import org.apache.hadoop.io.Text;
	import org.apache.hadoop.mapreduce.Job;
	import org.apache.hadoop.mapreduce.Mapper;
	import org.apache.hadoop.mapreduce.Reducer;
	import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
	import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
	import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
	import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

	import java.io.IOException;
	import java.util.StringTokenizer;

	/**
	 * Hadoop MapReduce job that counts how often each word occurs in a text input.
	 *
	 * <p>The mapper emits {@code (word, 1)} for every whitespace-separated token after
	 * stripping leading and trailing non-word characters; the reducer sums the ones per word.
	 */
	public class WordCounter {

	    /** Input path used when none is given on the command line. */
	    private static final String DEFAULT_INPUT_PATH = "/user/hduser/gutenberg/ulysses.txt";

	    /** Output path used when none is given on the command line. */
	    private static final String DEFAULT_OUTPUT_PATH = "/user/hduser/goutput";

	    /**
	     * Splits each input line into whitespace-separated tokens and emits {@code (word, 1)}
	     * for every token that still has content after punctuation stripping.
	     */
	    public static class MjgMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

	        // Compiled once instead of on every token; fully qualified so no new import is required.
	        private static final java.util.regex.Pattern TRAILING_NON_WORD =
	                java.util.regex.Pattern.compile("\\W+$");
	        private static final java.util.regex.Pattern LEADING_NON_WORD =
	                java.util.regex.Pattern.compile("^\\W+");

	        // Reused across calls to avoid allocating a Writable per emitted record.
	        private final IntWritable one = new IntWritable(1);
	        private final Text word = new Text();

	        /**
	         * Emits one {@code (word, 1)} pair per non-empty cleaned token of the line.
	         *
	         * @param key     key supplied by the input format (not used)
	         * @param value   one line of input text
	         * @param context sink for the emitted pairs
	         * @throws IOException          if writing to the context fails
	         * @throws InterruptedException if writing to the context is interrupted
	         */
	        @Override
	        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

	            StringTokenizer tokenizer = new StringTokenizer(value.toString());

	            while (tokenizer.hasMoreTokens()) {
	                // Exactly one nextToken() per iteration: a second call here would drop the
	                // cleaned word, skip every other token and throw NoSuchElementException on
	                // lines with an odd number of tokens.
	                String thisWord = stripNonWordEdges(tokenizer.nextToken());

	                // Tokens made up only of punctuation (e.g. "--") have nothing left to count.
	                if (thisWord.isEmpty()) {
	                    continue;
	                }

	                word.set(thisWord);
	                context.write(word, one);
	            }
	        }

	        /** Removes trailing, then leading, runs of non-word characters from {@code token}. */
	        private static String stripNonWordEdges(String token) {
	            String withoutTail = TRAILING_NON_WORD.matcher(token).replaceAll("");
	            return LEADING_NON_WORD.matcher(withoutTail).replaceAll("");
	        }
	    }

	    /** Sums the counts emitted for each word and writes {@code (word, total)}. */
	    public static class MjgReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

	        /**
	         * Adds up all values received for {@code key} and emits the total.
	         *
	         * @param key     the word
	         * @param values  the partial counts for that word
	         * @param context sink for the {@code (word, total)} pair
	         * @throws IOException          if writing to the context fails
	         * @throws InterruptedException if writing to the context is interrupted
	         */
	        @Override
	        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
	            int sum = 0;
	            for (IntWritable i : values) {
	                sum += i.get();
	            }
	            context.write(key, new IntWritable(sum));
	        }
	    }

	    /**
	     * Configures and runs the job, exiting with status 0 on success and 1 on failure.
	     *
	     * @param args optional {@code [inputPath [outputPath]]}; a missing argument falls back
	     *             to the corresponding default path
	     * @throws Exception if the job cannot be set up or run
	     */
	    public static void main(String[] args) throws Exception {

	        String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
	        String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH;

	        Configuration conf = new Configuration();

	        Job job = new Job(conf, "dictionary");
	        job.setJarByClass(WordCounter.class);
	        job.setMapperClass(MjgMapper.class);
	        job.setReducerClass(MjgReducer.class);

	        job.setOutputKeyClass(Text.class);
	        job.setOutputValueClass(IntWritable.class);

	        job.setInputFormatClass(TextInputFormat.class);
	        job.setOutputFormatClass(TextOutputFormat.class);

	        FileInputFormat.addInputPath(job, new Path(inputPath));
	        FileOutputFormat.setOutputPath(job, new Path(outputPath));
	        System.exit(job.waitForCompletion(true) ? 0 : 1);
	    }

	}