Skip to content

Instantly share code, notes, and snippets.

@mjg123
Created June 14, 2012 05:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mjg123/2928168 to your computer and use it in GitHub Desktop.
Save mjg123/2928168 to your computer and use it in GitHub Desktop.
Word count hadoop job notes
<assembly>
<id>job</id>
<formats>
<format>jar</format>
</formats>
<includeBaseDirectory>false</includeBaseDirectory>
<dependencySets>
<dependencySet>
<unpack>false</unpack>
<scope>runtime</scope>
<outputDirectory>lib</outputDirectory>
<excludes>
<exclude>${groupId}:${artifactId}</exclude>
</excludes>
</dependencySet>
<dependencySet>
<unpack>true</unpack>
<includes>
<include>${groupId}:${artifactId}</include>
</includes>
</dependencySet>
</dependencySets>
</assembly>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-core</artifactId>
<version>1.0.3</version>
<scope>provided</scope>
</dependency>
<!-- and... -->
<build>
<plugins>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<version>2.2.1</version>
<configuration>
<descriptors>
<descriptor>src/main/assembly/hadoop-job.xml</descriptor>
</descriptors>
<archive>
<manifest>
<mainClass>net.mjg123.WordCounter</mainClass>
</manifest>
</archive>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
package net.mjg123;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
import java.util.StringTokenizer;
public class WordCounter {
public static class MjgMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private final IntWritable one = new IntWritable(1);
private final Text word = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer tokenizer = new StringTokenizer(value.toString());
while(tokenizer.hasMoreTokens()){
String thisWord = tokenizer.nextToken();
thisWord = thisWord.replaceAll("\\W+$", "");
thisWord = thisWord.replaceAll("^\\W+", "");
word.set(thisWord);
word.set(tokenizer.nextToken());
context.write(word, one);
}
}
}
public static class MjgReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable i : values){
sum += i.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, "dictionary");
job.setJarByClass(WordCounter.class);
job.setMapperClass(MjgMapper.class);
job.setReducerClass(MjgReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
FileInputFormat.addInputPath(job, new Path("/user/hduser/gutenberg/ulysses.txt"));
FileOutputFormat.setOutputPath(job, new Path("/user/hduser/goutput"));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment