Skip to content

Instantly share code, notes, and snippets.

@whitlockjc
Created May 13, 2014 20:23
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save whitlockjc/21454d46f2d3e1278761 to your computer and use it in GitHub Desktop.
Save whitlockjc/21454d46f2d3e1278761 to your computer and use it in GitHub Desktop.
Node.js Based Wordcount MapReduce Job
'use strict';
var FileInputFormat = require('hadoop-input').FileInputFormat;
var FileOutputFormat = require('hadoop-output').FileOutputFormat;
// Counter group passed as the first argument to context.getCounter() by map() and reduce()
var counterGroup = 'Word Count Counters';
// Counter names within counterGroup: the first is looked up by reduce(), the second by map()
var uniqueWordCounterName = 'Unique Words';
var totalWordsCounterName = 'Total Words';
// Counter handles cached at module level: each one is fetched from the context on the
// first reduce()/map() call that finds it unset and is reused by later calls
var uniqueWordsCounter;
var totalWordsCounter;
module.exports = {
jobSetup: function (job, cb) {
// Default to /tmp/wordcount/4300.txt for input if not already specified (CLI or conf file)
if (FileInputFormat.getInputPaths(job).length === 0) {
FileInputFormat.setInputPaths(job, '/tmp/wordcount/4300.txt');
}
// Default to /tmp/wordcount/uniques for output if not already specified (CLI or conf file)
if (!FileOutputFormat.getOutputPath(job)) {
FileOutputFormat.setOutputPath(job, '/tmp/wordcount/uniques/');
}
// Set the map and reduce output key/value types since they cannot be inferred
job.setMapOutputKeyClass('org.apache.hadoop.io.Text')
.setMapOutputValueClass('org.apache.hadoop.io.IntWritable')
.setOutputKeyClass('org.apache.hadoop.io.Text')
.setOutputValueClass('org.apache.hadoop.io.IntWritable');
cb();
},
map: function (key, value, context, cb) {
var line = value;
// Keep track of total words seen
if (!totalWordsCounter) {
totalWordsCounter = context.getCounter(counterGroup, totalWordsCounterName);
}
// Replace all characters that aren't a letter or a hyphen with a space
line = line.replace(/[^a-zA-Z0-9\-]/g, ' ');
// Collapse all whitespace
line = line.replace(/s+/g, ' ');
// Split the line by space and iterate over each word
line.split(' ').forEach(function (word) {
// Emit the word and a 1
context.write(word, 1);
totalWordsCounter.increment(1);
});
// Call the callback
cb();
},
reduce: function (key, values, context, cb) {
var count = 0;
// Keep track of unique words seen
if (!uniqueWordsCounter) {
uniqueWordsCounter =context.getCounter(counterGroup, uniqueWordCounterName);
}
uniqueWordsCounter.increment(1);
// Sum up the count of this word
while (values.hasNext()) {
count += values.next();
}
// Emit the count
context.write(key, count);
// Call the callback
cb();
}
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment