Created
May 13, 2014 20:23
-
-
Save whitlockjc/21454d46f2d3e1278761 to your computer and use it in GitHub Desktop.
Node.js-Based Word Count MapReduce Job
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'use strict'; | |
var FileInputFormat = require('hadoop-input').FileInputFormat; | |
var FileOutputFormat = require('hadoop-output').FileOutputFormat; | |
// Group and display names passed to context.getCounter() for this job's counters.
var counterGroup = 'Word Count Counters';
var uniqueWordCounterName = 'Unique Words';
var totalWordsCounterName = 'Total Words';

// Cached counter handles. They start out undefined and are assigned on the
// first reduce() / map() call respectively, then reused by later calls.
var uniqueWordsCounter;
var totalWordsCounter;
module.exports = { | |
jobSetup: function (job, cb) { | |
// Default to /tmp/wordcount/4300.txt for input if not already specified (CLI or conf file) | |
if (FileInputFormat.getInputPaths(job).length === 0) { | |
FileInputFormat.setInputPaths(job, '/tmp/wordcount/4300.txt'); | |
} | |
// Default to /tmp/wordcount/uniques for output if not already specified (CLI or conf file) | |
if (!FileOutputFormat.getOutputPath(job)) { | |
FileOutputFormat.setOutputPath(job, '/tmp/wordcount/uniques/'); | |
} | |
// Set the map and reduce output key/value types since they cannot be inferred | |
job.setMapOutputKeyClass('org.apache.hadoop.io.Text') | |
.setMapOutputValueClass('org.apache.hadoop.io.IntWritable') | |
.setOutputKeyClass('org.apache.hadoop.io.Text') | |
.setOutputValueClass('org.apache.hadoop.io.IntWritable'); | |
cb(); | |
}, | |
map: function (key, value, context, cb) { | |
var line = value; | |
// Keep track of total words seen | |
if (!totalWordsCounter) { | |
totalWordsCounter = context.getCounter(counterGroup, totalWordsCounterName); | |
} | |
// Replace all characters that aren't a letter or a hyphen with a space | |
line = line.replace(/[^a-zA-Z0-9\-]/g, ' '); | |
// Collapse all whitespace | |
line = line.replace(/s+/g, ' '); | |
// Split the line by space and iterate over each word | |
line.split(' ').forEach(function (word) { | |
// Emit the word and a 1 | |
context.write(word, 1); | |
totalWordsCounter.increment(1); | |
}); | |
// Call the callback | |
cb(); | |
}, | |
reduce: function (key, values, context, cb) { | |
var count = 0; | |
// Keep track of unique words seen | |
if (!uniqueWordsCounter) { | |
uniqueWordsCounter =context.getCounter(counterGroup, uniqueWordCounterName); | |
} | |
uniqueWordsCounter.increment(1); | |
// Sum up the count of this word | |
while (values.hasNext()) { | |
count += values.next(); | |
} | |
// Emit the count | |
context.write(key, count); | |
// Call the callback | |
cb(); | |
} | |
}; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment