Skip to content

Instantly share code, notes, and snippets.

@dkuppitz
Created June 3, 2013 09:41
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dkuppitz/5697149 to your computer and use it in GitHub Desktop.
MapReduce using CoffeeScript
#!/bin/sh
# Download and unpack Hadoop 1.1.2 into /tmp, then open the env config so
# JAVA_HOME can be set for a single-node (pseudo-distributed) deployment.
set -e  # abort immediately if any step fails (e.g. the download), so we never
        # try to extract or configure a missing/partial archive

cd /tmp/
wget http://mirror.derwebwolf.net/apache/hadoop/common/hadoop-1.1.2/hadoop-1.1.2.tar.gz
tar xfz hadoop-1.1.2.tar.gz
cd hadoop-1.1.2/
# TODO: set JAVA_HOME
nano conf/hadoop-env.sh
# TODO: http://hadoop.apache.org/docs/stable/single_node_setup.html#PseudoDistributed
#!/usr/bin/coffee
# Hadoop-streaming mapper: reads raw text on stdin and emits one
# "<word>\t1" record per purely-alphabetic token, one record per line.
stdin  = process.stdin
stdout = process.stdout

# Tokenize a single input line on spaces and write a "<token>\t1" record
# for every token that is entirely alphabetic; everything else is dropped.
processLine = (line) ->
  line?.trim().split(' ').map (token) ->
    stdout.write "#{token}\t1\n" if /^[a-z]+$/i.test token

buffer = ""
stdin.setEncoding 'utf8'

# Accumulate chunks, normalize CRLF to LF, and hand each complete
# newline-terminated line to processLine as soon as it is available.
stdin.on "data", (chunk) ->
  buffer += chunk
  buffer = buffer.replace /\r\n/g, "\n"
  until (pos = buffer.indexOf "\n") is -1
    processLine buffer.slice(0, pos + 1)
    buffer = buffer.slice pos + 1

# Flush any trailing partial line (input that did not end with a newline).
stdin.on "end", ->
  processLine buffer

stdin.resume()
#!/usr/bin/coffee
# Hadoop-streaming reducer: the framework delivers records sorted by key, so
# identical words arrive adjacently; sum each run of counts and emit one
# "<word>\t<total>" line per word.
stdin = process.stdin
stdout = process.stdout
data = ""
cw = null  # current word — key of the run currently being summed
cc = 0     # accumulated count for cw

# Consume one "<word>\t<count>" record. When the key changes, flush the
# previous word's total. Malformed or non-numeric records are ignored.
processLine = (line) ->
  s = line?.trim().split "\t"
  if s?.length == 2            # soak: also guards against a null/undefined line
    w = s[0].trim()
    c = parseInt s[1], 10      # explicit radix — "010" must parse as ten, not octal
    unless isNaN c
      if cw == w
        cc += c
      else
        stdout.write("#{cw}\t#{cc}\n") if cw
        cw = w
        cc = c

stdin.setEncoding 'utf8'
stdin.on "data", (chunk) ->
  data += chunk
  data = data.replace /\r\n/g, "\n"
  while data.indexOf("\n") > -1
    i = data.indexOf("\n") + 1
    processLine data.slice(0, i)
    data = data.slice i
stdin.on "end", ->
  processLine data
  stdout.write("#{cw}\t#{cc}\n") if cw  # flush the final run at EOF
stdin.resume()
#!/bin/sh
# Fetch a Project Gutenberg text, run the CoffeeScript word-count job via
# Hadoop streaming, then merge the per-reducer outputs into one local file.
HADOOP=/tmp/hadoop-1.1.2/bin/hadoop
STREAMING_JAR=/tmp/hadoop-1.1.2/contrib/streaming/hadoop-streaming-1.1.2.jar

cd
wget http://www.gutenberg.org/cache/epub/5000/pg5000.txt
$HADOOP dfs -copyFromLocal pg5000.txt /user/vagrant/pg5000.txt
$HADOOP jar $STREAMING_JAR \
  -file ./mapper.coffee -mapper ./mapper.coffee \
  -file ./reducer.coffee -reducer ./reducer.coffee \
  -input /user/vagrant/pg5000.txt \
  -output /user/vagrant/wc-output
$HADOOP dfs -getmerge /user/vagrant/wc-output ./wordcount.txt
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment