Created
May 9, 2010 16:23
-
-
Save alienrobotwizard/395254 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'rubygems' | |
require 'wukong' | |
# Mapper that emits one record per (letter, position) occurrence in a line.
#
# Each input line is normalized to a fixed width and lowercased; for every
# position holding an ASCII letter a-z the mapper yields ["<letter>,<pos>", 1].
# NOTE(review): relies on Wukong::Streamer::LineStreamer calling #process once
# per input line with a block that collects the output -- confirm against Wukong.
class LetterMapper < Wukong::Streamer::LineStreamer
  # Characters that are counted. Built once rather than on every #process call.
  VALID_LETTERS = ('a'..'z').to_a.freeze

  # Builds a position => character lookup for the normalized form of +text+.
  #
  # @param text [String] raw input text
  # @return [Hash{Integer => String}] zero-based index mapped to the single
  #   character found at that index of the fixed-length, lowercased text
  def map_text(text)
    fixed_length_text(text).each_char.each_with_index.each_with_object({}) do |(char, idx), positions|
      positions[idx] = char
    end
  end

  # Normalizes +text+ to +len+ characters, lowercased.
  #
  # Longer text is truncated, shorter text is right-padded with spaces. Text
  # that is already exactly +len+ characters long is lowercased as well;
  # previously it was returned untouched, so its uppercase letters were
  # skipped by #process.
  #
  # @param text [String] raw input text
  # @param len [Integer] target width (defaults to 140)
  # @return [String] the truncated or padded, lowercased text
  def fixed_length_text(text, len = 140)
    text[0, len].downcase.ljust(len)
  end

  # Joins a letter and its position into the comma-separated key form.
  #
  # @param letter [String]
  # @param pos [Integer, #to_s]
  # @return [String] e.g. "a,3"
  def join_letter_pos(letter, pos)
    [letter, pos.to_s].join(",")
  end

  # Yields a ["<letter>,<pos>", 1] pair for every a-z character in +line+.
  #
  # Characters outside a-z (digits, punctuation, padding spaces, non-ASCII)
  # are skipped.
  #
  # @param line [String] one input line
  # @yieldparam record [Array(String, Integer)] key and the count 1
  def process(line)
    map_text(line).each do |pos, letter|
      next unless VALID_LETTERS.include?(letter)

      yield [join_letter_pos(letter, pos), 1]
    end
  end
end
# Reducer that counts how many records were accumulated for a key and emits
# the key (its comma-separated parts re-joined with tabs) with that count.
# NOTE(review): +key+ and the start!/accumulate/finalize call order are
# presumably supplied by Wukong::Streamer::AccumulatingReducer -- confirm.
class LetterReducer < Wukong::Streamer::AccumulatingReducer
  # Running number of records seen since the last #start!.
  attr_accessor :key_count

  # Resets the running count to zero; any arguments are ignored.
  def start!(*_args)
    self.key_count = 0
  end

  # Adds one to the running count; any arguments are ignored.
  def accumulate(*_args)
    self.key_count = key_count + 1
  end

  # Yields [tab-separated key fields, count].
  def finalize
    fields = key.split(",")
    yield [fields.join("\t"), key_count]
  end
end
# Execute the script: hand the mapper and reducer classes to Wukong and run.
Wukong::Script.new(LetterMapper, LetterReducer).run
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment