Skip to content

Instantly share code, notes, and snippets.

@jbarrus
Last active December 25, 2015 17:39
Show Gist options
  • Save jbarrus/7014768 to your computer and use it in GitHub Desktop.
Save jbarrus/7014768 to your computer and use it in GitHub Desktop.
scalding wordcount - top 25 words
package com.twitter.scalding.examples
import com.twitter.scalding._
/**
 * Scalding job that counts how often each word occurs in a text input and
 * writes the 25 highest-ranked `(size, word)` rows as tab-separated values.
 *
 * Job arguments read by this class:
 *  - `input`  – location passed to `TextLine` as the source
 *  - `output` – location passed to `Tsv` as the sink
 *
 * @param args job arguments; must contain `input` and `output`
 */
class WordCountJob(args: Args) extends Job(args) {

  TextLine(args("input"))
    // Emit one 'word value per token of every 'line.
    .flatMap('line -> 'word) { line: String => tokenize(line) }
    // Count occurrences per distinct word; the count is read back as 'size below.
    .groupBy('word) { _.size }
    // Across all words, keep the 25 largest (size, word) pairs in a single 'top field.
    .groupAll { _.sortedReverseTake[(Long, String)](('size, 'word) -> 'top, 25) }
    // Expand the collected 'top list back into one (size, word) row per entry.
    .flattenTo[(Long, String)]('top -> ('size, 'word))
    .write(Tsv(args("output")))

  /**
   * Splits a piece of text into individual lowercase words.
   *
   * Every character that is not an ASCII letter or whitespace is removed, and
   * the remainder is split on runs of whitespace. Empty tokens are dropped:
   * `String.split` yields a single `""` for blank input and a leading `""`
   * when the text starts with whitespace, and those would otherwise be
   * counted as a word.
   *
   * @param text the text to tokenize
   * @return the non-empty lowercase words of `text`, in order of appearance;
   *         an empty array when `text` contains no letters
   */
  def tokenize(text: String): Array[String] =
    text
      // Locale.ROOT keeps lowercasing locale-independent (e.g. "I" -> "i" even
      // when the JVM default locale is Turkish, where it would become "ı" and
      // then be stripped by the ASCII-only filter below).
      .toLowerCase(java.util.Locale.ROOT)
      .replaceAll("[^a-zA-Z\\s]", "")
      .split("\\s+")
      .filter(_.nonEmpty)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment