Skip to content

Instantly share code, notes, and snippets.

@veekaybee
Last active November 7, 2021 13:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save veekaybee/783fd4a9ccca555a716de46af698733a to your computer and use it in GitHub Desktop.
Save veekaybee/783fd4a9ccca555a716de46af698733a to your computer and use it in GitHub Desktop.
import com.twitter.scalding._
/**
 * Scalding job that reports the ten most frequent words in a text file.
 *
 * Every input line is tokenized, occurrences are counted per word, and the
 * (word, count) pairs are ordered by descending count before the first ten
 * are kept.
 *
 * @param args job arguments; the optional `--input` argument names the text
 *             file to read and defaults to `posts.txt`
 */
class WordCountJob(args: Args) extends Job(args) {
  val lines: TypedPipe[String] =
    TypedPipe.from(TextLine(args.getOrElse("input", "posts.txt")))

  lines
    .flatMap { line => tokenize(line) }
    .groupBy { word => word }
    .size
    // Put every (word, count) pair under one key so they can be sorted together.
    .groupAll
    // Negating the count orders the pairs from most to least frequent.
    .sortBy { case (_, count) => -count }
    .take(10)
    // NOTE(review): `dump` is not defined in this file; presumably it comes from
    // Scalding's REPL implicits -- confirm that it resolves inside a Job.
    .dump

  /**
   * Split a piece of text into individual words.
   *
   * The text is lowercased, every character other than ASCII letters, digits
   * and whitespace is removed, and the remainder is split on runs of
   * whitespace.
   *
   * @param text the raw text to tokenize
   * @return the non-empty words in order of appearance; empty when the text
   *         contains no letters or digits
   */
  def tokenize(text: String): Array[String] =
    text
      // Locale.ROOT keeps lowercasing independent of the JVM's default locale.
      .toLowerCase(java.util.Locale.ROOT)
      .replaceAll("[^a-zA-Z0-9\\s]", "")
      .split("\\s+")
      // split yields "" for empty input and for leading whitespace; such
      // tokens are not words and must not be counted.
      .filter(_.nonEmpty)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment