Skip to content

Instantly share code, notes, and snippets.

@pjazdzewski1990
Created February 4, 2016 07:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pjazdzewski1990/f593d80194e2e665b253 to your computer and use it in GitHub Desktop.
Save pjazdzewski1990/f593d80194e2e665b253 to your computer and use it in GitHub Desktop.
/**
 * Tokenises the target phrase and every author's commit messages, then
 * delegates the actual scoring to `countSimilarity`.
 *
 * Both the target and each commit message are split on a single space
 * character, so consecutive spaces produce empty-string tokens.
 *
 * NOTE(review): `countSimilarity` is not defined in the visible snippet; the
 * private overload of `findSimilar` taking an `Array[String]` has a matching
 * parameter list — confirm which one is meant to be called here.
 *
 * @param target phrase to compare against, as one space-separated string
 * @param rdd    pairs of (author, commit messages)
 * @return whatever `countSimilarity` produces for the tokenised input
 */
def findSimilar(target: String, rdd: RDD[(String, Seq[String])]): RDD[(String, Double, Seq[String])] = {
  // Flatten each author's commit messages into a single sequence of words.
  val wordsByAuthor = rdd.map { case (author, commits) =>
    author -> commits.flatMap(message => message.split(" "))
  }
  val targetWords = target.split(" ")
  countSimilarity(targetWords, wordsByAuthor)
}
/**
 * Scores each author's words against the target terms and orders the authors
 * from highest to lowest total score.
 *
 * For every author, each (word, target term) combination is passed to
 * `scoreSimilarity`; the author's score is the sum over all combinations.
 * No combination is filtered out, so the returned word sequence contains each
 * word once per target term, regardless of the score it received.
 *
 * NOTE(review): the public `findSimilar` calls a helper named
 * `countSimilarity` with this same parameter list — confirm whether this
 * method was meant to carry that name.
 *
 * @param target terms to compare every word against
 * @param rdd    pairs of (author, words)
 * @return triples of (author, summed score, words paired with the target
 *         terms), sorted by score in descending order
 */
private def findSimilar(target: Array[String], rdd: RDD[(String, Seq[String])]): RDD[(String, Double, Seq[String])] = {
  val scoredAuthors = rdd.map { case (author, words) =>
    // Cartesian product of the author's words and the target terms,
    // iterating words in the outer position and terms in the inner one.
    val scoredPairs = words.flatMap { word =>
      target.map(term => (scoreSimilarity(term, word), word))
    }
    val (scores, pairedWords) = scoredPairs.unzip
    (author, scores.sum, pairedWords)
  }
  // Highest total score first.
  scoredAuthors.sortBy(_._2, ascending = false)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment