Skip to content

Instantly share code, notes, and snippets.

@shkesar
Created February 10, 2015 09:52
Show Gist options
  • Save shkesar/201edf8b08b657732bb4 to your computer and use it in GitHub Desktop.
Save shkesar/201edf8b08b657732bb4 to your computer and use it in GitHub Desktop.
A version of NaiveBayes
// DON'T PUSH CHANGES MADE TO THIS FILE
// ADD IT TO YOUR .gitignore AFTER DOWNLOADING IT ONCE
import org.apache.spark.rdd.RDD
/**
 * Naive-Bayes-style classifier that labels a piece of text with one of `emotions`.
 *
 * `vocabEmotion`, `vocabEmotionLength`, `emotions` and `priorProbabilities` are parallel
 * arrays: entry `i` of each one describes the same emotion.
 *
 * The class is `Serializable` because `predict(tweetRDD)` references `predict` from inside
 * an RDD transformation, which requires the enclosing instance to be shipped with the closure.
 *
 * NOTE(review): the score of an emotion is its prior PLUS the sum of the smoothed conditional
 * probabilities of the words, not the product (or log-sum) used by textbook Naive Bayes.
 * That scoring rule is kept unchanged here so existing predictions keep their meaning.
 *
 * @param lambda             additive smoothing constant applied to every term count
 * @param vocabSize          number of terms in the vocabulary, used in the smoothing denominator
 * @param vocabEmotion       for each emotion, every training word (with repetitions)
 * @param vocabEmotionLength for each emotion, the total number of training words
 * @param emotions           the labels that `predict` can return
 * @param priorProbabilities for each emotion, its prior probability
 */
class NaiveBayesModel(val lambda: Double,
                      val vocabSize: Long,
                      private val vocabEmotion: Array[Seq[String]],
                      private val vocabEmotionLength: Array[Int],
                      val emotions: Array[String],
                      val priorProbabilities: Array[Double]) extends Serializable {

  // Per-emotion term frequencies, built once so that scoring a word is a hash lookup
  // instead of a scan over every training word of that emotion.
  private val termCounts: Array[Map[String, Int]] =
    vocabEmotion.map(_.groupBy(identity).map { case (term, occurrences) => term -> occurrences.size })

  // Orders (score, emotion) pairs by score only.
  private val maxEmotion: Ordering[(Double, String)] = new Ordering[(Double, String)] {
    override def compare(x: (Double, String), y: (Double, String)): Int =
      java.lang.Double.compare(x._1, y._1)
  }

  /**
   * Scores `tweet` against every emotion and returns the best-scoring one.
   *
   * The text is tokenised by splitting on single spaces. The (score, emotion) pairs are
   * printed to standard output before the winner is returned.
   *
   * @param tweet the text to classify
   * @return the emotion with the highest score
   * @throws UnsupportedOperationException if the model has no emotions to choose from
   */
  def predict(tweet: String): String = {
    val words = tweet.split(" ")
    // zipWithIndex covers every configured emotion, not just the first three.
    val emotionScore = priorProbabilities.zipWithIndex.map { case (priorProb, index) =>
      words.foldLeft(priorProb)((sum, word) => sum + wordCondProbability(word, index))
    }.zip(emotions).toSeq
    println(emotionScore.mkString(" "))
    emotionScore.max(maxEmotion)._2
  }

  /**
   * Classifies every tweet of `tweetRDD`.
   *
   * @param tweetRDD the texts to classify
   * @return an RDD holding the predicted emotion of each input text
   */
  def predict(tweetRDD: RDD[String]): RDD[String] =
    tweetRDD.map(tweet => predict(tweet))

  /**
   * Smoothed probability of `word` under the emotion at `index`:
   * `(count + lambda) / (emotionWords + lambda * vocabSize)`.
   *
   * Returns 0.0 when the denominator is not positive (an emotion without training words and
   * no smoothing), so that such an emotion contributes nothing instead of NaN or Infinity.
   */
  private def wordCondProbability(word: String, index: Int): Double = {
    val termCount = termCounts(index).getOrElse(word, 0)
    val denominator = vocabEmotionLength(index) + lambda * vocabSize
    if (denominator > 0) (termCount + lambda) / denominator else 0.0
  }
}
import scalax.io.{LongTraversable, Resource}
/**
 * Builds a `NaiveBayesModel` from a dictionary file whose lines have the form
 * `emotion,space separated words`.
 */
object NaiveBayes {

  /**
   * Trains a model from the dictionary at `dictionaryPath`.
   *
   * The prior of an emotion is the fraction of ALL lines of the file that carry its label.
   * The vocabulary size is the number of distinct words seen across all requested emotions.
   *
   * @param dictionaryPath path of the dictionary file
   * @param emotions       labels to learn; lines with any other label only count towards the
   *                       total number of lines
   * @param lambda         smoothing constant handed to the model (defaults to 1.0)
   * @return the trained model
   */
  def train(dictionaryPath: String, emotions: Array[String], lambda: Double = 1.0): NaiveBayesModel = {
    // Materialise the file once; every later step works on this in-memory copy instead of
    // traversing the file-backed collection again for each emotion.
    val dictionaryLines = Resource.fromFile(dictionaryPath).lines().toList
    val totalLines = dictionaryLines.size
    val priorProb = emotions.map(getEmotionLineCount(dictionaryLines, _).toDouble / totalLines)
    val vocabEmotion = emotions.map(genEmotionWords(dictionaryLines, _))
    val vocabEmotionLength = vocabEmotion.map(_.length)
    // Deduplicate AFTER merging the emotions so a word shared by several emotions counts once.
    val vocabSize = vocabEmotion.flatMap(_.toList).distinct.size
    new NaiveBayesModel(lambda, vocabSize, vocabEmotion, vocabEmotionLength, emotions, priorProb)
  }

  // helper functions

  /** All words (with repetitions) of the lines labelled `emotion`; lines lacking a text field are skipped. */
  private def genEmotionWords(dictionaryLines: Seq[String], emotion: String): Seq[String] =
    dictionaryLines
      .map(_.split(","))
      .collect { case fields if fields.length > 1 && fields(0) == emotion => fields(1) }
      .flatMap(_.split(" "))

  /** Number of lines whose first comma-separated field equals `emotion`. */
  private def getEmotionLineCount(dictionaryLines: Seq[String], emotion: String): Int =
    dictionaryLines.count(_.split(",")(0) == emotion)
}
// Demo: train on the local sentiment dictionary with lambda = 0.0, then classify a few
// sample texts in order. Each call prints the per-emotion scores from inside `predict`.
val dictionaryPath = "/Users/shubham/projects/thealth/data/dict/sentiment_dictionary.txt"
val emotionLabels = Array("positive", "negative", "neutral")
val model = NaiveBayes.train(dictionaryPath, emotionLabels, 0.0)

val sampleTweets = Seq(
  "",
  "happy is thy man who has food",
  "precious is happy",
  "Apple and YouTube, with @PTXofficial's help, surprise all at Clive Davis' pre-#GRAMMYs party http://on.mash.to/1EPFsQQ",
  "The Apple Watch may sound cool but here's why it's going to be a flop ",
  "serious ferociously sinister bowdlerize leer inflammatory exuberance outshine easiness"
)
sampleTweets.foreach(tweet => model.predict(tweet))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment