shkesar/NaiveBayesSandBox.sc

## NaiveBayesSandBox.sc
// DON'T PUSH CHANGES MADE TO THIS FILE
// ADD IT TO YOUR .gitignore AFTER DOWNLOADING IT ONCE

import org.apache.spark.rdd.RDD

/**
 * Emotion classifier over pre-computed word statistics (normally built by `NaiveBayes.train`).
 *
 * The four arrays are parallel: entry `i` of each one describes the same emotion class.
 *
 * NOTE(review): a class score is its prior plus the SUM of the per-word conditional
 * probabilities, not the product / log-sum of textbook Naive Bayes. That scoring rule is
 * kept unchanged here -- confirm whether it is intended.
 *
 * The class is `Serializable` because `predict(RDD)` references `this` inside the closure
 * handed to `RDD.map`, and Spark has to serialize that closure to run it.
 *
 * @param lambda             additive smoothing constant applied to every word count
 * @param vocabSize          vocabulary size used in the smoothing denominator
 * @param vocabEmotion       per class, every word occurrence recorded for that class
 * @param vocabEmotionLength per class, the total number of word occurrences
 * @param emotions           class labels; `predict` returns one of these
 * @param priorProbabilities per class, the prior probability that seeds the score
 */
class NaiveBayesModel(val lambda: Double,
                      val vocabSize: Long,
                      private val vocabEmotion: Array[Seq[String]],
                      private val vocabEmotionLength: Array[Int],
                      val emotions: Array[String],
                      val priorProbabilities: Array[Double]) extends Serializable {

  // Word -> occurrence count for each class, built once so that scoring a word is a map
  // lookup instead of a scan over the class's whole word list. A strict `map` is used rather
  // than `mapValues`, whose lazy view is not serializable on older Scala versions.
  private val wordCounts: Array[Map[String, Int]] =
    vocabEmotion.map(_.groupBy(identity).map { case (word, occurrences) => word -> occurrences.size })

  /**
   * Scores `tweet` against every emotion and returns the label with the highest score.
   * On a tie the emotion listed first wins.
   *
   * @param tweet raw text; tokens are obtained by splitting on single spaces
   * @return the winning entry of `emotions`
   */
  def predict(tweet: String): String = {
    val words = tweet.split(" ")

    // Walk every class that has both a prior and a label instead of assuming exactly three.
    val emotionScore = priorProbabilities.zip(emotions).zipWithIndex.map {
      case ((priorProb, emotion), index) =>
        val score = words.foldLeft(priorProb) { (sum, word) =>
          sum + wordCondProbability(word, vocabEmotionLength(index), vocabSize, wordCounts(index))
        }
        (score, emotion)
    }.toSeq

    // Debug trace of every (score, emotion) pair.
    println(emotionScore.mkString(" "))

    emotionScore.max(maxEmotion)._2
  }

  /** Applies the single-tweet `predict` to every element of `tweetRDD`. */
  def predict(tweetRDD: RDD[String]): RDD[String] = {
    tweetRDD.map(tweet => predict(tweet))
  }

  /**
   * Smoothed conditional probability of `word` given one class:
   * `(count + lambda) / (vocabEmotionSize + lambda * vocabSize)`.
   *
   * Returns 0.0 when the denominator is zero (empty class with `lambda == 0`) so that no NaN
   * leaks into the score.
   */
  private def wordCondProbability(word: String,
                                  vocabEmotionSize: Long,
                                  vocabSize: Long,
                                  counts: Map[String, Int]): Double = {
    val termCount = counts.getOrElse(word, 0)
    val denominator = vocabEmotionSize + lambda * vocabSize
    if (denominator == 0.0) 0.0 else (termCount + lambda) / denominator
  }

  /** Orders `(score, emotion)` pairs by score only. */
  private def maxEmotion: Ordering[(Double, String)] = new Ordering[(Double, String)] {
    override def compare(x: (Double, String), y: (Double, String)): Int =
      java.lang.Double.compare(x._1, y._1)
  }

}

import scalax.io.{LongTraversable, Resource}

/**
 * Builds a [[NaiveBayesModel]] from a sentiment dictionary file.
 *
 * Every line is split on commas: field 0 is compared with the emotion label and field 1 is
 * split on single spaces into that emotion's words.
 */
object NaiveBayes {
  /**
   * Reads the dictionary once and derives the per-emotion statistics the model needs.
   *
   * @param dictionaryPath path of the dictionary file
   * @param emotions       labels to train for; the model's arrays follow this order
   * @param lambda         smoothing constant handed to the model unchanged
   * @return a model holding priors, per-emotion word lists and the vocabulary size
   */
  def train(dictionaryPath:String, emotions: Array[String], lambda: Double = 1.0): NaiveBayesModel = {
    // `lines()` is a lazy, file-backed traversable; materialise it once so the file is not
    // read again for the line count and for every helper call below.
    val dictionaryLines = Resource.fromFile(dictionaryPath).lines().toVector
    val totalLines = dictionaryLines.size

    // An empty dictionary yields 0.0 priors instead of the NaN produced by 0.0 / 0.
    val priorProb = emotions.map { emotion =>
      if (totalLines == 0) 0.0
      else getEmotionLineCount(dictionaryLines, emotion).toDouble / totalLines
    }
    val vocabEmotion = emotions.map(emotion => genEmotionWords(dictionaryLines, emotion))
    val vocabEmotionLength = vocabEmotion.map(_.length)
    // Distinct words across ALL emotions: a word listed under two emotions counts once.
    val vocabSize = vocabEmotion.flatMap(_.distinct).distinct.length

    new NaiveBayesModel(lambda, vocabSize, vocabEmotion, vocabEmotionLength, emotions, priorProb)
  }

  // helper functions

  /** Every word (duplicates kept) found in field 1 of the lines whose field 0 is `emotion`. */
  private def genEmotionWords(dictionaryLines: Seq[String], emotion: String): Seq[String] =
    for {
      line <- dictionaryLines
      fields = line.split(",")
      // Lines without a second field carry no words; skip them rather than index past the end.
      if fields.length > 1 && fields(0) == emotion
      word <- fields(1).split(" ")
    } yield word

  /** Number of lines whose field 0 equals `emotion`. */
  private def getEmotionLineCount(dictionaryLines: Seq[String], emotion: String): Int =
    // `headOption` because splitting a line made only of commas yields an empty array.
    dictionaryLines.count(_.split(",").headOption.exists(_ == emotion))
}


// Train on the local sentiment dictionary for three emotions, with lambda set to 0.0.
val model = NaiveBayes.train(
  dictionaryPath = "/Users/shubham/projects/thealth/data/dict/sentiment_dictionary.txt",
  emotions = Array("positive", "negative", "neutral"),
  lambda = 0.0
)

// Sample inputs, run through the model in the order listed.
val sampleTweets = Seq(
  "",
  "happy is thy man who has food",
  "precious is happy",
  "Apple and YouTube, with @PTXofficial's help, surprise all at Clive Davis' pre-#GRAMMYs party http://on.mash.to/1EPFsQQ",
  "The Apple Watch may sound cool but here's why it's going to be a flop ",
  "serious ferociously sinister bowdlerize leer inflammatory exuberance outshine easiness"
)

// One predicted label per sample, in the same order.
val predictions = sampleTweets.map(tweet => model.predict(tweet))
	// DON'T PUSH CHANGES MADE TO THIS FILE
	// ADD IT TO YOUR .gitignore AFTER DOWNLOADING IT ONCE

	import org.apache.spark.rdd.RDD

	/**
	 * Emotion classifier over pre-computed word statistics (normally built by `NaiveBayes.train`).
	 *
	 * The four arrays are parallel: entry `i` of each one describes the same emotion class.
	 *
	 * NOTE(review): a class score is its prior plus the SUM of the per-word conditional
	 * probabilities, not the product / log-sum of textbook Naive Bayes. That scoring rule is
	 * kept unchanged here -- confirm whether it is intended.
	 *
	 * The class is `Serializable` because `predict(RDD)` references `this` inside the closure
	 * handed to `RDD.map`, and Spark has to serialize that closure to run it.
	 *
	 * @param lambda             additive smoothing constant applied to every word count
	 * @param vocabSize          vocabulary size used in the smoothing denominator
	 * @param vocabEmotion       per class, every word occurrence recorded for that class
	 * @param vocabEmotionLength per class, the total number of word occurrences
	 * @param emotions           class labels; `predict` returns one of these
	 * @param priorProbabilities per class, the prior probability that seeds the score
	 */
	class NaiveBayesModel(val lambda: Double,
	                      val vocabSize: Long,
	                      private val vocabEmotion: Array[Seq[String]],
	                      private val vocabEmotionLength: Array[Int],
	                      val emotions: Array[String],
	                      val priorProbabilities: Array[Double]) extends Serializable {

	  // Word -> occurrence count for each class, built once so that scoring a word is a map
	  // lookup instead of a scan over the class's whole word list. A strict `map` is used rather
	  // than `mapValues`, whose lazy view is not serializable on older Scala versions.
	  private val wordCounts: Array[Map[String, Int]] =
	    vocabEmotion.map(_.groupBy(identity).map { case (word, occurrences) => word -> occurrences.size })

	  /**
	   * Scores `tweet` against every emotion and returns the label with the highest score.
	   * On a tie the emotion listed first wins.
	   *
	   * @param tweet raw text; tokens are obtained by splitting on single spaces
	   * @return the winning entry of `emotions`
	   */
	  def predict(tweet: String): String = {
	    val words = tweet.split(" ")

	    // Walk every class that has both a prior and a label instead of assuming exactly three.
	    val emotionScore = priorProbabilities.zip(emotions).zipWithIndex.map {
	      case ((priorProb, emotion), index) =>
	        val score = words.foldLeft(priorProb) { (sum, word) =>
	          sum + wordCondProbability(word, vocabEmotionLength(index), vocabSize, wordCounts(index))
	        }
	        (score, emotion)
	    }.toSeq

	    // Debug trace of every (score, emotion) pair.
	    println(emotionScore.mkString(" "))

	    emotionScore.max(maxEmotion)._2
	  }

	  /** Applies the single-tweet `predict` to every element of `tweetRDD`. */
	  def predict(tweetRDD: RDD[String]): RDD[String] = {
	    tweetRDD.map(tweet => predict(tweet))
	  }

	  /**
	   * Smoothed conditional probability of `word` given one class:
	   * `(count + lambda) / (vocabEmotionSize + lambda * vocabSize)`.
	   *
	   * Returns 0.0 when the denominator is zero (empty class with `lambda == 0`) so that no NaN
	   * leaks into the score.
	   */
	  private def wordCondProbability(word: String,
	                                  vocabEmotionSize: Long,
	                                  vocabSize: Long,
	                                  counts: Map[String, Int]): Double = {
	    val termCount = counts.getOrElse(word, 0)
	    val denominator = vocabEmotionSize + lambda * vocabSize
	    if (denominator == 0.0) 0.0 else (termCount + lambda) / denominator
	  }

	  /** Orders `(score, emotion)` pairs by score only. */
	  private def maxEmotion: Ordering[(Double, String)] = new Ordering[(Double, String)] {
	    override def compare(x: (Double, String), y: (Double, String)): Int =
	      java.lang.Double.compare(x._1, y._1)
	  }

	}

	import scalax.io.{LongTraversable, Resource}

	/**
	 * Builds a [[NaiveBayesModel]] from a sentiment dictionary file.
	 *
	 * Every line is split on commas: field 0 is compared with the emotion label and field 1 is
	 * split on single spaces into that emotion's words.
	 */
	object NaiveBayes {
	  /**
	   * Reads the dictionary once and derives the per-emotion statistics the model needs.
	   *
	   * @param dictionaryPath path of the dictionary file
	   * @param emotions       labels to train for; the model's arrays follow this order
	   * @param lambda         smoothing constant handed to the model unchanged
	   * @return a model holding priors, per-emotion word lists and the vocabulary size
	   */
	  def train(dictionaryPath:String, emotions: Array[String], lambda: Double = 1.0): NaiveBayesModel = {
	    // `lines()` is a lazy, file-backed traversable; materialise it once so the file is not
	    // read again for the line count and for every helper call below.
	    val dictionaryLines = Resource.fromFile(dictionaryPath).lines().toVector
	    val totalLines = dictionaryLines.size

	    // An empty dictionary yields 0.0 priors instead of the NaN produced by 0.0 / 0.
	    val priorProb = emotions.map { emotion =>
	      if (totalLines == 0) 0.0
	      else getEmotionLineCount(dictionaryLines, emotion).toDouble / totalLines
	    }
	    val vocabEmotion = emotions.map(emotion => genEmotionWords(dictionaryLines, emotion))
	    val vocabEmotionLength = vocabEmotion.map(_.length)
	    // Distinct words across ALL emotions: a word listed under two emotions counts once.
	    val vocabSize = vocabEmotion.flatMap(_.distinct).distinct.length

	    new NaiveBayesModel(lambda, vocabSize, vocabEmotion, vocabEmotionLength, emotions, priorProb)
	  }

	  // helper functions

	  /** Every word (duplicates kept) found in field 1 of the lines whose field 0 is `emotion`. */
	  private def genEmotionWords(dictionaryLines: Seq[String], emotion: String): Seq[String] =
	    for {
	      line <- dictionaryLines
	      fields = line.split(",")
	      // Lines without a second field carry no words; skip them rather than index past the end.
	      if fields.length > 1 && fields(0) == emotion
	      word <- fields(1).split(" ")
	    } yield word

	  /** Number of lines whose field 0 equals `emotion`. */
	  private def getEmotionLineCount(dictionaryLines: Seq[String], emotion: String): Int =
	    // `headOption` because splitting a line made only of commas yields an empty array.
	    dictionaryLines.count(_.split(",").headOption.exists(_ == emotion))
	}


	// Train on the local sentiment dictionary for three emotions, with lambda set to 0.0.
	val model = NaiveBayes.train(
	  dictionaryPath = "/Users/shubham/projects/thealth/data/dict/sentiment_dictionary.txt",
	  emotions = Array("positive", "negative", "neutral"),
	  lambda = 0.0
	)

	// Sample inputs, run through the model in the order listed.
	val sampleTweets = Seq(
	  "",
	  "happy is thy man who has food",
	  "precious is happy",
	  "Apple and YouTube, with @PTXofficial's help, surprise all at Clive Davis' pre-#GRAMMYs party http://on.mash.to/1EPFsQQ",
	  "The Apple Watch may sound cool but here's why it's going to be a flop ",
	  "serious ferociously sinister bowdlerize leer inflammatory exuberance outshine easiness"
	)

	// One predicted label per sample, in the same order.
	val predictions = sampleTweets.map(tweet => model.predict(tweet))