Skip to content

Instantly share code, notes, and snippets.

@Cowa
Last active October 23, 2015 11:16
Show Gist options
  • Save Cowa/9432d825cbfcee57d708 to your computer and use it in GitHub Desktop.
Save Cowa/9432d825cbfcee57d708 to your computer and use it in GitHub Desktop.
/** Sentinel tokens used to mark where a token sequence begins and ends. */
object Marker {

  /** Token that stands for "before the first word" of a sequence. */
  val start: String = "<s>"

  /** Token that stands for "after the last word" of a sequence. */
  val end: String = "</s>"
}
/** Factory for [[NGram]] models learned from plain text. */
object NGram {

  /**
   * Builds an n-gram model from `t`.
   *
   * Each line of `t` is handled as one sequence: it is tokenised on whitespace, padded with
   * `n - 1` copies of `Marker.start` in front and one `Marker.end` behind, and cut into
   * windows of `n` consecutive tokens. The value stored for a window is its count divided by
   * the summed count of every window sharing the same first `n - 1` tokens.
   *
   * Lines that contain no tokens (empty or whitespace-only) are ignored.
   *
   * @param t text to learn from, one sequence per line
   * @param n window size (1 = unigram, 2 = bigram, ...); must be at least 1
   * @return the model; its map is empty when `t` contains no tokens
   * @throws IllegalArgumentException if `n` is less than 1
   */
  def build(t: String, n: Int): NGram = {
    require(n >= 1, s"n must be at least 1, got $n")
    val windows = t
      .split("\n")
      .toList
      .map(tokenize)
      .filter(_.nonEmpty)
      .flatMap(sliding(_, n))
    NGram(addProbability(addCount(windows), n), n)
  }

  /**
   * Splits a line on runs of whitespace and drops empty fragments.
   *
   * Splitting on a single space would turn repeated/leading spaces and blank lines into
   * empty-string tokens, and would leave a trailing carriage return attached to the last word.
   */
  private def tokenize(line: String): List[String] =
    line.split("\\s+").toList.filter(_.nonEmpty)

  /** Pads `tokens` with start/end markers and returns every window of `n` consecutive tokens. */
  private def sliding(tokens: List[String], n: Int): List[List[String]] = {
    val padding = List.fill(n - 1)(Marker.start)
    ((padding ++ tokens) :+ Marker.end).sliding(n).toList
  }

  /** Counts how many times each distinct window occurs. */
  private def addCount(slides: List[List[String]]): Map[List[String], Int] =
    // A strict `map` instead of `mapValues`: the latter yields a lazy view (a `MapView` on
    // Scala 2.13+, which is not a `Map`) that is re-evaluated on every lookup.
    slides.groupBy(identity).map { case (gram, occurrences) => (gram, occurrences.size) }

  /** Turns each window's count into its share of the counts with the same `n - 1` token prefix. */
  private def addProbability(withCount: Map[List[String], Int],
                             n: Int): Map[List[String], Double] = {
    // Total count per prefix, computed once up front.
    val contextTotals: Map[List[String], Int] =
      withCount
        .groupBy { case (gram, _) => gram.take(n - 1) }
        .map { case (context, grams) => (context, grams.values.sum) }

    withCount.map { case (gram, count) =>
      (gram, count.toDouble / contextTotals(gram.take(n - 1)))
    }
  }
}
/**
 * An n-gram model.
 *
 * @param model maps a list of tokens to a `Double`; `NGram.build` fills it with one entry per
 *              distinct window of `n` tokens
 * @param n     the window size the model was built with
 */
case class NGram(
  model: Map[List[String], Double],
  n: Int
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment