Skip to content

Instantly share code, notes, and snippets.

@Cowa
Cowa / testES6.js
Last active August 29, 2015 14:27
/**
 * Car: holds a `make`, a `wheels` value and a `currentSpeed`, and can log its speed.
 * NOTE(review): the closing brace of this class is not visible in this excerpt.
 */
class Car {
/**
 * Stores the given arguments on the new instance.
 * @param make   value saved as `this.make` (concatenated into the speed message)
 * @param wheels value saved as `this.wheels`
 */
constructor(make, wheels) {
this.make = make;
this.wheels = wheels;
// Every new instance starts with currentSpeed set to 25.
this.currentSpeed = 25;
}
/** Logs "<make> is going <currentSpeed> mph." via console.log. */
printCurrentSpeed() {
console.log(this.make + ' is going ' + this.currentSpeed + ' mph.');
}
@Cowa
Cowa / ngrams.scala
Last active October 20, 2015 18:05
N-gram models in Scala - Step by step
// Tokenized text
val words = List("bonjour", "je", "suis", "ici", ".", "bonjour", "je", "suis", "là")
// 2-grams: every window of n consecutive tokens, in order of appearance
val n = 2
val ngram = words.sliding(n).toList
// ngram: List[List[String]] = List(List(bonjour, je), List(je, suis), List(suis, ici), List(ici, .), List(., bonjour), List(bonjour, je), List(je, suis), List(suis, là))
// Number of occurrences of each distinct n-gram.
// A strict `map` is used rather than `mapValues`: `mapValues` only returns a lazy view
// (re-evaluated on every lookup, deprecated since Scala 2.13), whereas this builds a real Map.
val ngramWithCount = ngram.groupBy(identity).map { case (gram, occurrences) => gram -> occurrences.size }
// ngramWithCount: Map[List[String],Int] = Map(List(bonjour, je) -> 2, List(je, suis) -> 2, List(suis, ici) -> 1, List(suis, là) -> 1, List(ici, .) -> 1, List(., bonjour) -> 1)
// Sentence used as the example corpus
val source = "The blue sky is near the red koala near the blue sky"
// Naive tokenization: one token per space-separated word
val tokens: List[String] = source.split(" ").toList
// List(The, blue, sky, is, near, the, red, koala, near, the, blue, sky)
// Bigrams: each window of 2 consecutive tokens
tokens.sliding(size = 2).toList
// List(List(The, blue), List(blue, sky), List(sky, is), List(is, near), List(near, the), List(the, red), List(red, koala), List(koala, near), List(near, the), List(the, blue), List(blue, sky))
// Trigrams: each window of 3 consecutive tokens
tokens.sliding(size = 3).toList
// List(List(The, blue, sky), List(blue, sky, is), List(sky, is, near), List(is, near, the), List(near, the, red), List(the, red, koala), List(red, koala, near), List(koala, near, the), List(near, the, blue), List(the, blue, sky))
// For n-gram
val n = ...
// Order of the model: 2 => bigrams
val n = 2
// All windows of n consecutive tokens, in order of appearance
val ngram = tokens.sliding(n).toList
// List(List(The, blue), List(blue, sky), List(sky, is), List(is, near), List(near, the), List(the, red), List(red, koala), List(koala, near), List(near, the), List(the, blue), List(blue, sky))
// Number of occurrences of each distinct n-gram.
// A strict `map` is used rather than `mapValues`: `mapValues` only returns a lazy view
// (re-evaluated on every lookup, deprecated since Scala 2.13), whereas this builds a real Map.
val ngramWithCount = ngram.groupBy(identity).map { case (gram, occurrences) => gram -> occurrences.size }
// Map(List(koala, near) -> 1, List(blue, sky) -> 2, List(red, koala) -> 1, List(near, the) -> 2, List(the, blue) -> 1, List(The, blue) -> 1, List(sky, is) -> 1, List(the, red) -> 1, List(is, near) -> 1)
// Conditional probability of each n-gram given its (n - 1)-token prefix:
//   count(n-gram) / total count of all n-grams that share the same prefix.
// Naive version: every entry rescans the whole count map, so the cost is quadratic
// in the number of distinct n-grams.
val ngramWithProbability = ngramWithCount.map { case (k, v) =>
  // Compute the prefix once per entry instead of once per comparison.
  val prefix = k.take(n - 1)
  // `collect` replaces `filterKeys(...).values`: `filterKeys` is a lazy view, deprecated since Scala 2.13.
  val prefixTotal = ngramWithCount.collect { case (other, count) if other.take(n - 1) == prefix => count }.sum
  (k, v.toDouble / prefixTotal)
}
// For the bigram counts of the example sentence (entry order may vary):
// Map(List(koala, near) -> 1.0, List(blue, sky) -> 1.0, List(red, koala) -> 1.0, List(near, the) -> 1.0, List(the, blue) -> 0.5, List(The, blue) -> 1.0, List(sky, is) -> 1.0, List(the, red) -> 0.5, List(is, near) -> 1.0)
// Index built once: for each (n - 1)-token prefix, the total count of all n-grams starting with it.
// A strict `map` materializes the sums; with `mapValues` the result is a lazy view and the
// sum would be recomputed on every lookup below (and `mapValues` is deprecated since Scala 2.13).
val sumIndex = ngramWithCount
  .groupBy { case (k, _) => k.take(n - 1) }
  .map { case (prefix, grams) => prefix -> grams.values.sum }
// Same probabilities as the naive version, but each denominator is a single index lookup.
val ngramWithProbabilityFaster = ngramWithCount.map { case (k, v) =>
  (k, v.toDouble / sumIndex(k.take(n - 1)))
}
// For the bigram counts of the example sentence (entry order may vary):
// Map(List(koala, near) -> 1.0, List(blue, sky) -> 1.0, List(red, koala) -> 1.0, List(near, the) -> 1.0, List(the, blue) -> 0.5, List(The, blue) -> 1.0, List(sky, is) -> 1.0, List(the, red) -> 0.5, List(is, near) -> 1.0)
// n-grams over the token list padded with (n - 1) "<s>" markers in front and one "</s>" at the end
val ngram = {
  val startPadding = List.fill(n - 1)("<s>")
  val paddedTokens = startPadding ::: tokens ::: List("</s>")
  paddedTokens.sliding(n).toList
}
// Same as before: compute the counts and probabilities from the padded n-grams
val ngramWithCount = ...
val ngramWithProbabilityFaster = ...
/** String constants for the start and end marker tokens. */
object Marker {
  /** Start marker token: `<s>`. */
  val start: String = "<s>"

  /** End marker token: `</s>`. */
  val end: String = "</s>"
}
object NGram {
def build(t: String, n: Int): NGram = {
NGram(addProbability(addCount(t.split("\n").flatMap(l =>
sliding(l.split(" ").toList, n)).toList), n), n)
}