// Statistical text generator
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// you need Scala 2.10 to run this

// Input corpus: replace ??? with the text the model should learn from.
// The explicit String type keeps the rest of the script type-checking while
// the placeholder is still in place (a bare `???` would infer Nothing).
val text: String = ???

// we are interested in bi-grams
val n = 2

/** Lower-cases `text` and splits it into words.
  *
  * Words are separated by runs of non-word characters; the `(?U)` flag makes
  * `\W` Unicode-aware so accented letters stay inside words. `split` produces
  * an empty leading token when the text starts with a separator, so empty
  * tokens are dropped.
  *
  * @param text raw input text
  * @return the words of `text`, lower-cased, in their original order
  */
def tokenize(text: String): Vector[String] =
  text.toLowerCase.split("(?U)\\W+").filter(_.nonEmpty).toVector

/** Builds the n-gram model: every run of `n` consecutive words is mapped to
  * the words that directly follow it, in order of occurrence (duplicates are
  * kept, so frequent followers are proportionally more likely to be drawn).
  *
  * @param words tokenised corpus
  * @param n     number of words in a key n-gram
  * @return map from n-gram to the sequence of words observed right after it;
  *         empty when `words` has fewer than `n + 1` elements
  */
def followersByNgram(words: Vector[String], n: Int): Map[Seq[String], Seq[String]] =
  words
    .sliding(n + 1)
    // `sliding` emits a single short window when there are fewer than n + 1
    // words; such a window does not contain a full n-gram plus a follower.
    .filter(_.size == n + 1)
    .toVector
    .groupBy(window => (window.init: Seq[String]))
    .map { case (ngram, windows) => ngram -> windows.map(_.last) }

val words = tokenize(text)

// map from ngrams to seq of following words
// (looking up an unknown n-gram with `map(ngram)` yields an empty sequence,
// while `map get ngram` yields None)
val map = followersByNgram(words, n).withDefaultValue(Vector.empty[String])
// generate text

// Seed n-gram the generated text starts with (truncated to n words).
val firstNgram = Seq("anomalocaris", "detrimentum", "něco") take n

// The n most recently emitted words; this is the lookup key for the next word.
var lastNgram = firstNgram

// Up to 5000 generation steps. Each step yields Some(word) while the current
// n-gram has known followers and None once the chain reaches an n-gram that
// was never seen with a follower (every later step then yields None as well,
// because lastNgram no longer changes).
val generatedWords = for (_ <- 1 to 5000) yield {
  map
    .get(lastNgram)
    // nextInt(0) throws, so never sample from an empty follower list
    .filter(_.nonEmpty)
    .map { nextWords =>
      // Uniform pick over all recorded occurrences.
      val next = nextWords(scala.util.Random.nextInt(nextWords.size))
      // Slide the window: append the new word and keep the last n words.
      lastNgram = (lastNgram :+ next) takeRight n
      next
    }
}
// Print the seed n-gram followed by the generated words, each word followed
// by a single space, starting a new output line whenever the next word (plus
// its trailing space) would push the current line past maxLineWidth characters.
val maxLineWidth = 140

// Number of characters already printed on the current output line.
var length = 0
for (w <- firstNgram ++ generatedWords.flatten) {
  val cell = w + " "
  if (length + cell.length > maxLineWidth) {
    println()
    length = 0
  }
  length += cell.length
  print(cell)
}
// Terminate the last line.
println()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment