Last active
December 16, 2015 13:18
-
-
Save jerluc/5440335 to your computer and use it in GitHub Desktop.
My first naive attempt at Markov chains
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import util.Random | |
object Markov extends App { | |
// Just some long plain-text sources from Project Gutenberg | |
val sherlockHolmes = "http://www.gutenberg.org/cache/epub/1661/pg1661.txt" | |
val kingJamesBible = "http://www.gutenberg.org/cache/epub/10/pg10.txt" | |
val artOfWar = "http://www.gutenberg.org/cache/epub/17405/pg17405.txt" | |
val metamorphosis = "http://www.gutenberg.org/cache/epub/5200/pg5200.txt" | |
val greatExpectations = "http://www.gutenberg.org/cache/epub/1400/pg1400.txt" | |
val sources = List( | |
sherlockHolmes, | |
kingJamesBible, | |
artOfWar, | |
metamorphosis, | |
greatExpectations | |
) | |
val r = new Random(System.nanoTime) | |
val source = if (args.length == 1) args(0) else sources(r.nextInt(sources.size)) | |
val freqs = io.Source.fromURL(source) | |
.getLines | |
.flatMap(_.split("\\s+")) // Split on whitespace | |
.filter(_.trim != "") // Filter out empty strings | |
.map(_.toLowerCase) // Lower case that shit | |
.sliding(2) // Create a sliding iterable over two elements at a time | |
.map(s => (s(0), s(1))) // Not really necessary I guess, but easier for me | |
.toList | |
.groupBy(_._1) // Group by the starting word | |
.map(s => { | |
val startWord = s._1 | |
val ends = s._2 | |
.map(_._2) | |
.sortBy(identity(_)) // Sort each ending word | |
.groupBy(identity(_)) // Group on itself to aggregate | |
.map(p => (p._1, p._2.size)) // Output absolute end word frequency | |
val total = ends.values.sum | |
(startWord, ends.map(p => (p._1, p._2.toDouble / total.toDouble))) // Convert to probabilities | |
}) | |
val startingWord = "the" | |
var currentWord = startingWord | |
println("Markov chain generated from source [%s]".format(source)) | |
for (i <- 0 to 100) { | |
print(currentWord + " ") | |
val pop = freqs(currentWord) | |
.map(p => (p._1, (p._2 * 100.0).toInt)) // Multiply by 100--maybe should test how this affects output | |
.flatMap(e => { | |
List.fill(e._2)(e._1) // Output a partial candidate set filled with the same word based on probability | |
// Maybe there's a better way to populate the candidate set? | |
}) | |
.toList | |
currentWord = pop(r.nextInt(pop.size)) // Grab a random word from the cumulative candidate set | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment