jerluc/Markov.scala

## Markov.scala
import util.Random

object Markov extends App {

  // Just some long plain-text sources from Project Gutenberg

  val sherlockHolmes = "http://www.gutenberg.org/cache/epub/1661/pg1661.txt"

  val kingJamesBible = "http://www.gutenberg.org/cache/epub/10/pg10.txt"

  val artOfWar = "http://www.gutenberg.org/cache/epub/17405/pg17405.txt"

  val metamorphosis = "http://www.gutenberg.org/cache/epub/5200/pg5200.txt"

  val greatExpectations = "http://www.gutenberg.org/cache/epub/1400/pg1400.txt"

  val sources = List(
    sherlockHolmes,
    kingJamesBible,
    artOfWar,
    metamorphosis,
    greatExpectations
  )

  val r = new Random(System.nanoTime)

  val source = if (args.length == 1) args(0) else sources(r.nextInt(sources.size))

  val freqs = io.Source.fromURL(source)
    .getLines
    .flatMap(_.split("\\s+")) // Split on whitespace
    .filter(_.trim != "") // Filter out empty strings
    .map(_.toLowerCase) // Lower case that shit
    .sliding(2) // Create a sliding iterable over two elements at a time
    .map(s => (s(0), s(1))) // Not really necessary I guess, but easier for me
    .toList
    .groupBy(_._1) // Group by the starting word
    .map(s => {
      val startWord = s._1
      val ends = s._2
          .map(_._2)
          .sortBy(identity(_)) // Sort each ending word
          .groupBy(identity(_)) // Group on itself to aggregate
          .map(p => (p._1, p._2.size)) // Output absolute end word frequency
      val total = ends.values.sum
      (startWord, ends.map(p => (p._1, p._2.toDouble / total.toDouble))) // Convert to probabilities
    })

  val startingWord = "the"
  var currentWord = startingWord

  println("Markov chain generated from source [%s]".format(source))

  for (i <- 0 to 100) {
    print(currentWord + " ")
    val pop = freqs(currentWord)
      .map(p => (p._1, (p._2 * 100.0).toInt)) // Multiply by 100--maybe should test how this affects output
      .flatMap(e => {
        List.fill(e._2)(e._1) // Output a partial candidate set filled with the same word based on probability
                              // Maybe there's a better way to populate the candidate set?
      })
      .toList
    currentWord = pop(r.nextInt(pop.size)) // Grab a random word from the cumulative candidate set
  }

}
	import util.Random

	object Markov extends App {

	// Just some long plain-text sources from Project Gutenberg

	val sherlockHolmes = "http://www.gutenberg.org/cache/epub/1661/pg1661.txt"

	val kingJamesBible = "http://www.gutenberg.org/cache/epub/10/pg10.txt"

	val artOfWar = "http://www.gutenberg.org/cache/epub/17405/pg17405.txt"

	val metamorphosis = "http://www.gutenberg.org/cache/epub/5200/pg5200.txt"

	val greatExpectations = "http://www.gutenberg.org/cache/epub/1400/pg1400.txt"

	val sources = List(
	sherlockHolmes,
	kingJamesBible,
	artOfWar,
	metamorphosis,
	greatExpectations
	)

	val r = new Random(System.nanoTime)

	val source = if (args.length == 1) args(0) else sources(r.nextInt(sources.size))

	val freqs = io.Source.fromURL(source)
	.getLines
	.flatMap(_.split("\\s+")) // Split on whitespace
	.filter(_.trim != "") // Filter out empty strings
	.map(_.toLowerCase) // Lower case that shit
	.sliding(2) // Create a sliding iterable over two elements at a time
	.map(s => (s(0), s(1))) // Not really necessary I guess, but easier for me
	.toList
	.groupBy(_._1) // Group by the starting word
	.map(s => {
	val startWord = s._1
	val ends = s._2
	.map(_._2)
	.sortBy(identity(_)) // Sort each ending word
	.groupBy(identity(_)) // Group on itself to aggregate
	.map(p => (p._1, p._2.size)) // Output absolute end word frequency
	val total = ends.values.sum
	(startWord, ends.map(p => (p._1, p._2.toDouble / total.toDouble))) // Convert to probabilities
	})

	val startingWord = "the"
	var currentWord = startingWord

	println("Markov chain generated from source [%s]".format(source))

	for (i <- 0 to 100) {
	print(currentWord + " ")
	val pop = freqs(currentWord)
	.map(p => (p._1, (p._2 * 100.0).toInt)) // Multiply by 100--maybe should test how this affects output
	.flatMap(e => {
	List.fill(e._2)(e._1) // Output a partial candidate set filled with the same word based on probability
	// Maybe there's a better way to populate the candidate set?
	})
	.toList
	currentWord = pop(r.nextInt(pop.size)) // Grab a random word from the cumulative candidate set
	}

	}