Created
November 25, 2009 01:09
-
-
Save smerrill/242386 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** Lowercase ASCII letters used when generating replacement and insertion edits. */
val alphabet: String = "abcdefghijklmnopqrstuvwxyz"
/** Builds a word-frequency table from `text`.
  *
  * Every maximal run of the characters `a`-`z` counts as one word. The returned
  * map has a default value of 1, and each occurrence adds 1 on top of the current
  * value, so a word seen `n` times is stored as `n + 1` while an unseen word
  * reads as 1 without being a key of the map.
  *
  * @param text the training text; only lowercase `a`-`z` runs are matched
  * @return immutable map from word to smoothed count, with default value 1
  */
def train(text: String): Map[String, Int] =
  "[a-z]+".r.findAllIn(text).foldLeft(Map.empty[String, Int].withDefaultValue(1)) {
    // `updated` returns a new map (keeping the default value), which is what
    // the fold has to hand on to the next step.
    (counts, word) => counts.updated(word, counts(word) + 1)
  }
/** Word-frequency model trained on the lowercased contents of `big.txt`.
  *
  * The file is opened via `scala.io.Source.fromFile("big.txt")` and is always
  * closed once the model has been built.
  */
val NWORDS: Map[String, Int] = {
  val source = scala.io.Source.fromFile("big.txt")
  try {
    // `getLines()` yields lines without their terminators, so join them with a
    // newline; otherwise the last word of one line would fuse with the first
    // word of the next line.
    train(source.getLines().mkString("\n").toLowerCase)
  } finally {
    source.close()
  }
}
/** Returns the subset of `words` that are keys of `NWORDS`.
  *
  * Prints a trace line on every call before filtering.
  *
  * @param words candidate spellings
  * @return the candidates present in the frequency model
  */
def known(words:Set[String]) = {
  println("Known invocation: %s".format(words))
  words.filter(candidate => NWORDS.contains(candidate))
}
/** Generates every string that is one simple edit away from `word`.
  *
  * The edits are: deleting one character, swapping two adjacent characters,
  * replacing one character with a letter of `alphabet`, and inserting a letter
  * of `alphabet` at any position, including after the last character.
  *
  * @param word the word to perturb
  * @return the set of all single-edit variants (may contain `word` itself,
  *         e.g. when a character is replaced by the same letter)
  */
def edits1(word: String): Set[String] = {
  val length = word.length
  val deletes =
    for (i <- 0 until length) yield word.take(i) + word.drop(i + 1)
  val transposes =
    for (i <- 0 until length - 1) yield word.take(i) + word(i + 1) + word(i) + word.drop(i + 2)
  val replaces =
    for (i <- 0 until length; letter <- alphabet) yield word.take(i) + letter + word.drop(i + 1)
  // Inclusive upper bound: there are length + 1 insertion points, the last
  // one being the end of the word (needed for e.g. "th" -> "the").
  val inserts =
    for (i <- 0 to length; letter <- alphabet) yield word.take(i) + letter + word.drop(i)
  (deletes ++ transposes ++ replaces ++ inserts).toSet
}
/** Returns the words in `NWORDS` that are reachable from `word` by applying
  * `edits1` twice.
  *
  * @param word the word to perturb
  * @return all two-edit variants that are keys of the frequency model
  */
def known_edits2(word:String) = {
  val twoEditsAway = edits1(word).flatMap(firstEdit => edits1(firstEdit))
  twoEditsAway.filter(secondEdit => NWORDS.contains(secondEdit))
}
/** Wrapper that adds a short-circuiting `or` to a `Set`.
  *
  * This is a named class rather than an anonymous `new AnyRef { ... }`, so that
  * `or` is an ordinary statically-dispatched method instead of a member of a
  * structural type that has to be invoked reflectively.
  */
final class OrSet[A](one: Set[A]) {
  /** Returns the wrapped set if it is non-empty, otherwise `other`.
    *
    * Prints a trace line on every call.
    *
    * @param other fallback set; passed by name, so it is only evaluated when
    *              the wrapped set is empty
    */
  def or(other: => Set[A]): Set[A] = {
    println("or called: %s" format one)
    if (one.isEmpty) other else one
  }
}

/** Implicit view that enables `a or b or c` chains on sets. */
implicit def toOr[A](one: Set[A]): OrSet[A] = new OrSet[A](one)
/** Picks a spelling for `word`.
  *
  * Candidate sets are tried in order and the first non-empty one is used: the
  * word itself if known, then known words one edit away, then known words two
  * edits away, and finally the unchanged word. Among the chosen candidates the
  * one with the highest `NWORDS` value is returned.
  *
  * @param word the word to correct
  * @return the selected candidate
  */
def correct(word: String) = {
  // Local defs (not vals) so each stage is only computed when `or` asks for it.
  def exactMatch = known(Set(word))
  def oneEditAway = known(edits1(word))
  def twoEditsAway = known_edits2(word)
  def unchanged = Set(word)

  val candidates = exactMatch or oneEditAway or twoEditsAway or unchanged
  candidates.foldLeft("") { (best, next) =>
    if (NWORDS(best) > NWORDS(next)) best else next
  }
}
/* Outputs: | |
* scala> correct("the") | |
* Known invocation: Set(the) | |
* Known invocation: Set(the) | |
* or called: Set(the) | |
* Known invocation: Set(the) | |
* Known invocation: Set(the) | |
* or called: Set(the) | |
* or called: Set(the) | |
* Known invocation: Set(the) | |
* Known invocation: Set(the) | |
* or called: Set(the) | |
* Known invocation: Set(the) | |
* Known invocation: Set(the) | |
* or called: Set(the) | |
* or called: Set(the) | |
* or called: Set(the) | |
* res3: java.lang.String = the | |
*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment