Scala implementation of Peter Norvig's spellchecker (http://norvig.com/spell-correct.html)
class NorvigSpellChecker(corpus: String, alphabet: Seq[Char] = 'a' to 'z', level: Int = 2) { | |
val words = s"[${alphabet.head}-${alphabet.last}]+".r.findAllIn(corpus.toLowerCase).toSeq | |
val count = words.groupBy(_.toSeq).mapValues(_.size) withDefaultValue 0 | |
def edit(n: Int)(word: Seq[Char]): Set[Seq[Char]] = n match { | |
case 0 => Set(word) | |
case 1 => | |
val splits = word.indices map word.splitAt | |
val deletes = splits collect {case (a, b0 +: b1) => a ++ b1} | |
val transposes = splits collect {case (a, b0 +: b1 +: b2) => a ++: b1 +: b0 +: b2} | |
val replaces = alphabet flatMap {c => splits collect {case (a, b0 +: b1) => a ++: c +: b1}} | |
val inserts = alphabet flatMap {c => splits map {case (a, b) => a ++: c +: b}} | |
(deletes ++ transposes ++ replaces ++ inserts).toSet | |
case _ => edit(n - 1)(word) flatMap edit(1) | |
} | |
def apply(word: String) = Stream.tabulate(level + 1)(i => edit(i)(word) maxBy count).find(count.contains).map(_.mkString) | |
} | |
// Demo: | |
object NorvigSpellChecker extends App { | |
val checker = new NorvigSpellChecker(io.Source.fromFile("big.txt").mkString) // curl http://norvig.com/big.txt > big.txt | |
Seq("speling", "korrecter", "corrected", "xyz") foreach {w => println(s"$w: ${checker(w)}")} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment