Skip to content

Instantly share code, notes, and snippets.

@airspeedswift
Last active July 17, 2019 07:22
Show Gist options
  • Star 7 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save airspeedswift/6d8c9d95f0d812f58be3 to your computer and use it in GitHub Desktop.
Save airspeedswift/6d8c9d95f0d812f58be3 to your computer and use it in GitHub Desktop.
Norvig Spellchecker in Swift
/// Translation of [Peter Norvig's spell checker](http://norvig.com/spell-correct.html) into Swift.
/// Sample input corpus [here](http://norvig.com/big.txt)
import Foundation.NSString // purely for IO, most things done with Swift.String
// pythony slicing
postfix operator ..< { }
prefix operator ..< { }
postfix func ..<<I: ForwardIndexType>(lhs: I) -> RangeStart<I> { return RangeStart(start: lhs) }
prefix func ..<<I: ForwardIndexType>(rhs: I) -> RangeEnd<I> { return RangeEnd(end: rhs) }
struct RangeStart<I: ForwardIndexType> { let start: I }
struct RangeEnd<I: ForwardIndexType> { let end: I }
extension Array {
subscript(r: RangeStart<Int>) -> SubSlice { return self[r.start..<self.endIndex] }
subscript(r: RangeEnd<Int>) -> SubSlice { return self[self.startIndex..<r.end] }
}
extension String {
subscript(r: RangeStart<String.Index>) -> String { return self[r.start..<self.endIndex] }
subscript(r: RangeEnd<String.Index>) -> String { return self[self.startIndex..<r.end] }
}
/// Return an `Array` containing the results of mapping `transform`
/// over `source`, filtering out results where `transform` returns `nil`.
func mapSome<S: SequenceType, U>(source: S, transform: S.Generator.Element -> U?) -> [U] {
return lazy(source).map(transform).filter { $0 != nil }.map { $0! }.array
}
/// Given a word, produce a set of possible alternatives with
/// letters transposed, deleted, replaced or rogue characters inserted
func edits(word: String) -> Set<String> {
if word.isEmpty { return [] }
let splits: [(String,String)] = indices(word).map {
return (word[..<$0],word[$0..<])
}
let deletes = splits.map { $0.0 + dropFirst($0.1) }
let transposes: [String] = mapSome(splits) { left, right in
if let fst = first(right) {
let drop1 = dropFirst(right)
if let snd = first(drop1) {
let drop2 = dropFirst(drop1)
return "\(left)\(snd)\(fst)\(drop2)"
}
}
return nil
}
let alphabet = "abcdefghijklmnopqrstuvwxyz"
let replaces = splits.flatMap { left, right in
map(alphabet) { letter in
"\(left)\(letter)\(dropFirst(right))"
}
}
let inserts = splits.flatMap { left, right in
map(alphabet) { letter in
"\(left)\(letter)\(right)"
}
}
return Set(deletes + transposes + replaces + inserts)
}
struct SpellChecker {
var knownWords: [String:Int] = [:]
init?(contentsOfFile file: String) {
if let text = NSString(contentsOfFile: file, encoding: NSUTF8StringEncoding, error: nil)?.lowercaseString {
println("Loaded file \(file)")
// not really unicode-friendly, but splitting String directly is sloooow...
let words = split(text.unicodeScalars) { !("a"..."z").contains($0) }.map { String($0) }
println("Split string into \(words.count) words")
for word in words { self.train(word) }
println("Trained \(knownWords.count) unique words")
}
else {
return nil
}
}
mutating func train(word: String) {
knownWords[word] = knownWords[word]?.successor() ?? 1
}
func knownEdits2(word: String) -> Set<String>? {
var known_edits: Set<String> = []
for edit in edits(word) {
if let k = known(edits(edit)) {
known_edits.unionInPlace(k)
}
}
return known_edits.isEmpty ? nil : known_edits
}
func known<S: SequenceType where S.Generator.Element == String>(words: S) -> Set<String>? {
let s = Set(filter(words) { self.knownWords.indexForKey($0) != nil })
return s.isEmpty ? nil : s
}
func correct(word: String) -> String {
let candidates = known([word]) ?? known(edits(word)) ?? knownEdits2(word)
return reduce(candidates ?? [], word) {
(knownWords[$0] ?? 1) < (knownWords[$1] ?? 1) ? $1 : $0
}
}
}
// main()
if Process.argc == 2, let checker = SpellChecker(contentsOfFile: Process.arguments[1]) {
println("Type word to check and hit enter")
while let word = NSString(data: NSFileHandle.fileHandleWithStandardInput().availableData,
encoding: NSUTF8StringEncoding) as? String
where last(word) == "\n"
{
let word = dropLast(word)
let checked = checker.correct(word)
if word == checked {
println("\(word) unchanged")
}
else {
println("\(word) -> \(checked)")
}
}
}
else {
println("Usage: \(Process.arguments[0]) <corpus_filename>")
}
@ollybeck
Copy link

Any way you could update this for swift 3? I can't quite seem to figure out the errors

@AlehUlitsionak
Copy link

+1

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment