Skip to content

Instantly share code, notes, and snippets.

@shaulhameed
Last active May 30, 2019 20:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save shaulhameed/568d2ecdf3a69657b29e40c8be949538 to your computer and use it in GitHub Desktop.
Save shaulhameed/568d2ecdf3a69657b29e40c8be949538 to your computer and use it in GitHub Desktop.
Auto correction implementation for swift based on http://norvig.com/spell-correct.html. Improved the search performance using "trie" data structure.
//
// Trie.swift
// AutoCorrect
//
// Created by Shaul Hameed on 30/07/17.
//
//
import Foundation
/// A node in the trie
class TrieNode<T: Hashable> {
var value: T?
weak var parentNode: TrieNode?
var children: [T: TrieNode] = [:]
var isTerminating = false
var isLeaf: Bool {
return children.count == 0
}
/// Initializes a node.
///
/// - Parameters:
/// - value: The value that goes into the node
/// - parentNode: A reference to this node's parent
init(value: T? = nil, parentNode: TrieNode? = nil) {
self.value = value
self.parentNode = parentNode
}
/// Adds a child node to self. If the child is already present,
/// do nothing.
///
/// - Parameter value: The item to be added to this node.
func add(value: T) {
guard children[value] == nil else {
return
}
children[value] = TrieNode(value: value, parentNode: self)
}
}
/// A trie data structure containing words. Each node is a single
/// character of a word.
class Trie: NSObject, NSCoding {
typealias Node = TrieNode<Character>
/// The number of words in the trie
public var count: Int {
return wordCount
}
/// Is the trie empty?
public var isEmpty: Bool {
return wordCount == 0
}
/// All words currently in the trie
public var words: [String] {
return wordsInSubtrie(rootNode: root, partialWord: "")
}
fileprivate let root: Node
fileprivate var wordCount: Int
/// Creates an empty trie.
override init() {
root = Node()
wordCount = 0
super.init()
}
// MARK: NSCoding
/// Initializes the trie with words from an archive
///
/// - Parameter decoder: Decodes the archive
required convenience init?(coder decoder: NSCoder) {
self.init()
let words = decoder.decodeObject(forKey: "words") as? [String]
for word in words! {
self.insert(word: word)
}
}
/// Encodes the words in the trie by putting them in an array then encoding
/// the array.
///
/// - Parameter coder: The object that will encode the array
func encode(with coder: NSCoder) {
coder.encode(self.words, forKey: "words")
}
}
// MARK: - Adds methods: insert, remove, contains
extension Trie {
/// Inserts a word into the trie. If the word is already present,
/// there is no change.
///
/// - Parameter word: the word to be inserted.
func insert(word: String) {
guard !word.isEmpty else {
return
}
var currentNode = root
for character in word.lowercased().characters {
if let childNode = currentNode.children[character] {
currentNode = childNode
} else {
currentNode.add(value: character)
currentNode = currentNode.children[character]!
}
}
// Word already present?
guard !currentNode.isTerminating else {
return
}
wordCount += 1
currentNode.isTerminating = true
}
/// Determines whether a word is in the trie.
///
/// - Parameter word: the word to check for
/// - Returns: true if the word is present, false otherwise.
func contains(word: String) -> Bool {
guard !word.isEmpty else {
return false
}
var currentNode = root
for character in word.lowercased().characters {
guard let childNode = currentNode.children[character] else {
return false
}
currentNode = childNode
}
return currentNode.isTerminating
}
/// Attempts to walk to the last node of a word. The
/// search will fail if the word is not present. Doesn't
/// check if the node is terminating
///
/// - Parameter word: the word in question
/// - Returns: the node where the search ended, nil if the
/// search failed.
private func findLastNodeOf(word: String) -> Node? {
var currentNode = root
for character in word.lowercased().characters {
guard let childNode = currentNode.children[character] else {
return nil
}
currentNode = childNode
}
return currentNode
}
/// Attempts to walk to the terminating node of a word. The
/// search will fail if the word is not present.
///
/// - Parameter word: the word in question
/// - Returns: the node where the search ended, nil if the
/// search failed.
private func findTerminalNodeOf(word: String) -> Node? {
if let lastNode = findLastNodeOf(word: word) {
return lastNode.isTerminating ? lastNode : nil
}
return nil
}
/// Deletes a word from the trie by starting with the last letter
/// and moving back, deleting nodes until either a non-leaf or a
/// terminating node is found.
///
/// - Parameter terminalNode: the node representing the last node
/// of a word
private func deleteNodesForWordEndingWith(terminalNode: Node) {
var lastNode = terminalNode
var character = lastNode.value
while lastNode.isLeaf, let parentNode = lastNode.parentNode {
lastNode = parentNode
lastNode.children[character!] = nil
character = lastNode.value
if lastNode.isTerminating {
break
}
}
}
/// Removes a word from the trie. If the word is not present or
/// it is empty, just ignore it. If the last node is a leaf,
/// delete that node and higher nodes that are leaves until a
/// terminating node or non-leaf is found. If the last node of
/// the word has more children, the word is part of other words.
/// Mark the last node as non-terminating.
///
/// - Parameter word: the word to be removed
func remove(word: String) {
guard !word.isEmpty else {
return
}
guard let terminalNode = findTerminalNodeOf(word: word) else {
return
}
if terminalNode.isLeaf {
deleteNodesForWordEndingWith(terminalNode: terminalNode)
} else {
terminalNode.isTerminating = false
}
wordCount -= 1
}
/// Returns an array of words in a subtrie of the trie
///
/// - Parameters:
/// - rootNode: the root node of the subtrie
/// - partialWord: the letters collected by traversing to this node
/// - Returns: the words in the subtrie
fileprivate func wordsInSubtrie(rootNode: Node, partialWord: String) -> [String] {
var subtrieWords = [String]()
var previousLetters = partialWord
if let value = rootNode.value {
previousLetters.append(value)
}
if rootNode.isTerminating {
subtrieWords.append(previousLetters)
}
for childNode in rootNode.children.values {
let childWords = wordsInSubtrie(rootNode: childNode, partialWord: previousLetters)
subtrieWords += childWords
}
return subtrieWords
}
/// Returns an array of words in a subtrie of the trie that start
/// with given prefix
///
/// - Parameters:
/// - prefix: the letters for word prefix
/// - Returns: the words in the subtrie that start with prefix
func findWordsWithPrefix(prefix: String) -> [String] {
var words = [String]()
let prefixLowerCased = prefix.lowercased()
if let lastNode = findLastNodeOf(word: prefixLowerCased) {
if lastNode.isTerminating {
words.append(prefixLowerCased)
}
for childNode in lastNode.children.values {
let childWords = wordsInSubtrie(rootNode: childNode, partialWord: prefixLowerCased)
words += childWords
}
}
return words
}
}
//
// Prediction.swift
// AutoCorrect
//
// Created by Shaul Hameed on 25/07/17.
//
//
import UIKit
extension String {
var length: Int {
return self.characters.count
}
subscript (i: Int) -> String {
return self[Range(i ..< i + 1)]
}
func substring(from: Int) -> String {
return self[Range(min(from, length) ..< length)]
}
func substring(to: Int) -> String {
return self[Range(0 ..< max(0, to))]
}
subscript (r: Range<Int>) -> String {
let range = Range(uncheckedBounds: (lower: max(0, min(length, r.lowerBound)),
upper: min(length, max(0, r.upperBound))))
let start = index(startIndex, offsetBy: range.lowerBound)
let end = index(start, offsetBy: range.upperBound - range.lowerBound)
return self[Range(start ..< end)]
}
}
// Todo: Have to implement trie algorithm for correction and prediction
class Prediction: NSObject {
private var _words: [String]?
private var FilePath:URL?
public static var sharedInstance = Prediction()
private let letters = "abcdefghijklmnopqrstuvwxyz"
private var trie: Trie = Trie()
//Read only data.
var dictionary:[String]{
get {
guard let words = self._words else {
return self.readFromDictionary()
}
return words
}
}
override init() {
super.init()
// sets up the word from the library.
// we are using a temprory library right now.
// will be using users typed in characters to make the results more accurate.
self._words = self.readFromDictionary()
self.populateTrie(words: self.readFromDictionary())
}
private func populateTrie(words: [String]) {
for (_, word) in words.enumerated() {
self.trie.insert(word: word)
}
}
// reads all the data from the dictionary.
private func readFromDictionary() -> [String]{
//Begin reading the data from a dictionary.
if let directory = Bundle.main.url(forResource: "Dictionary", withExtension: "txt"){
self.FilePath = directory
do {
let wordAsString = try String(contentsOf: self.FilePath!, encoding: String.Encoding.utf8)
return self.extractWords(source: wordAsString)
}
catch _ {
// todo - write an custom error handler.
return [String]()
}
}
return [String]()
}
// returns all the words from the list.
private func extractWords(source: String) -> [String] {
let PATTERN = "\\w+"
do {
let regex = try NSRegularExpression(pattern: PATTERN, options: .caseInsensitive)
var nsSource:NSString = NSString(string: source.lowercased())
// This will sanitize the new line and whitespaces.
// We are uncerntain about the source.
nsSource = nsSource.components(separatedBy: NSCharacterSet.whitespacesAndNewlines).joined(separator: " ") as NSString
let searchRange = NSMakeRange(0, nsSource.length)
let words = regex.matches(in: nsSource as String, options: .reportCompletion, range: searchRange)
return words.map {
return nsSource.substring(with: $0.range)
}
}
catch _{
// Need to send a custom exception.
return [String]()
}
}
// Searches for the known word matching.
// Suits the case when there is no spelling mistakes in your text.
private func find(words: [String])-> Set<String>{
var matches: [[String]] = [[String]]()
for (_, word) in words.enumerated() {
// This is going to take N number of times for searching it.
matches.append(trie.findWordsWithPrefix(prefix: word))
}
return Set<String>(matches.flatMap{$0})
}
private func editsForOne(word: String)-> Set<String>{
let split: [(String, String)] = self.getSplit(forWord: word)
return self.find(words: Array(self.gerenateEditForOne(forSplit: split)))
}
private func gerenateEditForOne(forSplit: [(String, String)]) -> Set<String>{
var deletes: [String] = [String]()
var transposes: [String] = [String]()
var replaces: [String] = [String]()
var inserts: [String] = [String]()
for (_, value) in forSplit.enumerated() {
let left = "\(value.0)"
let right = "\(value.1)"
let sourceLength = right.characters.count
if(sourceLength > 1){
// generate transpose
let range = Range(2 ..< right.characters.count)
let transposedWord = "\(left)\(right[1])\(right[0])\(right[range])"
transposes.append(transposedWord)
}
if(sourceLength > 0){
// generates deletes
let deleteRange = Range(1 ..< right.characters.count)
let deletedWord = "\(left)\(right[deleteRange])"
deletes.append(deletedWord)
// generates replaces.
for c in letters.characters.enumerated() {
let replaceRange = Range(1 ..< right.characters.count)
let repacedWord = "\(left)\(c.element)\(right[replaceRange])"
replaces.append(repacedWord)
// generates inserts
let insertedWord = "\(left)\(c.element)\(right)"
inserts.append(insertedWord)
}
}
}
return Set<String>([deletes, transposes, replaces, inserts].flatMap{$0})
}
private func getDeletes(forSplit: [(String, String)]) -> Set<String>{
return Set<String>()
}
private func getSplit(forWord: String)-> [(String, String)]{
var splits:[(String, String)] = [(String, String)]()
var length = forWord.characters.count
for i in 0...forWord.characters.count {
let forwardRange = NSMakeRange(0, i)
let reverseRange = NSMakeRange(i, length)
let firstString = (forWord as NSString).substring(with: forwardRange)
let secondString = (forWord as NSString).substring(with: reverseRange)
splits.append((firstString, secondString))
length = length - 1
}
return splits;
}
private func convertTo( mutable: String)-> String{
// we are doing this to avoid segfault.
// i.e. making the immutable mutable.
var mutable = mutable
let leftShift = mutable.withMutableCharacters({ (chars) -> String.CharacterView in
return chars
})
return String(leftShift)
}
//MARK: Public functions
//PS: Need to write a benchmark for this.
//retuns back an array of corrected text.
func getCorrected(text: String) -> Set<String> {
// standard find.
let standardSearch = self.find(words:[text])
if(standardSearch.count < 1) {
let editForOneSearch = self.editsForOne(word: text)
if(editForOneSearch.count < 1){
var secondList:[[String]] = [[String]]()
let splits = self.getSplit(forWord: text)
for (_, word) in self.gerenateEditForOne(forSplit: splits).enumerated() {
let splitsForSecond = self.getSplit(forWord: word)
secondList.append(Array(self.gerenateEditForOne(forSplit: splitsForSecond)))
}
return self.find(words: secondList.flatMap{$0})
}
else {
return editForOneSearch
}
}
else {
return standardSearch
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment