Skip to content

Instantly share code, notes, and snippets.

# inquisitiveSoft/Sift4.swift Created Apr 17, 2015

A translation of Siderite Zackwehdex's Sift4 algorithm for ranking how close one string is to another, which is a few orders of magnitude faster than calculating the Levenshtein distance
 // // Super Fast and Accurate string distance algorithm: Sift4 // http://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html // public func sift4(firstString: String, secondString: String, maxOffset: Int = 5, maxDistance: Int? = nil) -> Int? { if firstString.isEmpty { return secondString.isEmpty ? 0 : count(secondString) } else if secondString.isEmpty { return count(firstString) } let s1 = firstString as NSString let s2 = secondString as NSString let l1 = s1.length; let l2 = s2.length; var c1 = 0 // cursor for string 1 var c2 = 0 // cursor for string 2 var lcss = 0 // largest common subsequence var local_cs = 0 // local common substring var numberOfTranspositions = 0 // e.g. 'ab' vs 'ba' typealias Offset = (c1: Int, c2: Int, isTransposition: Bool) var offsets: [Offset] = []; // offset pair array, for computing the transpositions while ((c1 < l1) && (c2 < l2)) { if (s1.characterAtIndex(c1) == s2.characterAtIndex(c2)) { local_cs++; var isTransposition = false //see if current match is a transposition var i = 0; while i < count(offsets) { var offset = offsets[i] if c1 <= offset.c1 || c2 <= offset.c2 { // when two matches cross, the one considered a transposition is the one with the largest difference in offsets isTransposition = abs(c2 - c1) >= abs(offset.c2 - offset.c1) if isTransposition { numberOfTranspositions++ } else if !offset.isTransposition { offset.isTransposition = true; numberOfTranspositions++; } offsets[i] = offset break } else { if c1 > offset.c2 && c2 > offset.c1 { offsets.removeAtIndex(i) } else { i++ } } } offsets.append((c1, c2, isTransposition)) } else { lcss+=local_cs local_cs = 0 if c1 != c2 { let min_c = min(c1, c2) //using min allows the computation of transpositions c1 = min_c c2 = min_c } //if matching characters are found, remove 1 from both cursors (they get incremented at the end of the loop) //so that we can have only one code block handling matches for var i = 0; i < maxOffset && (c1+i < l1 || c2+i < l2); i++ { if (c1 + i < l1) && (s1.characterAtIndex(c1 + i) == s2.characterAtIndex(c2)) { c1 += i-1 c2-- break } if (c2 + i < l2) && (s1.characterAtIndex(c1) == s2.characterAtIndex(c2 + i)) { c1-- c2 += i - 1 break } } } c1++ c2++ if let max_d = maxDistance where (max(c1, c2) - lcss + numberOfTranspositions) > max_d { return nil } // this covers the case where the last match is on the last token in list, so that it can compute transpositions correctly if (c1 >= l1) || (c2 >= l2) { lcss += local_cs local_cs = 0 let min_c = min(c1,c2) c1 = min_c c2 = min_c } } lcss += local_cs; return max(l1, l2) - lcss + numberOfTranspositions //add the cost of transpositions to the final result }
to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.