Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
A translation of Siderite Zackwehdex's Sift4 algorithm for ranking how close one string is to another, which is a few orders of magnitude faster than calculating the Levenshtein distance
//
// Super Fast and Accurate string distance algorithm: Sift4
// http://siderite.blogspot.com/2014/11/super-fast-and-accurate-string-distance.html
//
public func sift4(firstString: String, secondString: String, maxOffset: Int = 5, maxDistance: Int? = nil) -> Int? {
if firstString.isEmpty {
return secondString.isEmpty ? 0 : count(secondString)
} else if secondString.isEmpty {
return count(firstString)
}
let s1 = firstString as NSString
let s2 = secondString as NSString
let l1 = s1.length;
let l2 = s2.length;
var c1 = 0 // cursor for string 1
var c2 = 0 // cursor for string 2
var lcss = 0 // largest common subsequence
var local_cs = 0 // local common substring
var numberOfTranspositions = 0 // e.g. 'ab' vs 'ba'
typealias Offset = (c1: Int, c2: Int, isTransposition: Bool)
var offsets: [Offset] = []; // offset pair array, for computing the transpositions
while ((c1 < l1) && (c2 < l2)) {
if (s1.characterAtIndex(c1) == s2.characterAtIndex(c2)) {
local_cs++;
var isTransposition = false
//see if current match is a transposition
var i = 0;
while i < count(offsets) {
var offset = offsets[i]
if c1 <= offset.c1 || c2 <= offset.c2 {
// when two matches cross, the one considered a transposition is the one with the largest difference in offsets
isTransposition = abs(c2 - c1) >= abs(offset.c2 - offset.c1)
if isTransposition {
numberOfTranspositions++
} else if !offset.isTransposition {
offset.isTransposition = true;
numberOfTranspositions++;
}
offsets[i] = offset
break
} else {
if c1 > offset.c2 && c2 > offset.c1 {
offsets.removeAtIndex(i)
} else {
i++
}
}
}
offsets.append((c1, c2, isTransposition))
} else {
lcss+=local_cs
local_cs = 0
if c1 != c2 {
let min_c = min(c1, c2) //using min allows the computation of transpositions
c1 = min_c
c2 = min_c
}
//if matching characters are found, remove 1 from both cursors (they get incremented at the end of the loop)
//so that we can have only one code block handling matches
for var i = 0; i < maxOffset && (c1+i < l1 || c2+i < l2); i++ {
if (c1 + i < l1) && (s1.characterAtIndex(c1 + i) == s2.characterAtIndex(c2)) {
c1 += i-1
c2--
break
}
if (c2 + i < l2) && (s1.characterAtIndex(c1) == s2.characterAtIndex(c2 + i)) {
c1--
c2 += i - 1
break
}
}
}
c1++
c2++
if let max_d = maxDistance where (max(c1, c2) - lcss + numberOfTranspositions) > max_d {
return nil
}
// this covers the case where the last match is on the last token in list, so that it can compute transpositions correctly
if (c1 >= l1) || (c2 >= l2) {
lcss += local_cs
local_cs = 0
let min_c = min(c1,c2)
c1 = min_c
c2 = min_c
}
}
lcss += local_cs;
return max(l1, l2) - lcss + numberOfTranspositions //add the cost of transpositions to the final result
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.