Skip to content

Instantly share code, notes, and snippets.

@k3zi
Last active April 4, 2021 14:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save k3zi/d4d711c5bba68e447402030e374059cc to your computer and use it in GitHub Desktop.
Save k3zi/d4d711c5bba68e447402030e374059cc to your computer and use it in GitHub Desktop.
Needleman-Wunsch Algorithm
import Foundation
enum NeedlemanWunsch {
enum Origin {
case top
case left
case diagonal
}
enum Match {
case indexAndValue(Int, Character)
case missing
}
enum Error: Swift.Error {
case couldNotFindOverlap
}
static func align(input1 seq1: [Character], input2 seq2: [Character], match: Int = 5, substitution: Int = -3, gap: Int = -2, offset1: Int = 0, offset2: Int = 0) throws -> (output1: [Match], output2: [Match]) {
let m = min(seq1.count, seq2.count)
if m < 20000 {
return partialAlign(input1: seq1, input2: seq2, match: match, substitution: substitution, gap: gap, offset1: offset1, offset2: offset2)
}
let seq1Halves = seq1.split()
let seq2Halves = seq2.split()
let overlap = m / 8
let left = try align(
input1: seq1Halves[0] + seq1Halves[1].prefix(upTo: overlap),
input2: seq2Halves[0] + seq2Halves[1].prefix(upTo: overlap),
match: match, substitution: substitution, gap: gap,
offset1: offset1,
offset2: offset2
)
let right = try align(
input1: seq1Halves[0].suffix(overlap) + seq1Halves[1],
input2: seq2Halves[0].suffix(overlap) + seq2Halves[1],
match: match, substitution: substitution, gap: gap,
offset1: offset1 + seq1Halves[0].count - overlap,
offset2: offset2 + seq2Halves[0].count - overlap
)
let middle = try align(
input1: seq1Halves[0].suffix(overlap) + seq1Halves[1].prefix(upTo: overlap),
input2: seq2Halves[0].suffix(overlap) + seq2Halves[1].prefix(upTo: overlap),
match: match, substitution: substitution, gap: gap,
offset1: offset1 + seq1Halves[0].count - overlap,
offset2: offset2 + seq2Halves[0].count - overlap
)
let zipped = Array(zip(middle.output1, middle.output2))
let firstOverlapOptionalIndex = zipped.firstSeries(length: 10, where: {
if case let .indexAndValue(_, first) = $0.0, case let .indexAndValue(_, second) = $0.1 {
return first == second
}
return false
})
let lastOverlapOptionalIndex = zipped.lastSeries(length: 10, where: {
if case let .indexAndValue(_, first) = $0.0, case let .indexAndValue(_, second) = $0.1 {
return first == second
}
return false
})
guard let firstOverlapIndex = firstOverlapOptionalIndex, let lastOverlapIndex = lastOverlapOptionalIndex else {
throw Error.couldNotFindOverlap
}
let firstOverlapOptional = middle.output1[firstOverlapIndex]
let lastOverlapOptional = middle.output1[lastOverlapIndex]
guard case let .indexAndValue(seq1OverlapFirstIndex, _) = firstOverlapOptional else {
throw Error.couldNotFindOverlap
}
guard case let .indexAndValue(seq1OverlapLastIndex, _) = lastOverlapOptional else {
throw Error.couldNotFindOverlap
}
let matchingLeftIndexOptional = left.output1.lastIndex(where: {
if case let .indexAndValue(index, _) = $0, index == seq1OverlapFirstIndex {
return true
}
return false
})
let matchingRightIndexOptional = right.output1.firstIndex(where: {
if case let .indexAndValue(index, _) = $0, index == seq1OverlapLastIndex {
return true
}
return false
})
guard let leftIndex = matchingLeftIndexOptional, let rightIndex = matchingRightIndexOptional else {
throw Error.couldNotFindOverlap
}
let output1 = Array(left.output1[..<leftIndex]) + Array(middle.output1[firstOverlapIndex...lastOverlapIndex]) + Array(right.output1[rightIndex...])
let output2 = Array(left.output2[..<leftIndex]) + Array(middle.output2[firstOverlapIndex...lastOverlapIndex]) + Array(right.output2[rightIndex...])
return (output1, output2)
}
static func partialAlign(input1 seq1: [Character], input2 seq2: [Character], match: Int = 5, substitution: Int = -3, gap: Int = -2, offset1: Int = 0, offset2: Int = 0) -> (output1: [Match], output2: [Match]) {
var scores: [[Int]] = Array(repeating: Array(repeating: 0, count: seq1.count + 1), count: seq2.count + 1)
var paths: [[[Origin]]] = Array(repeating: Array(repeating: [], count: seq1.count + 1), count: seq2.count + 1)
for j in 1...seq1.count {
scores[0][j] = scores[0][j - 1] &+ gap
paths[0][j] = [.left]
}
for i in 1...seq2.count {
scores[i][0] = scores[i - 1][0] &+ gap
paths[i][0] = [.top]
}
for i in 1...seq2.count {
let diagSeq2 = seq2[i &- 1]
for j in 1...seq1.count {
if i == 1 {
paths[i][j].reserveCapacity(seq1.count)
}
let fromTop = scores[i &- 1][j] &+ gap
let fromLeft = scores[i][j &- 1] &+ gap
let fromDiagonal = scores[i &- 1][j &- 1] &+ (seq1[j &- 1] == diagSeq2 ? match : substitution)
let fromMax = max(fromTop, fromLeft, fromDiagonal)
scores[i][j] = fromMax
if fromDiagonal == fromMax { paths[i][j].append(.diagonal) }
if fromTop == fromMax { paths[i][j].append(.top) }
if fromLeft == fromMax { paths[i][j].append(.left) }
}
}
var i = seq2.count
var j = seq1.count
var output1 = [Match]()
var output2 = [Match]()
let m = max(i, j)
output1.reserveCapacity(m)
output2.reserveCapacity(m)
while i != 0 || j != 0 {
switch paths[i][j].first! {
case .diagonal:
output1.insert(.indexAndValue(j - 1 + offset1, seq1[j &- 1]), at: .zero)
output2.insert(.indexAndValue(i - 1 + offset2, seq2[i &- 1]), at: .zero)
i &-= 1
j &-= 1
case .left:
output1.insert(.indexAndValue(j &- 1 &+ offset1, seq1[j &- 1]), at: .zero)
output2.insert(.missing, at: .zero)
j &-= 1
case .top:
output1.insert(.missing, at: 0)
output2.insert(.indexAndValue(i &- 1 &+ offset2, seq2[i &- 1]), at: .zero)
i &-= 1
}
}
return (output1, output2)
}
}
extension Array {
func firstSeries(length: Int, where handler: (Element) -> Bool) -> Index? {
var startIndex = self.startIndex
var index = startIndex
var currentLength = 0
while currentLength != length && index != endIndex {
if handler(self[index]) {
currentLength += 1
index += 1
} else {
currentLength = 0
index += 1
startIndex = index
}
}
return currentLength == length ? startIndex : nil
}
func lastSeries(length: Int, where handler: (Element) -> Bool) -> Index? {
guard !isEmpty else { return nil }
var startIndex = self.endIndex.advanced(by: -1)
var index = startIndex
var currentLength = 0
while currentLength != length && index != self.startIndex {
if handler(self[index]) {
currentLength += 1
index -= 1
} else {
currentLength = 0
index -= 1
startIndex = index
}
}
return currentLength == length ? (index + 1) : nil
}
func split() -> [[Element]] {
let half = count / 2
let leftSplit = self[0 ..< half]
let rightSplit = self[half ..< count]
return [Array(leftSplit), Array(rightSplit)]
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment