Last active
December 24, 2023 02:37
-
-
Save Daniel-Liu-c0deb0t/7078ebca04569068f15507aa856be6e8 to your computer and use it in GitHub Desktop.
Robust winnowing with ntHash
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// ntHash constants. | |
static LUT: [u64; 128] = { | |
let mut l = [0u64; 128]; | |
l[b'A' as usize] = 0x3c8bfbb395c60474u64; | |
l[b'C' as usize] = 0x3193c18562a02b4cu64; | |
l[b'G' as usize] = 0x20323ed082572324u64; | |
l[b'T' as usize] = 0x295549f54be24456u64; | |
l | |
}; | |
/// Robust winnowing. | |
pub fn minimizers_callback(s: &[u8], w: usize, k: usize, prev: u64, mut f: impl FnMut(usize, u64)) { | |
let mut min = 0; | |
let mut min_idx = 0; | |
let mut curr = 0; | |
for (i, win) in s.windows(w).enumerate() { | |
if i == 0 || i > min_idx { | |
let (m_idx, m, c) = minimum(win, k, prev); | |
min_idx = i + m_idx; | |
min = m; | |
curr = c; | |
f(min_idx, min); | |
} else { | |
curr = curr.rotate_left(1) | |
^ LUT[win[w - 1 - k] as usize].rotate_left(k as u32) | |
^ LUT[win[w - 1] as usize]; | |
let h = prev ^ curr; | |
if h < min { | |
min_idx = i + w - k; | |
min = h; | |
f(min_idx, min); | |
} | |
} | |
} | |
} | |
/// Get the rightmost minimum kmer. | |
pub fn minimum(s: &[u8], k: usize, prev: u64) -> (usize, u64, u64) { | |
let mut curr = 0; | |
for (i, &b) in s[..k].iter().enumerate() { | |
curr ^= LUT[b as usize].rotate_left((k - 1 - i) as u32); | |
} | |
let mut min = prev ^ curr; | |
let mut min_idx = 0; | |
for (i, &b) in s[k..].iter().enumerate() { | |
curr = curr.rotate_left(1) ^ LUT[s[i] as usize].rotate_left(k as u32) ^ LUT[b as usize]; | |
let h = prev ^ curr; | |
if h <= min { | |
min = h; | |
min_idx = i + 1; | |
} | |
} | |
(min_idx, min, curr) | |
} |
Another possible issue: You probably want to return the rightmost minimizer of each window. For string
TATT
with k=1 and w=2, we haventhash(T) < nthash(A)
. So we get positions 0 (T in TA), 2 (T in AT). But if we take the rightmost minimizer of each window of length 2, we must also include position 3 (2nd T in TT). This is fixed by doingh<=min
in line 30.
As we have discussed in DMs, this is intended behavior.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for spotting this! I've updated the gist. @jnalanko please take a look at your code and make sure you are using this fix!