-
-
Save Daniel-Liu-c0deb0t/7078ebca04569068f15507aa856be6e8 to your computer and use it in GitHub Desktop.
/// ntHash constants. | |
static LUT: [u64; 128] = { | |
let mut l = [0u64; 128]; | |
l[b'A' as usize] = 0x3c8bfbb395c60474u64; | |
l[b'C' as usize] = 0x3193c18562a02b4cu64; | |
l[b'G' as usize] = 0x20323ed082572324u64; | |
l[b'T' as usize] = 0x295549f54be24456u64; | |
l | |
}; | |
/// Robust winnowing. | |
pub fn minimizers_callback(s: &[u8], w: usize, k: usize, prev: u64, mut f: impl FnMut(usize, u64)) { | |
let mut min = 0; | |
let mut min_idx = 0; | |
let mut curr = 0; | |
for (i, win) in s.windows(w).enumerate() { | |
if i == 0 || i > min_idx { | |
let (m_idx, m, c) = minimum(win, k, prev); | |
min_idx = i + m_idx; | |
min = m; | |
curr = c; | |
f(min_idx, min); | |
} else { | |
curr = curr.rotate_left(1) | |
^ LUT[win[w - 1 - k] as usize].rotate_left(k as u32) | |
^ LUT[win[w - 1] as usize]; | |
let h = prev ^ curr; | |
if h < min { | |
min_idx = i + w - k; | |
min = h; | |
f(min_idx, min); | |
} | |
} | |
} | |
} | |
/// Get the rightmost minimum kmer. | |
pub fn minimum(s: &[u8], k: usize, prev: u64) -> (usize, u64, u64) { | |
let mut curr = 0; | |
for (i, &b) in s[..k].iter().enumerate() { | |
curr ^= LUT[b as usize].rotate_left((k - 1 - i) as u32); | |
} | |
let mut min = prev ^ curr; | |
let mut min_idx = 0; | |
for (i, &b) in s[k..].iter().enumerate() { | |
curr = curr.rotate_left(1) ^ LUT[s[i] as usize].rotate_left(k as u32) ^ LUT[b as usize]; | |
let h = prev ^ curr; | |
if h <= min { | |
min = h; | |
min_idx = i + 1; | |
} | |
} | |
(min_idx, min, curr) | |
} |
Thank you for this code. I'm confused about what prev
does. Should I just always pass in prev = 0 if I don't want multiple hashes?
Yes! You can also just remove it from the code.
Ok, thanks!
There's a (not so small?) bug: On line 56 you want min_idx = i+1
instead.
Another possible issue: You probably want to return the rightmost minimizer of each window.
For string TATT
with k=1 and w=2, we have nthash(T) < nthash(A)
. So we get positions 0 (T in TA), 2 (T in AT).
But if we take the rightmost minimizer of each window of length 2, we must also include position 3 (2nd T in TT).
This is fixed by doing h<=min
in line 30.
There's a (not so small?) bug: On line 56 you want
min_idx = i+1
instead.
Thanks for spotting this! I've updated the gist. @jnalanko please take a look at your code and make sure you are using this fix!
Another possible issue: You probably want to return the rightmost minimizer of each window. For string
TATT
with k=1 and w=2, we haventhash(T) < nthash(A)
. So we get positions 0 (T in TA), 2 (T in AT). But if we take the rightmost minimizer of each window of length 2, we must also include position 3 (2nd T in TT). This is fixed by doingh<=min
in line 30.
As we have discussed in DMs, this is intended behavior.
Let me know if you spot any issues with this!
prev
is used as a hacky way to generate multiple different hashes.