-
-
Save Daniel-Liu-c0deb0t/7078ebca04569068f15507aa856be6e8 to your computer and use it in GitHub Desktop.
| /// ntHash constants. | |
| static LUT: [u64; 128] = { | |
| let mut l = [0u64; 128]; | |
| l[b'A' as usize] = 0x3c8bfbb395c60474u64; | |
| l[b'C' as usize] = 0x3193c18562a02b4cu64; | |
| l[b'G' as usize] = 0x20323ed082572324u64; | |
| l[b'T' as usize] = 0x295549f54be24456u64; | |
| l | |
| }; | |
| /// Robust winnowing. | |
| pub fn minimizers_callback(s: &[u8], w: usize, k: usize, prev: u64, mut f: impl FnMut(usize, u64)) { | |
| let mut min = 0; | |
| let mut min_idx = 0; | |
| let mut curr = 0; | |
| for (i, win) in s.windows(w).enumerate() { | |
| if i == 0 || i > min_idx { | |
| let (m_idx, m, c) = minimum(win, k, prev); | |
| min_idx = i + m_idx; | |
| min = m; | |
| curr = c; | |
| f(min_idx, min); | |
| } else { | |
| curr = curr.rotate_left(1) | |
| ^ LUT[win[w - 1 - k] as usize].rotate_left(k as u32) | |
| ^ LUT[win[w - 1] as usize]; | |
| let h = prev ^ curr; | |
| if h < min { | |
| min_idx = i + w - k; | |
| min = h; | |
| f(min_idx, min); | |
| } | |
| } | |
| } | |
| } | |
| /// Get the rightmost minimum kmer. | |
| pub fn minimum(s: &[u8], k: usize, prev: u64) -> (usize, u64, u64) { | |
| let mut curr = 0; | |
| for (i, &b) in s[..k].iter().enumerate() { | |
| curr ^= LUT[b as usize].rotate_left((k - 1 - i) as u32); | |
| } | |
| let mut min = prev ^ curr; | |
| let mut min_idx = 0; | |
| for (i, &b) in s[k..].iter().enumerate() { | |
| curr = curr.rotate_left(1) ^ LUT[s[i] as usize].rotate_left(k as u32) ^ LUT[b as usize]; | |
| let h = prev ^ curr; | |
| if h <= min { | |
| min = h; | |
| min_idx = i + 1; | |
| } | |
| } | |
| (min_idx, min, curr) | |
| } |
Thank you for this code. I'm confused about what prev does. Should I just always pass in prev = 0 if I don't want multiple hashes?
Yes! You can also just remove it from the code.
Ok, thanks!
There's a (not so small?) bug: On line 56 you want min_idx = i+1 instead.
Another possible issue: You probably want to return the rightmost minimizer of each window.
For string TATT with k=1 and w=2, we have nthash(T) < nthash(A). So we get positions 0 (T in TA), 2 (T in AT).
But if we take the rightmost minimizer of each window of length 2, we must also include position 3 (2nd T in TT).
This is fixed by doing h<=min in line 30.
There's a (not so small?) bug: On line 56 you want
min_idx = i+1instead.
Thanks for spotting this! I've updated the gist. @jnalanko please take a look at your code and make sure you are using this fix!
Another possible issue: You probably want to return the rightmost minimizer of each window. For string
TATTwith k=1 and w=2, we haventhash(T) < nthash(A). So we get positions 0 (T in TA), 2 (T in AT). But if we take the rightmost minimizer of each window of length 2, we must also include position 3 (2nd T in TT). This is fixed by doingh<=minin line 30.
As we have discussed in DMs, this is intended behavior.
Let me know if you spot any issues with this!
previs used as a hacky way to generate multiple different hashes.