Last active
November 7, 2017 19:58
-
-
Save timbitz/cb728a9dcddf49f3c3a29e81102f752f to your computer and use it in GitHub Desktop.
Bitwise parallel implementations of gc_content and hasambiguity
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import BioSequences: index, offset, bitindex | |
function parallel_gc_content( seq::BioSequence ) | |
gc = 0 | |
s = bitindex(seq, 1) | |
e = bitindex(seq, length(seq)) | |
for i in index(s):index(e) | |
@inbounds data = seq.data[i] | |
# mask nucleotides upstream of first(seq.part) | |
if i == index(s) | |
data &= 0xFFFFFFFFFFFFFFFF << offset(s) | |
end | |
# mask nucleotides downstream of last(seq.part) | |
if i == index(e) | |
data &= 0xFFFFFFFFFFFFFFFF >> (64 - (offset(e)+4)) | |
end | |
# count unambiguous GC nucleotides | |
gc += count_ones(data & 0x6666666666666666) | |
end | |
return length(seq) > 0 ? gc / length(seq) : 0.0 | |
end | |
function parallel_hasambiguity( seq::BioSequence ) | |
nucs = 0 | |
s = bitindex(seq, 1) | |
e = bitindex(seq, length(seq)) | |
for i in index(s):index(e) | |
@inbounds data = seq.data[i] | |
# mask nucleotides upstream of first(seq.part) | |
if i == index(s) | |
data &= 0xFFFFFFFFFFFFFFFF << offset(s) | |
end | |
# mask nucleotides downstream of last(seq.part) | |
if i == index(e) | |
data &= 0xFFFFFFFFFFFFFFFF >> (64 - (offset(e)+4)) | |
end | |
# count | |
block_ones = count_ones(data) | |
if block_ones > 16 | |
return true | |
end | |
nucs += block_ones | |
end | |
return nucs > length(seq) ? true : false | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment