Created
June 8, 2020 13:52
-
-
Save CiaranOMara/79c1889262c3d776207acb8c37c84e8b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#= | |
The purpose of this gist is to explore what a high-level interface or workflow might look like with `BioSymbols` as parametric primitive types. | |
For ease of conceptual development of a potential high-level interface, I have used `BitVector`s at the middle-level. | |
I have made no attempt in this example at low-level optimisations. | |
Operations performed with the `BitVector` would be replaced by optimised bit shiting procedures. | |
Ideally, the packing and unpacking methods would be refactored into iterators. | |
=# | |
abstract type BioSymbolEncoding end | |
abstract type BioSymbol end | |
abstract type NucleicAcid <: BioSymbol end | |
struct TwoBit <: BioSymbolEncoding end | |
primitive type DNA{E} <: NucleicAcid 8 end | |
abstract type BioSequence end # The container of compressed symbol encodings. | |
prefix(::DNA) = "DNA" | |
Base.bitstring(nt::DNA) = bitstring(reinterpret(UInt8, nt)) | |
function Base.show(io::IO, symbol::S) where S<: BioSymbol | |
write(io, prefix(symbol), '_', lookup_char(alphabet(S), reinterpret(UInt8, symbol))) | |
return nothing | |
end | |
function Base.print(io::IO, symbol::BioSymbol) | |
write(io, lookup_char(alphabet(S), reinterpret(UInt8, symbol))) | |
return nothing | |
end | |
function createlookupvectors(definitions; empty_char = typemax(Char), empty_encoding = typemax(UInt8)) | |
# Determine maximum required lengths. | |
max_encoding = maximum(last.(definitions)) + 1 #Note: adjusting for offset pointer access. | |
max_char = codepoint(maximum(first.(definitions))) | |
# Setup lookup vectors. | |
encoding_to_char = fill(empty_char, length(0b0:max_encoding)) #Note: assuming typemax sufficiently out of bounds. | |
char_to_encoding = fill(empty_encoding, length(0b0:max_char)) #Note: assuming typemax sufficiently out of bounds. | |
# Place information into vector. | |
for (char, encoding) in definitions | |
cp = codepoint(char) | |
encoding_to_char[encoding + 1] = cp # TODO: is there a more efficient method of loading via pointer offest? | |
char_to_encoding[cp] = encoding | |
# p_encoding_to_char = Ptr{Char}(pointer(encoding_to_char) + encoding * 8) #TODO: can we assume contiguous addressing here? | |
# unsafe_store!(p_encoding_to_char, cp) | |
end | |
return encoding_to_char, char_to_encoding, empty_char, empty_encoding | |
end | |
struct Alphabet | |
encoding_to_char # Note: larger at the top. | |
char_to_encoding | |
empty_char | |
empty_encoding | |
function Alphabet(table) | |
return new(createlookupvectors(table)...) | |
end | |
end | |
function _encoding(a::Alphabet, c::Char) | |
encoding = a.char_to_encoding[codepoint(c)] | |
return encoding | |
end | |
function _char(a::Alphabet, encoding::UInt8) | |
encoding = a.encoding_to_char[encoding + 1] | |
# p = Ptr{Char}(pointer(a.encoding_to_char) + encoding * 8) #TODO: can we assume contiguous addressing here? | |
# return unsafe_load(p) #TODO: not sure whether this is tracked for GC. | |
end | |
function has_encoding(a::Alphabet, bits::UInt8) | |
char = lookup_char(a, bits) | |
resolved = lookup_encoding(a, char) | |
return bits == resolved # Note: Determines whether we're back where we started. | |
end | |
function has_char(a::Alphabet, char::Char) | |
encoding = lookup_encoding(a, char) | |
resolved = lookup_char(a, encoding) | |
return char == resolved # Note: Determines whether we're back where we started. | |
end | |
"Retrieve symbol char." | |
function lookup_encoding(a::Alphabet, c::Char) | |
c = uppercase(c) #TODO: how safe do we want to be? What about having a longer lookup vector? | |
encoding = _encoding(a, c)# function _char(a::Alphabet{BumpedEncoding{false}}, encoding::UInt8) | |
encoding == a.empty_encoding && error("Not in the Alphabet's set of symbols.") | |
return encoding | |
end | |
"Retrieve symbol encoding." | |
function lookup_char(a::Alphabet, encoding::UInt8) | |
char = _char(a, encoding) | |
char == a.empty_char && error("Not in the Alphabet's set of symbols.") | |
return char | |
end | |
"DNA 4bit symbol definitions." | |
const definitions_dna_4bit = [ | |
('A', 0b0001), | |
('C', 0b0010), | |
('G', 0b0100), | |
('T', 0b1000), | |
] | |
const alphabet_dna_4bit = Alphabet(definitions_dna_4bit) | |
lookup_encoding(alphabet_dna_4bit, 'A') | |
lookup_encoding(alphabet_dna_4bit, 'C') | |
lookup_encoding(alphabet_dna_4bit, 'G') | |
lookup_encoding(alphabet_dna_4bit, 'T') | |
lookup_char(alphabet_dna_4bit, 0b0001) | |
lookup_char(alphabet_dna_4bit, 0b0010) | |
lookup_char(alphabet_dna_4bit, 0b0100) | |
lookup_char(alphabet_dna_4bit, 0b1000) | |
"DNA 2bit symbol definitions." | |
const definitions_dna_2bit = [ | |
('A', 0b10), | |
('C', 0b01), | |
('G', 0b11), | |
('T', 0b00), | |
] | |
const alphabet_dna_2bit = Alphabet(definitions_dna_2bit) | |
lookup_encoding(alphabet_dna_2bit, 'A') | |
lookup_encoding(alphabet_dna_2bit, 'C') | |
lookup_encoding(alphabet_dna_2bit, 'G') | |
lookup_encoding(alphabet_dna_2bit, 'T') | |
lookup_char(alphabet_dna_2bit, 0b10) | |
lookup_char(alphabet_dna_2bit, 0b01) | |
lookup_char(alphabet_dna_2bit, 0b11) | |
lookup_char(alphabet_dna_2bit, 0b00) | |
BitsPerElem(::Type{DNA{TwoBit}}) = 2 # Actual TwoBit. | |
BitsPerElem(::Type{DNA{4}}) = 4 | |
alphabet(::Type{DNA{TwoBit}}) = alphabet_dna_2bit | |
alphabet(::Type{DNA{4}}) = alphabet_dna_4bit | |
function (::Type{S})(bits::UInt8) where S <:BioSymbol | |
has_encoding(alphabet(S), bits) || error("Not in the Alphabet's set of symbols.") | |
return reinterpret(S, bits) | |
end | |
function (::Type{S})(char::Char) where S <:BioSymbol | |
encoding = lookup_encoding(alphabet(S), char) # retrieving encoding, should throw error if char is not in the alphabet's set of symbols. | |
return reinterpret(S, encoding) | |
end | |
function Base.convert(char::Type{Char}, symbol::S) where S <: BioSymbol | |
encoding = reinterpret(UInt8, symbol) | |
return lookup_char(alphabet(S), encoding) | |
end | |
function Base.convert(::Type{S}, char::Char) where S<:DNA #TODO: the body of this convert is the same as the constructor. | |
encoding = lookup_encoding(alphabet(S), char) | |
return reinterpret(S, encoding) | |
end | |
encoding = lookup_encoding(alphabet(DNA{TwoBit}), 'C') | |
encoding = lookup_encoding(alphabet(DNA{TwoBit}), 'c') | |
DNA{TwoBit}('c') | |
convert(DNA{TwoBit}, 'A') | |
convert(Char, DNA{TwoBit}('A')) | |
collect(DNA{TwoBit}, ['G', 'A', 'T', 'C']) | |
encoding = lookup_encoding(alphabet(DNA{4}), 'c') | |
DNA{4}('c') | |
convert(DNA{4}, 'A') | |
convert(Char, DNA{4}('A')) | |
collect(DNA{4}, ['G', 'A', 'T', 'C']) | |
"Default outward facing DNA symbols with 4bit encoding." | |
for (char, encoding) in definitions_dna_4bit | |
var = Symbol("DNA_", char) | |
@eval begin | |
const $(var) = reinterpret(DNA{4}, $(encoding)) | |
end | |
end | |
da = DNA_A | |
typeof(da) | |
reinterpret(UInt8, DNA_A) | |
reinterpret(UInt8, DNA_C) | |
reinterpret(UInt8, DNA_T) | |
reinterpret(UInt8, DNA_G) | |
function Base.convert(::Type{SA}, s::SB) where {SA <: BioSymbol, SB <: BioSymbol} | |
char_b = convert(Char, s) #Note: converting between encodings through common Char. | |
return convert(SA, char_b) | |
end | |
d2 = convert(DNA{TwoBit}, DNA_A) | |
d4 = convert(DNA{4}, DNA{TwoBit}('A')) | |
bitstring(d2) | |
bitstring(d4) | |
""" | |
Compress symbol encoding. | |
Retrieve important bits from encoded byte. | |
""" | |
function compress(nt::S) where S <: BioSymbol | |
bv = BitVector() | |
bv.len = BitsPerElem(S) | |
push!(bv.chunks, reinterpret(UInt8, nt)) | |
return bv | |
end | |
compress(DNA_A) | |
compress(DNA{TwoBit}('A')) | |
""" | |
Illustrative container. | |
This would be LongSequence, NTupleKMer, etc. | |
""" | |
struct CompressedSymbols{S<:BioSymbol} <: BioSequence | |
bv::BitVector | |
function CompressedSymbols{S}(nts::Vector{S}) where S <: BioSymbol | |
return new(pack(CompressedSymbols{S}, nts::Vector{S})) # Note: this is where you could dispatch to different packers. | |
end | |
end | |
function CompressedSymbols(nts::Vector{S}) where S <: BioSymbol | |
return CompressedSymbols{S}(nts) | |
end | |
function CompressedSymbols{S}(nts) where S <: BioSymbol | |
return CompressedSymbols{S}(collect(S, nts)) | |
end | |
"Quick container trait." | |
ElemOrder(::Type{CompressedSymbols}) = true # LeftToRight, RightToLeft. | |
""" | |
Pack symbols into a specified container type. | |
""" | |
function pack(::Type{CompressedSymbols{S}}, nts::Vector{S}) where S <: BioSymbol | |
#TODO: explore IOBuffer for writing chunks. | |
#TODO: explore the TanscodingStreams.jl paradigm. This packer/compressor looks like a codec. | |
N = BitsPerElem(S) | |
bv = BitVector(undef,length(nts) * N) | |
nts = compress.(nts) | |
if ElemOrder(CompressedSymbols) # TODO: dispatch to the best packing algorithm based on this trait. | |
reverse!.(nts) #Note: reverse makes bits easier to reconcile at the moment. | |
end | |
i = 1 | |
for bits in nts | |
bv[i:i+N-1] .= bits | |
i = i + N | |
end | |
return bv | |
end | |
# A: 0b10 | |
# C: 0b01 | |
# G: 0b11 | |
# T: 0b00 | |
# A: 0b10 | |
packed2 = CompressedSymbols(collect(DNA{TwoBit}, [ 'A', 'C', 'G', 'T', 'A'])) | |
reshape(packed2.bv, 2, :)' | |
# A: 0b0001 | |
# C: 0b0010 | |
# G: 0b0100 | |
# T: 0b1000 | |
# A: 0b0001 | |
packed4 = CompressedSymbols([DNA_A, DNA_C, DNA_G, DNA_T, DNA_A]) | |
reshape(packed4.bv, 4, :)' | |
""" | |
Move compressed intsencoding back into a byte. | |
""" | |
function Base.UInt8(bv::BitVector) | |
length(bv) <= 8 || error() | |
p = Ptr{UInt8}(pointer(bv.chunks)) | |
return unsafe_load(p) #TODO: not sure whether the returned value is tracked for GC. | |
end | |
""" | |
Unpack compressed symbols. | |
""" | |
function unpack(container::CompressedSymbols{S}) where S <: BioSymbol | |
n = BitsPerElem(S) | |
L = length(container.bv) | |
bvs = [container.bv[lo:hi] for (lo, hi) in zip(1:n:L, n:n:L)] | |
if ElemOrder(CompressedSymbols) # TODO: dispatch to the best packing algorithm based on this trait. | |
reverse!.(bvs) #Note: reverse makes bits easier to reconcile at the moment. | |
end | |
ints = UInt8.(bvs) #TODO: move into a constructor? | |
return S.(ints) #Note: going through the symbol constructor, which checks whther the encoding is in the Alphabet's set of Symbols. | |
end | |
@time unpack(packed2) | |
@time unpack(packed4) | |
@time packed = CompressedSymbols{DNA{TwoBit}}(DNA{4}.(['A','C','G','T','A'])) | |
@time unpack(packed) | |
@time packed = CompressedSymbols{DNA{TwoBit}}("ACGTA") | |
@time unpack(packed) | |
@time packed = CompressedSymbols{DNA{4}}("ACGTA") | |
@time unpack(packed) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment