Last active
June 8, 2020 07:55
-
-
Save CiaranOMara/0f9a0d88e421f55b7f2e3b87d8cbf071 to your computer and use it in GitHub Desktop.
Parametric BioSymbols alphabet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#= | |
The purpose of this gist is to explore what a high-level interface or workflow might look like with `BioSymbols` as parametric primitive types. | |
For ease of conceptual development of a potential high-level interface, I have used `BitVector`s at the middle-level. | |
I have made no attempt in this example at low-level optimisations. | |
Operations performed with the `BitVector` would be replaced by optimised bit shiting procedures. | |
Ideally, the packing and unpacking methods would be refactored into iterators. | |
=# | |
abstract type BioSymbolEncoding end | |
abstract type BioSymbol end | |
abstract type NucleicAcid <: BioSymbol end | |
struct TwoBit <: BioSymbolEncoding end | |
primitive type DNA{E} <: NucleicAcid 8 end | |
abstract type BioSequence end # The container of compressed symbol encodings. | |
prefix(::DNA) = "DNA" | |
Base.bitstring(nt::DNA) = bitstring(reinterpret(UInt8, nt)) | |
function Base.show(io::IO, symbol::BioSymbol) | |
write(io, prefix(symbol), '_', convert(Char, symbol)) #TODO: factor out char retrieval. | |
return nothing | |
end | |
function Base.print(io::IO, symbol::BioSymbol) | |
write(io, convert(Char, symbol)) | |
return nothing | |
end | |
function createlookupvectors(table) | |
# Determine maximum required lengths. | |
max_encoding = maximum(last.(table)) | |
max_char = codepoint(maximum(first.(table))) | |
# Setup lookup vectors. #TODO: instead of undef, what could represent an invalid value not in the set? | |
encoding_to_char = Vector{Char}(undef, length(0x0:max_encoding)) | |
char_to_encoding = Vector{UInt8}(undef, length(0x0:max_char)) | |
# Place information into vector. | |
for (char, encoding) in table | |
cp = codepoint(char) | |
encoding_to_char[encoding] = cp | |
char_to_encoding[cp] = encoding | |
end | |
return encoding_to_char, char_to_encoding | |
end | |
struct BumpedEncoding{b} end #TODO: not sure whether this is a good idea - attempting to be expressive. | |
BumpedEncoding(b) = BumpedEncoding{b}() | |
struct Lookup{B<:BumpedEncoding} #TODO: perhaps this is the Alphabet!? | |
encoding_to_char | |
char_to_encoding | |
end | |
function Lookup{BumpedEncoding{true}}(table) | |
table = [(char, encoding + 0x01) for (char, encoding) in table] #TODO: remove addition and subtraction by using pointer offset. | |
return Lookup{BumpedEncoding{true}}(createlookupvectors(table)...) | |
end | |
function Lookup{BumpedEncoding{false}}(table)definition_dna_4bit | |
return Lookup{BumpedEncoding{false}}(createlookupvectors(table)...) | |
end | |
function Lookup(table) | |
min_encoding = minimum(last.(table)) | |
bumped = min_encoding == 0x0 | |
return Lookup{BumpedEncoding{bumped}}(table) | |
end | |
"Retrieve symbol char." | |
function (l::Lookup{BumpedEncoding{false}})(c::Char) | |
return l.char_to_encoding[codepoint(c)] | |
end | |
"Retrieve symbol char." | |
function (l::Lookup{BumpedEncoding{true}})(c::Char) | |
return l.char_to_encoding[codepoint(c)] - 0x01 #TODO: remove addition and subtraction by using pointer offset. | |
end | |
"Retrieve symbol encoding." | |
function (l::Lookup{BumpedEncoding{false}})(c::UInt8) | |
return l.encoding_to_char[c] | |
end | |
"Retrieve symbol encoding." | |
function (l::Lookup{BumpedEncoding{true}})(c::UInt8) | |
return l.encoding_to_char[c + 0x01] #TODO: remove addition and subtraction by using pointer offset. | |
end | |
"DNA 4bit symbol definitions." | |
const definition_dna_4bit = [ | |
('A', 0b0001), | |
('C', 0b0010), | |
('G', 0b0100), | |
('T', 0b1000), | |
] | |
const lookup_dna_4bit = Lookup(definition_dna_4bit) | |
@time lookup_dna_4bit('A') | |
@time lookup_dna_4bit(0x01) | |
"DNA 2bit symbol definitions." | |
const definition_dna_2bit = [ | |
('A', 0b10), | |
('C', 0b01), | |
('G', 0b11), | |
('T', 0b00), | |
] | |
const lookup_dna_2bit = Lookup(definition_dna_2bit) | |
@time lookup_dna_2bit('A') | |
@time lookup_dna_2bit(0x02) | |
BitsPerElem(::Type{DNA{TwoBit}}) = 2 # Actual TwoBit. | |
BitsPerElem(::Type{DNA{4}}) = 4 | |
table(::Type{DNA{TwoBit}}) = lookup_dna_2bit | |
table(::Type{DNA{4}}) = lookup_dna_4bit | |
function (s::Type{<:DNA})(bits::UInt8) #TODO: Perhaps this is where we goto the Alphabet and ask whether the encoding or char is in the alphabet's set of symbols. | |
return reinterpret(s, bits) #TODO: how do we know whether the bits are reasonable? | |
end | |
function (s::Type{<:DNA})(char::Char) #TODO: Perhaps this is where we goto the Alphabet and ask whether the encoding or char is in the alphabet's set of symbols. | |
bits = table(s)(uppercase(char)) # retrieving encoding. | |
return reinterpret(s, bits) | |
end | |
function Base.convert(char::Type{Char}, s::S) where S <: DNA | |
bits = reinterpret(UInt8, s) | |
return table(S)(bits) | |
end | |
function Base.convert(::Type{S}, char::Char) where S<:DNA | |
return reinterpret(S, table(S)(char)) | |
end | |
bits = table(DNA{TwoBit})(uppercase('c')) | |
@time DNA{TwoBit}('c') | |
convert(DNA{TwoBit}, 'A') | |
convert(Char, DNA{TwoBit}('A')) | |
collect(DNA{TwoBit}, ['G', 'A', 'T', 'C']) | |
bits = table(DNA{4})(uppercase('c')) | |
@time DNA{4}('c') | |
convert(DNA{4}, 'A') | |
convert(Char, DNA{4}('A')) | |
collect(DNA{4}, ['G', 'A', 'T', 'C']) | |
"Default outward facing DNA symbols with 4bit encoding." | |
for (char, bits) in definition_dna_4bit | |
var = Symbol("DNA_", char) | |
@eval begin | |
const $(var) = reinterpret(DNA{4}, $(bits)) | |
end | |
end | |
da = DNA_A | |
typeof(da) | |
reinterpret(UInt8, DNA_A) | |
reinterpret(UInt8, DNA_C) | |
reinterpret(UInt8, DNA_T) | |
reinterpret(UInt8, DNA_G) | |
function Base.convert(::Type{SA}, s::SB) where {SA <: DNA, SB <: DNA} | |
char_b = convert(Char, s) #Note: converting between encodings through common Char. | |
return convert(SA, char_b) | |
end | |
d2 = convert(DNA{TwoBit}, DNA_A) | |
d4 = convert(DNA{4}, DNA{TwoBit}('A')) | |
bitstring(d2) | |
bitstring(d4) | |
""" | |
Compress symbol encoding. | |
Retrieve important bits from encoded byte. | |
""" | |
function compress(nt::S) where S <: BioSymbol | |
bv = BitVector() | |
bv.len = BitsPerElem(S) | |
push!(bv.chunks, reinterpret(UInt8, nt)) | |
return bv | |
end | |
compress(DNA_A) | |
compress(DNA{TwoBit}('A')) | |
""" | |
Illustrative container. | |
This would be LongSequence, NTupleKMer, etc. | |
""" | |
struct CompressedSymbols{S<:BioSymbol} <: BioSequence | |
bv::BitVector | |
function CompressedSymbols{S}(nts::Vector{S}) where S <: BioSymbol | |
return new(pack(CompressedSymbols{S}, nts::Vector{S})) # Note: this is where you could dispatch to different packers. | |
end | |
end | |
function CompressedSymbols(nts::Vector{S}) where S <: BioSymbol | |
return CompressedSymbols{S}(nts) | |
end | |
function CompressedSymbols{S}(nts) where S <: BioSymbol | |
return CompressedSymbols{S}(collect(S, nts)) | |
end | |
"Quick container trait." | |
ElemOrder(::Type{CompressedSymbols}) = true # LeftToRight, RightToLeft. | |
""" | |
Pack symbols into a specified container type. | |
""" | |
function pack(::Type{CompressedSymbols{S}}, nts::Vector{S}) where S <: BioSymbol | |
#TODO: explore IOBuffer for writing chunks. | |
#TODO: explore the TanscodingStreams.jl paradigm. This packer/compressor looks like a codec. | |
N = BitsPerElem(S) | |
bv = BitVector(undef,length(nts) * N) | |
nts = compress.(nts) | |
if ElemOrder(CompressedSymbols) # TODO: dispatch to the best packing algorithm based on this trait. | |
reverse!.(nts) #Note: reverse makes bits easier to reconcile at the moment. | |
end | |
i = 1 | |
for bits in nts | |
bv[i:i+N-1] .= bits | |
i = i + N | |
end | |
# return container(bv) | |
return bv | |
end | |
# A: 0b10 | |
# C: 0b01 | |
# G: 0b11 | |
# T: 0b00 | |
# A: 0b10 | |
packed2 = CompressedSymbols(collect(DNA{TwoBit}, [ 'A', 'C', 'G', 'T', 'A'])) | |
reshape(packed2.bv, 2, :)' | |
# A: 0b0001 | |
# C: 0b0010 | |
# G: 0b0100 | |
# T: 0b1000 | |
# A: 0b0001 | |
packed4 = CompressedSymbols([DNA_A, DNA_C, DNA_G, DNA_T, DNA_A]) | |
reshape(packed4.bv, 4, :)' | |
""" | |
Move compressed encoding back into a byte. | |
""" | |
function Base.UInt8(bv::BitVector) | |
length(bv) <= 8 || error() | |
p = Ptr{UInt8}(pointer(bv.chunks)) | |
return unsafe_load(p) #TODO: not sure wether this is tracked for GC. | |
# return S(unsafe_load(p)) | |
# p = Ptr{S}(pointer(bv.chunks)) | |
# return unsafe_load(p) #TODO: not sure wether this is tracked for GC. | |
end | |
""" | |
Unpack compressed symbols. | |
""" | |
function unpack(container::CompressedSymbols{S}) where S <: BioSymbol | |
n = BitsPerElem(S) | |
L = length(container.bv) | |
bvs = [container.bv[lo:hi] for (lo, hi) in zip(1:n:L, n:n:L)] | |
if ElemOrder(CompressedSymbols) # TODO: dispatch to the best packing algorithm based on this trait. | |
reverse!.(bvs) #Note: reverse makes bits easier to reconcile at the moment. | |
end | |
ints = UInt8.(bvs) | |
return S.(ints) #Note: going through the symbol constructor checks -- options here. TODO: use Alphabet to check whether UInts are in the Alphabet's set of Symbols. | |
end | |
unpack(packed2) | |
unpack(packed4) | |
packed = CompressedSymbols{DNA{TwoBit}}(DNA{4}.(['A','C','G','T','A'])) | |
unpack(packed) | |
@time packed = CompressedSymbols{DNA{TwoBit}}("ACGTA") | |
unpack(packed) | |
@time packed = CompressedSymbols{DNA{4}}("ACGTA") | |
unpack(packed) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment