Skip to content

Instantly share code, notes, and snippets.

@CiaranOMara
Last active June 8, 2020 07:55
Show Gist options
  • Save CiaranOMara/0f9a0d88e421f55b7f2e3b87d8cbf071 to your computer and use it in GitHub Desktop.
Save CiaranOMara/0f9a0d88e421f55b7f2e3b87d8cbf071 to your computer and use it in GitHub Desktop.
Parametric BioSymbols alphabet
#=
The purpose of this gist is to explore what a high-level interface or workflow might look like with `BioSymbols` as parametric primitive types.
For ease of conceptual development of a potential high-level interface, I have used `BitVector`s at the middle-level.
I have made no attempt in this example at low-level optimisations.
Operations performed with the `BitVector` would be replaced by optimised bit shiting procedures.
Ideally, the packing and unpacking methods would be refactored into iterators.
=#
abstract type BioSymbolEncoding end
abstract type BioSymbol end
abstract type NucleicAcid <: BioSymbol end
struct TwoBit <: BioSymbolEncoding end
primitive type DNA{E} <: NucleicAcid 8 end
abstract type BioSequence end # The container of compressed symbol encodings.
prefix(::DNA) = "DNA"
Base.bitstring(nt::DNA) = bitstring(reinterpret(UInt8, nt))
function Base.show(io::IO, symbol::BioSymbol)
write(io, prefix(symbol), '_', convert(Char, symbol)) #TODO: factor out char retrieval.
return nothing
end
function Base.print(io::IO, symbol::BioSymbol)
write(io, convert(Char, symbol))
return nothing
end
function createlookupvectors(table)
# Determine maximum required lengths.
max_encoding = maximum(last.(table))
max_char = codepoint(maximum(first.(table)))
# Setup lookup vectors. #TODO: instead of undef, what could represent an invalid value not in the set?
encoding_to_char = Vector{Char}(undef, length(0x0:max_encoding))
char_to_encoding = Vector{UInt8}(undef, length(0x0:max_char))
# Place information into vector.
for (char, encoding) in table
cp = codepoint(char)
encoding_to_char[encoding] = cp
char_to_encoding[cp] = encoding
end
return encoding_to_char, char_to_encoding
end
struct BumpedEncoding{b} end #TODO: not sure whether this is a good idea - attempting to be expressive.
BumpedEncoding(b) = BumpedEncoding{b}()
struct Lookup{B<:BumpedEncoding} #TODO: perhaps this is the Alphabet!?
encoding_to_char
char_to_encoding
end
function Lookup{BumpedEncoding{true}}(table)
table = [(char, encoding + 0x01) for (char, encoding) in table] #TODO: remove addition and subtraction by using pointer offset.
return Lookup{BumpedEncoding{true}}(createlookupvectors(table)...)
end
function Lookup{BumpedEncoding{false}}(table)definition_dna_4bit
return Lookup{BumpedEncoding{false}}(createlookupvectors(table)...)
end
function Lookup(table)
min_encoding = minimum(last.(table))
bumped = min_encoding == 0x0
return Lookup{BumpedEncoding{bumped}}(table)
end
"Retrieve symbol char."
function (l::Lookup{BumpedEncoding{false}})(c::Char)
return l.char_to_encoding[codepoint(c)]
end
"Retrieve symbol char."
function (l::Lookup{BumpedEncoding{true}})(c::Char)
return l.char_to_encoding[codepoint(c)] - 0x01 #TODO: remove addition and subtraction by using pointer offset.
end
"Retrieve symbol encoding."
function (l::Lookup{BumpedEncoding{false}})(c::UInt8)
return l.encoding_to_char[c]
end
"Retrieve symbol encoding."
function (l::Lookup{BumpedEncoding{true}})(c::UInt8)
return l.encoding_to_char[c + 0x01] #TODO: remove addition and subtraction by using pointer offset.
end
"DNA 4bit symbol definitions."
const definition_dna_4bit = [
('A', 0b0001),
('C', 0b0010),
('G', 0b0100),
('T', 0b1000),
]
const lookup_dna_4bit = Lookup(definition_dna_4bit)
@time lookup_dna_4bit('A')
@time lookup_dna_4bit(0x01)
"DNA 2bit symbol definitions."
const definition_dna_2bit = [
('A', 0b10),
('C', 0b01),
('G', 0b11),
('T', 0b00),
]
const lookup_dna_2bit = Lookup(definition_dna_2bit)
@time lookup_dna_2bit('A')
@time lookup_dna_2bit(0x02)
BitsPerElem(::Type{DNA{TwoBit}}) = 2 # Actual TwoBit.
BitsPerElem(::Type{DNA{4}}) = 4
table(::Type{DNA{TwoBit}}) = lookup_dna_2bit
table(::Type{DNA{4}}) = lookup_dna_4bit
function (s::Type{<:DNA})(bits::UInt8) #TODO: Perhaps this is where we goto the Alphabet and ask whether the encoding or char is in the alphabet's set of symbols.
return reinterpret(s, bits) #TODO: how do we know whether the bits are reasonable?
end
function (s::Type{<:DNA})(char::Char) #TODO: Perhaps this is where we goto the Alphabet and ask whether the encoding or char is in the alphabet's set of symbols.
bits = table(s)(uppercase(char)) # retrieving encoding.
return reinterpret(s, bits)
end
function Base.convert(char::Type{Char}, s::S) where S <: DNA
bits = reinterpret(UInt8, s)
return table(S)(bits)
end
function Base.convert(::Type{S}, char::Char) where S<:DNA
return reinterpret(S, table(S)(char))
end
bits = table(DNA{TwoBit})(uppercase('c'))
@time DNA{TwoBit}('c')
convert(DNA{TwoBit}, 'A')
convert(Char, DNA{TwoBit}('A'))
collect(DNA{TwoBit}, ['G', 'A', 'T', 'C'])
bits = table(DNA{4})(uppercase('c'))
@time DNA{4}('c')
convert(DNA{4}, 'A')
convert(Char, DNA{4}('A'))
collect(DNA{4}, ['G', 'A', 'T', 'C'])
"Default outward facing DNA symbols with 4bit encoding."
for (char, bits) in definition_dna_4bit
var = Symbol("DNA_", char)
@eval begin
const $(var) = reinterpret(DNA{4}, $(bits))
end
end
da = DNA_A
typeof(da)
reinterpret(UInt8, DNA_A)
reinterpret(UInt8, DNA_C)
reinterpret(UInt8, DNA_T)
reinterpret(UInt8, DNA_G)
function Base.convert(::Type{SA}, s::SB) where {SA <: DNA, SB <: DNA}
char_b = convert(Char, s) #Note: converting between encodings through common Char.
return convert(SA, char_b)
end
d2 = convert(DNA{TwoBit}, DNA_A)
d4 = convert(DNA{4}, DNA{TwoBit}('A'))
bitstring(d2)
bitstring(d4)
"""
Compress symbol encoding.
Retrieve important bits from encoded byte.
"""
function compress(nt::S) where S <: BioSymbol
bv = BitVector()
bv.len = BitsPerElem(S)
push!(bv.chunks, reinterpret(UInt8, nt))
return bv
end
compress(DNA_A)
compress(DNA{TwoBit}('A'))
"""
Illustrative container.
This would be LongSequence, NTupleKMer, etc.
"""
struct CompressedSymbols{S<:BioSymbol} <: BioSequence
bv::BitVector
function CompressedSymbols{S}(nts::Vector{S}) where S <: BioSymbol
return new(pack(CompressedSymbols{S}, nts::Vector{S})) # Note: this is where you could dispatch to different packers.
end
end
function CompressedSymbols(nts::Vector{S}) where S <: BioSymbol
return CompressedSymbols{S}(nts)
end
function CompressedSymbols{S}(nts) where S <: BioSymbol
return CompressedSymbols{S}(collect(S, nts))
end
"Quick container trait."
ElemOrder(::Type{CompressedSymbols}) = true # LeftToRight, RightToLeft.
"""
Pack symbols into a specified container type.
"""
function pack(::Type{CompressedSymbols{S}}, nts::Vector{S}) where S <: BioSymbol
#TODO: explore IOBuffer for writing chunks.
#TODO: explore the TanscodingStreams.jl paradigm. This packer/compressor looks like a codec.
N = BitsPerElem(S)
bv = BitVector(undef,length(nts) * N)
nts = compress.(nts)
if ElemOrder(CompressedSymbols) # TODO: dispatch to the best packing algorithm based on this trait.
reverse!.(nts) #Note: reverse makes bits easier to reconcile at the moment.
end
i = 1
for bits in nts
bv[i:i+N-1] .= bits
i = i + N
end
# return container(bv)
return bv
end
# A: 0b10
# C: 0b01
# G: 0b11
# T: 0b00
# A: 0b10
packed2 = CompressedSymbols(collect(DNA{TwoBit}, [ 'A', 'C', 'G', 'T', 'A']))
reshape(packed2.bv, 2, :)'
# A: 0b0001
# C: 0b0010
# G: 0b0100
# T: 0b1000
# A: 0b0001
packed4 = CompressedSymbols([DNA_A, DNA_C, DNA_G, DNA_T, DNA_A])
reshape(packed4.bv, 4, :)'
"""
Move compressed encoding back into a byte.
"""
function Base.UInt8(bv::BitVector)
length(bv) <= 8 || error()
p = Ptr{UInt8}(pointer(bv.chunks))
return unsafe_load(p) #TODO: not sure wether this is tracked for GC.
# return S(unsafe_load(p))
# p = Ptr{S}(pointer(bv.chunks))
# return unsafe_load(p) #TODO: not sure wether this is tracked for GC.
end
"""
Unpack compressed symbols.
"""
function unpack(container::CompressedSymbols{S}) where S <: BioSymbol
n = BitsPerElem(S)
L = length(container.bv)
bvs = [container.bv[lo:hi] for (lo, hi) in zip(1:n:L, n:n:L)]
if ElemOrder(CompressedSymbols) # TODO: dispatch to the best packing algorithm based on this trait.
reverse!.(bvs) #Note: reverse makes bits easier to reconcile at the moment.
end
ints = UInt8.(bvs)
return S.(ints) #Note: going through the symbol constructor checks -- options here. TODO: use Alphabet to check whether UInts are in the Alphabet's set of Symbols.
end
unpack(packed2)
unpack(packed4)
packed = CompressedSymbols{DNA{TwoBit}}(DNA{4}.(['A','C','G','T','A']))
unpack(packed)
@time packed = CompressedSymbols{DNA{TwoBit}}("ACGTA")
unpack(packed)
@time packed = CompressedSymbols{DNA{4}}("ACGTA")
unpack(packed)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment