Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save CiaranOMara/79c1889262c3d776207acb8c37c84e8b to your computer and use it in GitHub Desktop.
Save CiaranOMara/79c1889262c3d776207acb8c37c84e8b to your computer and use it in GitHub Desktop.
#=
The purpose of this gist is to explore what a high-level interface or workflow might look like with `BioSymbols` as parametric primitive types.
For ease of conceptual development of a potential high-level interface, I have used `BitVector`s at the middle-level.
I have made no attempt in this example at low-level optimisations.
Operations performed with the `BitVector` would be replaced by optimised bit shiting procedures.
Ideally, the packing and unpacking methods would be refactored into iterators.
=#
abstract type BioSymbolEncoding end
abstract type BioSymbol end
abstract type NucleicAcid <: BioSymbol end
struct TwoBit <: BioSymbolEncoding end
primitive type DNA{E} <: NucleicAcid 8 end
abstract type BioSequence end # The container of compressed symbol encodings.
prefix(::DNA) = "DNA"
Base.bitstring(nt::DNA) = bitstring(reinterpret(UInt8, nt))
function Base.show(io::IO, symbol::S) where S<: BioSymbol
write(io, prefix(symbol), '_', lookup_char(alphabet(S), reinterpret(UInt8, symbol)))
return nothing
end
function Base.print(io::IO, symbol::BioSymbol)
write(io, lookup_char(alphabet(S), reinterpret(UInt8, symbol)))
return nothing
end
function createlookupvectors(definitions; empty_char = typemax(Char), empty_encoding = typemax(UInt8))
# Determine maximum required lengths.
max_encoding = maximum(last.(definitions)) + 1 #Note: adjusting for offset pointer access.
max_char = codepoint(maximum(first.(definitions)))
# Setup lookup vectors.
encoding_to_char = fill(empty_char, length(0b0:max_encoding)) #Note: assuming typemax sufficiently out of bounds.
char_to_encoding = fill(empty_encoding, length(0b0:max_char)) #Note: assuming typemax sufficiently out of bounds.
# Place information into vector.
for (char, encoding) in definitions
cp = codepoint(char)
encoding_to_char[encoding + 1] = cp # TODO: is there a more efficient method of loading via pointer offest?
char_to_encoding[cp] = encoding
# p_encoding_to_char = Ptr{Char}(pointer(encoding_to_char) + encoding * 8) #TODO: can we assume contiguous addressing here?
# unsafe_store!(p_encoding_to_char, cp)
end
return encoding_to_char, char_to_encoding, empty_char, empty_encoding
end
struct Alphabet
encoding_to_char # Note: larger at the top.
char_to_encoding
empty_char
empty_encoding
function Alphabet(table)
return new(createlookupvectors(table)...)
end
end
function _encoding(a::Alphabet, c::Char)
encoding = a.char_to_encoding[codepoint(c)]
return encoding
end
function _char(a::Alphabet, encoding::UInt8)
encoding = a.encoding_to_char[encoding + 1]
# p = Ptr{Char}(pointer(a.encoding_to_char) + encoding * 8) #TODO: can we assume contiguous addressing here?
# return unsafe_load(p) #TODO: not sure whether this is tracked for GC.
end
function has_encoding(a::Alphabet, bits::UInt8)
char = lookup_char(a, bits)
resolved = lookup_encoding(a, char)
return bits == resolved # Note: Determines whether we're back where we started.
end
function has_char(a::Alphabet, char::Char)
encoding = lookup_encoding(a, char)
resolved = lookup_char(a, encoding)
return char == resolved # Note: Determines whether we're back where we started.
end
"Retrieve symbol char."
function lookup_encoding(a::Alphabet, c::Char)
c = uppercase(c) #TODO: how safe do we want to be? What about having a longer lookup vector?
encoding = _encoding(a, c)# function _char(a::Alphabet{BumpedEncoding{false}}, encoding::UInt8)
encoding == a.empty_encoding && error("Not in the Alphabet's set of symbols.")
return encoding
end
"Retrieve symbol encoding."
function lookup_char(a::Alphabet, encoding::UInt8)
char = _char(a, encoding)
char == a.empty_char && error("Not in the Alphabet's set of symbols.")
return char
end
"DNA 4bit symbol definitions."
const definitions_dna_4bit = [
('A', 0b0001),
('C', 0b0010),
('G', 0b0100),
('T', 0b1000),
]
const alphabet_dna_4bit = Alphabet(definitions_dna_4bit)
lookup_encoding(alphabet_dna_4bit, 'A')
lookup_encoding(alphabet_dna_4bit, 'C')
lookup_encoding(alphabet_dna_4bit, 'G')
lookup_encoding(alphabet_dna_4bit, 'T')
lookup_char(alphabet_dna_4bit, 0b0001)
lookup_char(alphabet_dna_4bit, 0b0010)
lookup_char(alphabet_dna_4bit, 0b0100)
lookup_char(alphabet_dna_4bit, 0b1000)
"DNA 2bit symbol definitions."
const definitions_dna_2bit = [
('A', 0b10),
('C', 0b01),
('G', 0b11),
('T', 0b00),
]
const alphabet_dna_2bit = Alphabet(definitions_dna_2bit)
lookup_encoding(alphabet_dna_2bit, 'A')
lookup_encoding(alphabet_dna_2bit, 'C')
lookup_encoding(alphabet_dna_2bit, 'G')
lookup_encoding(alphabet_dna_2bit, 'T')
lookup_char(alphabet_dna_2bit, 0b10)
lookup_char(alphabet_dna_2bit, 0b01)
lookup_char(alphabet_dna_2bit, 0b11)
lookup_char(alphabet_dna_2bit, 0b00)
BitsPerElem(::Type{DNA{TwoBit}}) = 2 # Actual TwoBit.
BitsPerElem(::Type{DNA{4}}) = 4
alphabet(::Type{DNA{TwoBit}}) = alphabet_dna_2bit
alphabet(::Type{DNA{4}}) = alphabet_dna_4bit
function (::Type{S})(bits::UInt8) where S <:BioSymbol
has_encoding(alphabet(S), bits) || error("Not in the Alphabet's set of symbols.")
return reinterpret(S, bits)
end
function (::Type{S})(char::Char) where S <:BioSymbol
encoding = lookup_encoding(alphabet(S), char) # retrieving encoding, should throw error if char is not in the alphabet's set of symbols.
return reinterpret(S, encoding)
end
function Base.convert(char::Type{Char}, symbol::S) where S <: BioSymbol
encoding = reinterpret(UInt8, symbol)
return lookup_char(alphabet(S), encoding)
end
function Base.convert(::Type{S}, char::Char) where S<:DNA #TODO: the body of this convert is the same as the constructor.
encoding = lookup_encoding(alphabet(S), char)
return reinterpret(S, encoding)
end
encoding = lookup_encoding(alphabet(DNA{TwoBit}), 'C')
encoding = lookup_encoding(alphabet(DNA{TwoBit}), 'c')
DNA{TwoBit}('c')
convert(DNA{TwoBit}, 'A')
convert(Char, DNA{TwoBit}('A'))
collect(DNA{TwoBit}, ['G', 'A', 'T', 'C'])
encoding = lookup_encoding(alphabet(DNA{4}), 'c')
DNA{4}('c')
convert(DNA{4}, 'A')
convert(Char, DNA{4}('A'))
collect(DNA{4}, ['G', 'A', 'T', 'C'])
"Default outward facing DNA symbols with 4bit encoding."
for (char, encoding) in definitions_dna_4bit
var = Symbol("DNA_", char)
@eval begin
const $(var) = reinterpret(DNA{4}, $(encoding))
end
end
da = DNA_A
typeof(da)
reinterpret(UInt8, DNA_A)
reinterpret(UInt8, DNA_C)
reinterpret(UInt8, DNA_T)
reinterpret(UInt8, DNA_G)
function Base.convert(::Type{SA}, s::SB) where {SA <: BioSymbol, SB <: BioSymbol}
char_b = convert(Char, s) #Note: converting between encodings through common Char.
return convert(SA, char_b)
end
d2 = convert(DNA{TwoBit}, DNA_A)
d4 = convert(DNA{4}, DNA{TwoBit}('A'))
bitstring(d2)
bitstring(d4)
"""
Compress symbol encoding.
Retrieve important bits from encoded byte.
"""
function compress(nt::S) where S <: BioSymbol
bv = BitVector()
bv.len = BitsPerElem(S)
push!(bv.chunks, reinterpret(UInt8, nt))
return bv
end
compress(DNA_A)
compress(DNA{TwoBit}('A'))
"""
Illustrative container.
This would be LongSequence, NTupleKMer, etc.
"""
struct CompressedSymbols{S<:BioSymbol} <: BioSequence
bv::BitVector
function CompressedSymbols{S}(nts::Vector{S}) where S <: BioSymbol
return new(pack(CompressedSymbols{S}, nts::Vector{S})) # Note: this is where you could dispatch to different packers.
end
end
function CompressedSymbols(nts::Vector{S}) where S <: BioSymbol
return CompressedSymbols{S}(nts)
end
function CompressedSymbols{S}(nts) where S <: BioSymbol
return CompressedSymbols{S}(collect(S, nts))
end
"Quick container trait."
ElemOrder(::Type{CompressedSymbols}) = true # LeftToRight, RightToLeft.
"""
Pack symbols into a specified container type.
"""
function pack(::Type{CompressedSymbols{S}}, nts::Vector{S}) where S <: BioSymbol
#TODO: explore IOBuffer for writing chunks.
#TODO: explore the TanscodingStreams.jl paradigm. This packer/compressor looks like a codec.
N = BitsPerElem(S)
bv = BitVector(undef,length(nts) * N)
nts = compress.(nts)
if ElemOrder(CompressedSymbols) # TODO: dispatch to the best packing algorithm based on this trait.
reverse!.(nts) #Note: reverse makes bits easier to reconcile at the moment.
end
i = 1
for bits in nts
bv[i:i+N-1] .= bits
i = i + N
end
return bv
end
# A: 0b10
# C: 0b01
# G: 0b11
# T: 0b00
# A: 0b10
packed2 = CompressedSymbols(collect(DNA{TwoBit}, [ 'A', 'C', 'G', 'T', 'A']))
reshape(packed2.bv, 2, :)'
# A: 0b0001
# C: 0b0010
# G: 0b0100
# T: 0b1000
# A: 0b0001
packed4 = CompressedSymbols([DNA_A, DNA_C, DNA_G, DNA_T, DNA_A])
reshape(packed4.bv, 4, :)'
"""
Move compressed intsencoding back into a byte.
"""
function Base.UInt8(bv::BitVector)
length(bv) <= 8 || error()
p = Ptr{UInt8}(pointer(bv.chunks))
return unsafe_load(p) #TODO: not sure whether the returned value is tracked for GC.
end
"""
Unpack compressed symbols.
"""
function unpack(container::CompressedSymbols{S}) where S <: BioSymbol
n = BitsPerElem(S)
L = length(container.bv)
bvs = [container.bv[lo:hi] for (lo, hi) in zip(1:n:L, n:n:L)]
if ElemOrder(CompressedSymbols) # TODO: dispatch to the best packing algorithm based on this trait.
reverse!.(bvs) #Note: reverse makes bits easier to reconcile at the moment.
end
ints = UInt8.(bvs) #TODO: move into a constructor?
return S.(ints) #Note: going through the symbol constructor, which checks whther the encoding is in the Alphabet's set of Symbols.
end
@time unpack(packed2)
@time unpack(packed4)
@time packed = CompressedSymbols{DNA{TwoBit}}(DNA{4}.(['A','C','G','T','A']))
@time unpack(packed)
@time packed = CompressedSymbols{DNA{TwoBit}}("ACGTA")
@time unpack(packed)
@time packed = CompressedSymbols{DNA{4}}("ACGTA")
@time unpack(packed)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment