CiaranOMara/primitive_alphabet.jl

## primitive_alphabet.jl
#=
The purpose of this gist is to explore what a high-level interface or workflow might look like with `BioSymbols` as parametric primitive types.
For ease of conceptual development of a potential high-level interface, I have used `BitVector`s at the middle-level.
I have made no attempt in this example at low-level optimisations.
Operations performed with the `BitVector` would be replaced by optimised bit shiting procedures.
Ideally, the packing and unpacking methods would be refactored into iterators.
=#

abstract type BioSymbolEncoding end

abstract type BioSymbol end

abstract type NucleicAcid <: BioSymbol end

struct TwoBit <: BioSymbolEncoding end

primitive type DNA{E} <: NucleicAcid 8 end

abstract type BioSequence end # The container of compressed symbol encodings.

prefix(::DNA) = "DNA"

Base.bitstring(nt::DNA) = bitstring(reinterpret(UInt8, nt))

function Base.show(io::IO, symbol::BioSymbol)
    write(io, prefix(symbol), '_', convert(Char, symbol)) #TODO: factor out char retrieval.
    return nothing
end

function Base.print(io::IO, symbol::BioSymbol)
    write(io, convert(Char, symbol))
    return nothing
end

function createlookupvectors(table)

    # Determine maximum required lengths.
    max_encoding = maximum(last.(table))
    max_char = codepoint(maximum(first.(table)))

    # Setup lookup vectors. #TODO: instead of undef, what could represent an invalid value not in the set?
    encoding_to_char = Vector{Char}(undef, length(0x0:max_encoding))
    char_to_encoding = Vector{UInt8}(undef, length(0x0:max_char))

    # Place information into vector.
    for (char, encoding) in table
        cp = codepoint(char)
        encoding_to_char[encoding] = cp
        char_to_encoding[cp] = encoding
    end

    return encoding_to_char, char_to_encoding
end

struct BumpedEncoding{b} end #TODO: not sure whether this is a good idea - attempting to be expressive.
BumpedEncoding(b) = BumpedEncoding{b}()

struct Lookup{B<:BumpedEncoding} #TODO: perhaps this is the Alphabet!?
    encoding_to_char
    char_to_encoding
end

function Lookup{BumpedEncoding{true}}(table)
    table = [(char, encoding + 0x01) for (char, encoding) in table] #TODO: remove addition and subtraction by using pointer offset.
    return Lookup{BumpedEncoding{true}}(createlookupvectors(table)...)
end

function Lookup{BumpedEncoding{false}}(table)definition_dna_4bit
    return Lookup{BumpedEncoding{false}}(createlookupvectors(table)...)
end

function Lookup(table)
    min_encoding = minimum(last.(table))
    bumped = min_encoding == 0x0
    return Lookup{BumpedEncoding{bumped}}(table)
end

"Retrieve symbol char."
function (l::Lookup{BumpedEncoding{false}})(c::Char)
    return l.char_to_encoding[codepoint(c)]
end

"Retrieve symbol char."
function (l::Lookup{BumpedEncoding{true}})(c::Char)
    return l.char_to_encoding[codepoint(c)] - 0x01 #TODO: remove addition and subtraction by using pointer offset.
end

"Retrieve symbol encoding."
function (l::Lookup{BumpedEncoding{false}})(c::UInt8)
    return l.encoding_to_char[c]
end

"Retrieve symbol encoding."
function (l::Lookup{BumpedEncoding{true}})(c::UInt8)
    return l.encoding_to_char[c + 0x01] #TODO: remove addition and subtraction by using pointer offset.
end

"DNA 4bit symbol definitions."
const definition_dna_4bit = [
    ('A', 0b0001),
    ('C', 0b0010),
    ('G', 0b0100),
    ('T', 0b1000),
]

const lookup_dna_4bit = Lookup(definition_dna_4bit)

@time lookup_dna_4bit('A')
@time lookup_dna_4bit(0x01)

"DNA 2bit symbol definitions."
const definition_dna_2bit = [
    ('A', 0b10),
    ('C', 0b01),
    ('G', 0b11),
    ('T', 0b00),
]

const lookup_dna_2bit = Lookup(definition_dna_2bit)

@time lookup_dna_2bit('A')
@time lookup_dna_2bit(0x02)

BitsPerElem(::Type{DNA{TwoBit}}) = 2 # Actual TwoBit.
BitsPerElem(::Type{DNA{4}}) = 4

table(::Type{DNA{TwoBit}}) = lookup_dna_2bit
table(::Type{DNA{4}}) = lookup_dna_4bit

function (s::Type{<:DNA})(bits::UInt8) #TODO: Perhaps this is where we goto the Alphabet and ask whether the encoding or char is in the alphabet's set of symbols.
    return reinterpret(s, bits) #TODO: how do we know whether the bits are reasonable?
end

function (s::Type{<:DNA})(char::Char) #TODO: Perhaps this is where we goto the Alphabet and ask whether the encoding or char is in the alphabet's set of symbols.
    bits = table(s)(uppercase(char)) # retrieving encoding.
    return reinterpret(s, bits)
end

function Base.convert(char::Type{Char}, s::S) where S <: DNA
    bits = reinterpret(UInt8, s)
    return table(S)(bits)
end

function Base.convert(::Type{S}, char::Char) where S<:DNA
    return reinterpret(S, table(S)(char))
end

bits = table(DNA{TwoBit})(uppercase('c'))
@time DNA{TwoBit}('c')

convert(DNA{TwoBit}, 'A')

convert(Char, DNA{TwoBit}('A'))

collect(DNA{TwoBit}, ['G', 'A', 'T', 'C'])

bits = table(DNA{4})(uppercase('c'))
@time DNA{4}('c')

convert(DNA{4}, 'A')

convert(Char, DNA{4}('A'))

collect(DNA{4}, ['G', 'A', 'T', 'C'])


"Default outward facing DNA symbols with 4bit encoding."
for (char, bits) in definition_dna_4bit
    var = Symbol("DNA_", char)
    @eval begin
        const $(var) = reinterpret(DNA{4}, $(bits))
    end
end

da = DNA_A
typeof(da)

reinterpret(UInt8, DNA_A)
reinterpret(UInt8, DNA_C)
reinterpret(UInt8, DNA_T)
reinterpret(UInt8, DNA_G)

function Base.convert(::Type{SA}, s::SB) where {SA <: DNA, SB <: DNA}
    char_b = convert(Char, s) #Note: converting between encodings through common Char.
    return convert(SA, char_b)
end


d2 = convert(DNA{TwoBit}, DNA_A)
d4 = convert(DNA{4}, DNA{TwoBit}('A'))

bitstring(d2)
bitstring(d4)


"""
Compress symbol encoding.
Retrieve important bits from encoded byte.
"""
function compress(nt::S)  where S <: BioSymbol

    bv = BitVector()
    bv.len = BitsPerElem(S)
    push!(bv.chunks, reinterpret(UInt8, nt))

    return bv
end

compress(DNA_A)
compress(DNA{TwoBit}('A'))


"""
Illustrative container.
This would be LongSequence, NTupleKMer, etc.
"""
struct CompressedSymbols{S<:BioSymbol} <: BioSequence
    bv::BitVector
    function CompressedSymbols{S}(nts::Vector{S}) where S <: BioSymbol
        return new(pack(CompressedSymbols{S}, nts::Vector{S})) # Note: this is where you could dispatch to different packers.
    end
end

function CompressedSymbols(nts::Vector{S}) where S <: BioSymbol
    return CompressedSymbols{S}(nts)
end

function CompressedSymbols{S}(nts) where S <: BioSymbol
    return CompressedSymbols{S}(collect(S, nts))
end

"Quick container trait."
ElemOrder(::Type{CompressedSymbols}) = true # LeftToRight, RightToLeft.

"""
Pack symbols into a specified container type.
"""
function pack(::Type{CompressedSymbols{S}}, nts::Vector{S}) where S <: BioSymbol

    #TODO: explore IOBuffer for writing chunks.
    #TODO: explore the TanscodingStreams.jl paradigm. This packer/compressor looks like a codec.

    N = BitsPerElem(S)

    bv = BitVector(undef,length(nts) * N)

    nts = compress.(nts)

    if ElemOrder(CompressedSymbols) # TODO: dispatch to the best packing algorithm based on this trait.
        reverse!.(nts) #Note: reverse makes bits easier to reconcile at the moment.
    end

    i = 1
    for bits in nts
        bv[i:i+N-1] .= bits
        i = i + N
    end

    # return container(bv)
    return bv
end


# A: 0b10
# C: 0b01
# G: 0b11
# T: 0b00
# A: 0b10
packed2 = CompressedSymbols(collect(DNA{TwoBit}, [ 'A', 'C', 'G', 'T', 'A']))
reshape(packed2.bv, 2, :)'


# A: 0b0001
# C: 0b0010
# G: 0b0100
# T: 0b1000
# A: 0b0001
packed4 = CompressedSymbols([DNA_A, DNA_C, DNA_G, DNA_T, DNA_A])
reshape(packed4.bv, 4, :)'

"""
Move compressed encoding back into a byte.
"""
function Base.UInt8(bv::BitVector)

    length(bv) <= 8 || error()

    p = Ptr{UInt8}(pointer(bv.chunks))
    return unsafe_load(p) #TODO: not sure wether this is tracked for GC.

    # return S(unsafe_load(p))

    # p = Ptr{S}(pointer(bv.chunks))
    # return unsafe_load(p) #TODO: not sure wether this is tracked for GC.

end

"""
Unpack compressed symbols.
"""
function unpack(container::CompressedSymbols{S}) where S <: BioSymbol

    n = BitsPerElem(S)
    L = length(container.bv)

    bvs = [container.bv[lo:hi] for (lo, hi) in zip(1:n:L, n:n:L)]

    if ElemOrder(CompressedSymbols) # TODO: dispatch to the best packing algorithm based on this trait.
        reverse!.(bvs) #Note: reverse makes bits easier to reconcile at the moment.
    end

    ints = UInt8.(bvs)

    return S.(ints) #Note: going through the symbol constructor checks -- options here. TODO: use Alphabet to check whether UInts are in the Alphabet's set of Symbols.
end

unpack(packed2)

unpack(packed4)

packed = CompressedSymbols{DNA{TwoBit}}(DNA{4}.(['A','C','G','T','A']))
unpack(packed)

@time packed = CompressedSymbols{DNA{TwoBit}}("ACGTA")
unpack(packed)

@time packed = CompressedSymbols{DNA{4}}("ACGTA")
unpack(packed)
	#=
	The purpose of this gist is to explore what a high-level interface or workflow might look like with `BioSymbols` as parametric primitive types.
	For ease of conceptual development of a potential high-level interface, I have used `BitVector`s at the middle-level.
	I have made no attempt in this example at low-level optimisations.
	Operations performed with the `BitVector` would be replaced by optimised bit shiting procedures.
	Ideally, the packing and unpacking methods would be refactored into iterators.
	=#

	abstract type BioSymbolEncoding end

	abstract type BioSymbol end

	abstract type NucleicAcid <: BioSymbol end

	struct TwoBit <: BioSymbolEncoding end

	primitive type DNA{E} <: NucleicAcid 8 end

	abstract type BioSequence end # The container of compressed symbol encodings.

	prefix(::DNA) = "DNA"

	Base.bitstring(nt::DNA) = bitstring(reinterpret(UInt8, nt))

	function Base.show(io::IO, symbol::BioSymbol)
	write(io, prefix(symbol), '_', convert(Char, symbol)) #TODO: factor out char retrieval.
	return nothing
	end

	function Base.print(io::IO, symbol::BioSymbol)
	write(io, convert(Char, symbol))
	return nothing
	end

	function createlookupvectors(table)

	# Determine maximum required lengths.
	max_encoding = maximum(last.(table))
	max_char = codepoint(maximum(first.(table)))

	# Setup lookup vectors. #TODO: instead of undef, what could represent an invalid value not in the set?
	encoding_to_char = Vector{Char}(undef, length(0x0:max_encoding))
	char_to_encoding = Vector{UInt8}(undef, length(0x0:max_char))

	# Place information into vector.
	for (char, encoding) in table
	cp = codepoint(char)
	encoding_to_char[encoding] = cp
	char_to_encoding[cp] = encoding
	end

	return encoding_to_char, char_to_encoding
	end

	struct BumpedEncoding{b} end #TODO: not sure whether this is a good idea - attempting to be expressive.
	BumpedEncoding(b) = BumpedEncoding{b}()

	struct Lookup{B<:BumpedEncoding} #TODO: perhaps this is the Alphabet!?
	encoding_to_char
	char_to_encoding
	end

	function Lookup{BumpedEncoding{true}}(table)
	table = [(char, encoding + 0x01) for (char, encoding) in table] #TODO: remove addition and subtraction by using pointer offset.
	return Lookup{BumpedEncoding{true}}(createlookupvectors(table)...)
	end

	function Lookup{BumpedEncoding{false}}(table)definition_dna_4bit
	return Lookup{BumpedEncoding{false}}(createlookupvectors(table)...)
	end

	function Lookup(table)
	min_encoding = minimum(last.(table))
	bumped = min_encoding == 0x0
	return Lookup{BumpedEncoding{bumped}}(table)
	end

	"Retrieve symbol char."
	function (l::Lookup{BumpedEncoding{false}})(c::Char)
	return l.char_to_encoding[codepoint(c)]
	end

	"Retrieve symbol char."
	function (l::Lookup{BumpedEncoding{true}})(c::Char)
	return l.char_to_encoding[codepoint(c)] - 0x01 #TODO: remove addition and subtraction by using pointer offset.
	end

	"Retrieve symbol encoding."
	function (l::Lookup{BumpedEncoding{false}})(c::UInt8)
	return l.encoding_to_char[c]
	end

	"Retrieve symbol encoding."
	function (l::Lookup{BumpedEncoding{true}})(c::UInt8)
	return l.encoding_to_char[c + 0x01] #TODO: remove addition and subtraction by using pointer offset.
	end

	"DNA 4bit symbol definitions."
	const definition_dna_4bit = [
	('A', 0b0001),
	('C', 0b0010),
	('G', 0b0100),
	('T', 0b1000),
	]

	const lookup_dna_4bit = Lookup(definition_dna_4bit)

	@time lookup_dna_4bit('A')
	@time lookup_dna_4bit(0x01)

	"DNA 2bit symbol definitions."
	const definition_dna_2bit = [
	('A', 0b10),
	('C', 0b01),
	('G', 0b11),
	('T', 0b00),
	]

	const lookup_dna_2bit = Lookup(definition_dna_2bit)

	@time lookup_dna_2bit('A')
	@time lookup_dna_2bit(0x02)

	BitsPerElem(::Type{DNA{TwoBit}}) = 2 # Actual TwoBit.
	BitsPerElem(::Type{DNA{4}}) = 4

	table(::Type{DNA{TwoBit}}) = lookup_dna_2bit
	table(::Type{DNA{4}}) = lookup_dna_4bit

	function (s::Type{<:DNA})(bits::UInt8) #TODO: Perhaps this is where we goto the Alphabet and ask whether the encoding or char is in the alphabet's set of symbols.
	return reinterpret(s, bits) #TODO: how do we know whether the bits are reasonable?
	end

	function (s::Type{<:DNA})(char::Char) #TODO: Perhaps this is where we goto the Alphabet and ask whether the encoding or char is in the alphabet's set of symbols.
	bits = table(s)(uppercase(char)) # retrieving encoding.
	return reinterpret(s, bits)
	end

	function Base.convert(char::Type{Char}, s::S) where S <: DNA
	bits = reinterpret(UInt8, s)
	return table(S)(bits)
	end

	function Base.convert(::Type{S}, char::Char) where S<:DNA
	return reinterpret(S, table(S)(char))
	end

	bits = table(DNA{TwoBit})(uppercase('c'))
	@time DNA{TwoBit}('c')

	convert(DNA{TwoBit}, 'A')

	convert(Char, DNA{TwoBit}('A'))

	collect(DNA{TwoBit}, ['G', 'A', 'T', 'C'])

	bits = table(DNA{4})(uppercase('c'))
	@time DNA{4}('c')

	convert(DNA{4}, 'A')

	convert(Char, DNA{4}('A'))

	collect(DNA{4}, ['G', 'A', 'T', 'C'])


	"Default outward facing DNA symbols with 4bit encoding."
	for (char, bits) in definition_dna_4bit
	var = Symbol("DNA_", char)
	@eval begin
	const $(var) = reinterpret(DNA{4}, $(bits))
	end
	end

	da = DNA_A
	typeof(da)

	reinterpret(UInt8, DNA_A)
	reinterpret(UInt8, DNA_C)
	reinterpret(UInt8, DNA_T)
	reinterpret(UInt8, DNA_G)

	function Base.convert(::Type{SA}, s::SB) where {SA <: DNA, SB <: DNA}
	char_b = convert(Char, s) #Note: converting between encodings through common Char.
	return convert(SA, char_b)
	end


	d2 = convert(DNA{TwoBit}, DNA_A)
	d4 = convert(DNA{4}, DNA{TwoBit}('A'))

	bitstring(d2)
	bitstring(d4)


	"""
	Compress symbol encoding.
	Retrieve important bits from encoded byte.
	"""
	function compress(nt::S) where S <: BioSymbol

	bv = BitVector()
	bv.len = BitsPerElem(S)
	push!(bv.chunks, reinterpret(UInt8, nt))

	return bv
	end

	compress(DNA_A)
	compress(DNA{TwoBit}('A'))


	"""
	Illustrative container.
	This would be LongSequence, NTupleKMer, etc.
	"""
	struct CompressedSymbols{S<:BioSymbol} <: BioSequence
	bv::BitVector
	function CompressedSymbols{S}(nts::Vector{S}) where S <: BioSymbol
	return new(pack(CompressedSymbols{S}, nts::Vector{S})) # Note: this is where you could dispatch to different packers.
	end
	end

	function CompressedSymbols(nts::Vector{S}) where S <: BioSymbol
	return CompressedSymbols{S}(nts)
	end

	function CompressedSymbols{S}(nts) where S <: BioSymbol
	return CompressedSymbols{S}(collect(S, nts))
	end

	"Quick container trait."
	ElemOrder(::Type{CompressedSymbols}) = true # LeftToRight, RightToLeft.

	"""
	Pack symbols into a specified container type.
	"""
	function pack(::Type{CompressedSymbols{S}}, nts::Vector{S}) where S <: BioSymbol

	#TODO: explore IOBuffer for writing chunks.
	#TODO: explore the TanscodingStreams.jl paradigm. This packer/compressor looks like a codec.

	N = BitsPerElem(S)

	bv = BitVector(undef,length(nts) * N)

	nts = compress.(nts)

	if ElemOrder(CompressedSymbols) # TODO: dispatch to the best packing algorithm based on this trait.
	reverse!.(nts) #Note: reverse makes bits easier to reconcile at the moment.
	end

	i = 1
	for bits in nts
	bv[i:i+N-1] .= bits
	i = i + N
	end

	# return container(bv)
	return bv
	end


	# A: 0b10
	# C: 0b01
	# G: 0b11
	# T: 0b00
	# A: 0b10
	packed2 = CompressedSymbols(collect(DNA{TwoBit}, [ 'A', 'C', 'G', 'T', 'A']))
	reshape(packed2.bv, 2, :)'


	# A: 0b0001
	# C: 0b0010
	# G: 0b0100
	# T: 0b1000
	# A: 0b0001
	packed4 = CompressedSymbols([DNA_A, DNA_C, DNA_G, DNA_T, DNA_A])
	reshape(packed4.bv, 4, :)'

	"""
	Move compressed encoding back into a byte.
	"""
	function Base.UInt8(bv::BitVector)

	length(bv) <= 8 \|\| error()

	p = Ptr{UInt8}(pointer(bv.chunks))
	return unsafe_load(p) #TODO: not sure wether this is tracked for GC.

	# return S(unsafe_load(p))

	# p = Ptr{S}(pointer(bv.chunks))
	# return unsafe_load(p) #TODO: not sure wether this is tracked for GC.

	end

	"""
	Unpack compressed symbols.
	"""
	function unpack(container::CompressedSymbols{S}) where S <: BioSymbol

	n = BitsPerElem(S)
	L = length(container.bv)

	bvs = [container.bv[lo:hi] for (lo, hi) in zip(1:n:L, n:n:L)]

	if ElemOrder(CompressedSymbols) # TODO: dispatch to the best packing algorithm based on this trait.
	reverse!.(bvs) #Note: reverse makes bits easier to reconcile at the moment.
	end

	ints = UInt8.(bvs)

	return S.(ints) #Note: going through the symbol constructor checks -- options here. TODO: use Alphabet to check whether UInts are in the Alphabet's set of Symbols.
	end

	unpack(packed2)

	unpack(packed4)

	packed = CompressedSymbols{DNA{TwoBit}}(DNA{4}.(['A','C','G','T','A']))
	unpack(packed)

	@time packed = CompressedSymbols{DNA{TwoBit}}("ACGTA")
	unpack(packed)

	@time packed = CompressedSymbols{DNA{4}}("ACGTA")
	unpack(packed)