CiaranOMara/parametric_symbols_and_alphabet_sets.jl

## parametric_symbols_and_alphabet_sets.jl
#=
The purpose of this gist is to explore what a high-level interface or workflow might look like with `BioSymbols` as parametric primitive types.
For ease of conceptual development of a potential high-level interface, I have used `BitVector`s at the middle-level.
I have made no attempt in this example at low-level optimisations.
Operations performed with the `BitVector` would be replaced by optimised bit shiting procedures.
Ideally, the packing and unpacking methods would be refactored into iterators.
=#

abstract type BioSymbolEncoding end

abstract type BioSymbol end

abstract type NucleicAcid <: BioSymbol end

struct TwoBit <: BioSymbolEncoding end

primitive type DNA{E} <: NucleicAcid 8 end

abstract type BioSequence end # The container of compressed symbol encodings.

prefix(::DNA) = "DNA"

Base.bitstring(nt::DNA) = bitstring(reinterpret(UInt8, nt))

function Base.show(io::IO, symbol::S) where S<: BioSymbol
    write(io, prefix(symbol), '_', lookup_char(alphabet(S), reinterpret(UInt8, symbol)))
    return nothing
end

function Base.print(io::IO, symbol::BioSymbol)
    write(io, lookup_char(alphabet(S), reinterpret(UInt8, symbol)))
    return nothing
end

function createlookupvectors(definitions; empty_char = typemax(Char), empty_encoding = typemax(UInt8))

    # Determine maximum required lengths.
    max_encoding = maximum(last.(definitions)) + 1 #Note: adjusting for offset pointer access.
    max_char = codepoint(maximum(first.(definitions)))

    # Setup lookup vectors.
    encoding_to_char = fill(empty_char, length(0b0:max_encoding)) #Note: assuming typemax sufficiently out of bounds.
    char_to_encoding = fill(empty_encoding, length(0b0:max_char)) #Note: assuming typemax sufficiently out of bounds.

    # Place information into vector.
    for (char, encoding) in definitions
        cp = codepoint(char)
        encoding_to_char[encoding + 1] = cp # TODO: is there a more efficient method of loading via pointer offest?
        char_to_encoding[cp] = encoding

        # p_encoding_to_char = Ptr{Char}(pointer(encoding_to_char) + encoding * 8) #TODO: can we assume contiguous addressing here?
        # unsafe_store!(p_encoding_to_char, cp)
    end

    return encoding_to_char, char_to_encoding, empty_char, empty_encoding
end

struct Alphabet
    encoding_to_char # Note: larger at the top.
    char_to_encoding
    empty_char
    empty_encoding

    function Alphabet(table)
        return new(createlookupvectors(table)...)
    end
end

function _encoding(a::Alphabet, c::Char)
    encoding = a.char_to_encoding[codepoint(c)]
    return encoding
end

function _char(a::Alphabet, encoding::UInt8)
    encoding = a.encoding_to_char[encoding + 1]
    # p = Ptr{Char}(pointer(a.encoding_to_char) + encoding * 8) #TODO: can we assume contiguous addressing here?
    # return unsafe_load(p) #TODO: not sure whether this is tracked for GC.
end

function has_encoding(a::Alphabet, bits::UInt8)
    char = lookup_char(a, bits)
    resolved = lookup_encoding(a, char)
    return bits == resolved # Note: Determines whether we're back where we started.
end

function has_char(a::Alphabet, char::Char)
    encoding = lookup_encoding(a, char)
    resolved = lookup_char(a, encoding)
    return char == resolved # Note: Determines whether we're back where we started.
end

"Retrieve symbol char."
function lookup_encoding(a::Alphabet, c::Char)
    c = uppercase(c) #TODO: how safe do we want to be? What about having a longer lookup vector?
    encoding = _encoding(a, c)# function _char(a::Alphabet{BumpedEncoding{false}}, encoding::UInt8)
    encoding == a.empty_encoding && error("Not in the Alphabet's set of symbols.")
    return encoding
end

"Retrieve symbol encoding."
function lookup_char(a::Alphabet, encoding::UInt8)
    char = _char(a, encoding)
    char == a.empty_char && error("Not in the Alphabet's set of symbols.")
    return char
end

"DNA 4bit symbol definitions."
const definitions_dna_4bit = [
    ('A', 0b0001),
    ('C', 0b0010),
    ('G', 0b0100),
    ('T', 0b1000),
]

const alphabet_dna_4bit = Alphabet(definitions_dna_4bit)

lookup_encoding(alphabet_dna_4bit, 'A')
lookup_encoding(alphabet_dna_4bit, 'C')
lookup_encoding(alphabet_dna_4bit, 'G')
lookup_encoding(alphabet_dna_4bit, 'T')
lookup_char(alphabet_dna_4bit, 0b0001)
lookup_char(alphabet_dna_4bit, 0b0010)
lookup_char(alphabet_dna_4bit, 0b0100)
lookup_char(alphabet_dna_4bit, 0b1000)

"DNA 2bit symbol definitions."
const definitions_dna_2bit = [
    ('A', 0b10),
    ('C', 0b01),
    ('G', 0b11),
    ('T', 0b00),
]

const alphabet_dna_2bit = Alphabet(definitions_dna_2bit)

lookup_encoding(alphabet_dna_2bit, 'A')
lookup_encoding(alphabet_dna_2bit, 'C')
lookup_encoding(alphabet_dna_2bit, 'G')
lookup_encoding(alphabet_dna_2bit, 'T')
lookup_char(alphabet_dna_2bit, 0b10)
lookup_char(alphabet_dna_2bit, 0b01)
lookup_char(alphabet_dna_2bit, 0b11)
lookup_char(alphabet_dna_2bit, 0b00)

BitsPerElem(::Type{DNA{TwoBit}}) = 2 # Actual TwoBit.
BitsPerElem(::Type{DNA{4}}) = 4

alphabet(::Type{DNA{TwoBit}}) = alphabet_dna_2bit
alphabet(::Type{DNA{4}}) = alphabet_dna_4bit

function (::Type{S})(bits::UInt8) where S <:BioSymbol
    has_encoding(alphabet(S), bits) || error("Not in the Alphabet's set of symbols.")
    return reinterpret(S, bits)
end

function (::Type{S})(char::Char) where S <:BioSymbol
    encoding = lookup_encoding(alphabet(S), char) # retrieving encoding, should throw error if char is not in the alphabet's set of symbols.
    return reinterpret(S, encoding)
end

function Base.convert(char::Type{Char}, symbol::S) where S <: BioSymbol
    encoding = reinterpret(UInt8, symbol)
    return lookup_char(alphabet(S), encoding)
end

function Base.convert(::Type{S}, char::Char) where S<:DNA #TODO: the body of this convert is the same as the constructor.
    encoding = lookup_encoding(alphabet(S), char)
    return reinterpret(S, encoding)
end

encoding = lookup_encoding(alphabet(DNA{TwoBit}), 'C')
encoding = lookup_encoding(alphabet(DNA{TwoBit}), 'c')
DNA{TwoBit}('c')

convert(DNA{TwoBit}, 'A')

convert(Char, DNA{TwoBit}('A'))

collect(DNA{TwoBit}, ['G', 'A', 'T', 'C'])

encoding = lookup_encoding(alphabet(DNA{4}), 'c')
DNA{4}('c')

convert(DNA{4}, 'A')

convert(Char, DNA{4}('A'))

collect(DNA{4}, ['G', 'A', 'T', 'C'])


"Default outward facing DNA symbols with 4bit encoding."
for (char, encoding) in definitions_dna_4bit
    var = Symbol("DNA_", char)
    @eval begin
        const $(var) = reinterpret(DNA{4}, $(encoding))
    end
end

da = DNA_A
typeof(da)

reinterpret(UInt8, DNA_A)
reinterpret(UInt8, DNA_C)
reinterpret(UInt8, DNA_T)
reinterpret(UInt8, DNA_G)

function Base.convert(::Type{SA}, s::SB) where {SA <: BioSymbol, SB <: BioSymbol}
    char_b = convert(Char, s) #Note: converting between encodings through common Char.
    return convert(SA, char_b)
end


d2 = convert(DNA{TwoBit}, DNA_A)
d4 = convert(DNA{4}, DNA{TwoBit}('A'))

bitstring(d2)
bitstring(d4)


"""
Compress symbol encoding.
Retrieve important bits from encoded byte.
"""
function compress(nt::S)  where S <: BioSymbol

    bv = BitVector()
    bv.len = BitsPerElem(S)
    push!(bv.chunks, reinterpret(UInt8, nt))

    return bv
end

compress(DNA_A)
compress(DNA{TwoBit}('A'))


"""
Illustrative container.
This would be LongSequence, NTupleKMer, etc.
"""
struct CompressedSymbols{S<:BioSymbol} <: BioSequence
    bv::BitVector
    function CompressedSymbols{S}(nts::Vector{S}) where S <: BioSymbol
        return new(pack(CompressedSymbols{S}, nts::Vector{S})) # Note: this is where you could dispatch to different packers.
    end
end

function CompressedSymbols(nts::Vector{S}) where S <: BioSymbol
    return CompressedSymbols{S}(nts)
end

function CompressedSymbols{S}(nts) where S <: BioSymbol
    return CompressedSymbols{S}(collect(S, nts))
end

"Quick container trait."
ElemOrder(::Type{CompressedSymbols}) = true # LeftToRight, RightToLeft.

"""
Pack symbols into a specified container type.
"""
function pack(::Type{CompressedSymbols{S}}, nts::Vector{S}) where S <: BioSymbol

    #TODO: explore IOBuffer for writing chunks.
    #TODO: explore the TanscodingStreams.jl paradigm. This packer/compressor looks like a codec.

    N = BitsPerElem(S)

    bv = BitVector(undef,length(nts) * N)

    nts = compress.(nts)

    if ElemOrder(CompressedSymbols) # TODO: dispatch to the best packing algorithm based on this trait.
        reverse!.(nts) #Note: reverse makes bits easier to reconcile at the moment.
    end

    i = 1
    for bits in nts
        bv[i:i+N-1] .= bits
        i = i + N
    end

    return bv
end


# A: 0b10
# C: 0b01
# G: 0b11
# T: 0b00
# A: 0b10
packed2 = CompressedSymbols(collect(DNA{TwoBit}, [ 'A', 'C', 'G', 'T', 'A']))
reshape(packed2.bv, 2, :)'


# A: 0b0001
# C: 0b0010
# G: 0b0100
# T: 0b1000
# A: 0b0001
packed4 = CompressedSymbols([DNA_A, DNA_C, DNA_G, DNA_T, DNA_A])
reshape(packed4.bv, 4, :)'

"""
Move compressed intsencoding back into a byte.
"""
function Base.UInt8(bv::BitVector)

    length(bv) <= 8 || error()

    p = Ptr{UInt8}(pointer(bv.chunks))
    return unsafe_load(p) #TODO: not sure whether the returned value is tracked for GC.
end

"""
Unpack compressed symbols.
"""
function unpack(container::CompressedSymbols{S}) where S <: BioSymbol

    n = BitsPerElem(S)
    L = length(container.bv)

    bvs = [container.bv[lo:hi] for (lo, hi) in zip(1:n:L, n:n:L)]

    if ElemOrder(CompressedSymbols) # TODO: dispatch to the best packing algorithm based on this trait.
        reverse!.(bvs) #Note: reverse makes bits easier to reconcile at the moment.
    end

    ints = UInt8.(bvs) #TODO: move into a constructor?

    return S.(ints) #Note: going through the symbol constructor, which checks whther the encoding is in the Alphabet's set of Symbols.
end

@time unpack(packed2)
@time unpack(packed4)

@time packed = CompressedSymbols{DNA{TwoBit}}(DNA{4}.(['A','C','G','T','A']))
@time unpack(packed)

@time packed = CompressedSymbols{DNA{TwoBit}}("ACGTA")
@time unpack(packed)

@time packed = CompressedSymbols{DNA{4}}("ACGTA")
@time unpack(packed)
	#=
	The purpose of this gist is to explore what a high-level interface or workflow might look like with `BioSymbols` as parametric primitive types.
	For ease of conceptual development of a potential high-level interface, I have used `BitVector`s at the middle-level.
	I have made no attempt in this example at low-level optimisations.
	Operations performed with the `BitVector` would be replaced by optimised bit shiting procedures.
	Ideally, the packing and unpacking methods would be refactored into iterators.
	=#

	abstract type BioSymbolEncoding end

	abstract type BioSymbol end

	abstract type NucleicAcid <: BioSymbol end

	struct TwoBit <: BioSymbolEncoding end

	primitive type DNA{E} <: NucleicAcid 8 end

	abstract type BioSequence end # The container of compressed symbol encodings.

	prefix(::DNA) = "DNA"

	Base.bitstring(nt::DNA) = bitstring(reinterpret(UInt8, nt))

	function Base.show(io::IO, symbol::S) where S<: BioSymbol
	write(io, prefix(symbol), '_', lookup_char(alphabet(S), reinterpret(UInt8, symbol)))
	return nothing
	end

	function Base.print(io::IO, symbol::BioSymbol)
	write(io, lookup_char(alphabet(S), reinterpret(UInt8, symbol)))
	return nothing
	end

	function createlookupvectors(definitions; empty_char = typemax(Char), empty_encoding = typemax(UInt8))

	# Determine maximum required lengths.
	max_encoding = maximum(last.(definitions)) + 1 #Note: adjusting for offset pointer access.
	max_char = codepoint(maximum(first.(definitions)))

	# Setup lookup vectors.
	encoding_to_char = fill(empty_char, length(0b0:max_encoding)) #Note: assuming typemax sufficiently out of bounds.
	char_to_encoding = fill(empty_encoding, length(0b0:max_char)) #Note: assuming typemax sufficiently out of bounds.

	# Place information into vector.
	for (char, encoding) in definitions
	cp = codepoint(char)
	encoding_to_char[encoding + 1] = cp # TODO: is there a more efficient method of loading via pointer offest?
	char_to_encoding[cp] = encoding

	# p_encoding_to_char = Ptr{Char}(pointer(encoding_to_char) + encoding * 8) #TODO: can we assume contiguous addressing here?
	# unsafe_store!(p_encoding_to_char, cp)
	end

	return encoding_to_char, char_to_encoding, empty_char, empty_encoding
	end

	struct Alphabet
	encoding_to_char # Note: larger at the top.
	char_to_encoding
	empty_char
	empty_encoding

	function Alphabet(table)
	return new(createlookupvectors(table)...)
	end
	end

	function _encoding(a::Alphabet, c::Char)
	encoding = a.char_to_encoding[codepoint(c)]
	return encoding
	end

	function _char(a::Alphabet, encoding::UInt8)
	encoding = a.encoding_to_char[encoding + 1]
	# p = Ptr{Char}(pointer(a.encoding_to_char) + encoding * 8) #TODO: can we assume contiguous addressing here?
	# return unsafe_load(p) #TODO: not sure whether this is tracked for GC.
	end

	function has_encoding(a::Alphabet, bits::UInt8)
	char = lookup_char(a, bits)
	resolved = lookup_encoding(a, char)
	return bits == resolved # Note: Determines whether we're back where we started.
	end

	function has_char(a::Alphabet, char::Char)
	encoding = lookup_encoding(a, char)
	resolved = lookup_char(a, encoding)
	return char == resolved # Note: Determines whether we're back where we started.
	end

	"Retrieve symbol char."
	function lookup_encoding(a::Alphabet, c::Char)
	c = uppercase(c) #TODO: how safe do we want to be? What about having a longer lookup vector?
	encoding = _encoding(a, c)# function _char(a::Alphabet{BumpedEncoding{false}}, encoding::UInt8)
	encoding == a.empty_encoding && error("Not in the Alphabet's set of symbols.")
	return encoding
	end

	"Retrieve symbol encoding."
	function lookup_char(a::Alphabet, encoding::UInt8)
	char = _char(a, encoding)
	char == a.empty_char && error("Not in the Alphabet's set of symbols.")
	return char
	end

	"DNA 4bit symbol definitions."
	const definitions_dna_4bit = [
	('A', 0b0001),
	('C', 0b0010),
	('G', 0b0100),
	('T', 0b1000),
	]

	const alphabet_dna_4bit = Alphabet(definitions_dna_4bit)

	lookup_encoding(alphabet_dna_4bit, 'A')
	lookup_encoding(alphabet_dna_4bit, 'C')
	lookup_encoding(alphabet_dna_4bit, 'G')
	lookup_encoding(alphabet_dna_4bit, 'T')
	lookup_char(alphabet_dna_4bit, 0b0001)
	lookup_char(alphabet_dna_4bit, 0b0010)
	lookup_char(alphabet_dna_4bit, 0b0100)
	lookup_char(alphabet_dna_4bit, 0b1000)

	"DNA 2bit symbol definitions."
	const definitions_dna_2bit = [
	('A', 0b10),
	('C', 0b01),
	('G', 0b11),
	('T', 0b00),
	]

	const alphabet_dna_2bit = Alphabet(definitions_dna_2bit)

	lookup_encoding(alphabet_dna_2bit, 'A')
	lookup_encoding(alphabet_dna_2bit, 'C')
	lookup_encoding(alphabet_dna_2bit, 'G')
	lookup_encoding(alphabet_dna_2bit, 'T')
	lookup_char(alphabet_dna_2bit, 0b10)
	lookup_char(alphabet_dna_2bit, 0b01)
	lookup_char(alphabet_dna_2bit, 0b11)
	lookup_char(alphabet_dna_2bit, 0b00)

	BitsPerElem(::Type{DNA{TwoBit}}) = 2 # Actual TwoBit.
	BitsPerElem(::Type{DNA{4}}) = 4

	alphabet(::Type{DNA{TwoBit}}) = alphabet_dna_2bit
	alphabet(::Type{DNA{4}}) = alphabet_dna_4bit

	function (::Type{S})(bits::UInt8) where S <:BioSymbol
	has_encoding(alphabet(S), bits) \|\| error("Not in the Alphabet's set of symbols.")
	return reinterpret(S, bits)
	end

	function (::Type{S})(char::Char) where S <:BioSymbol
	encoding = lookup_encoding(alphabet(S), char) # retrieving encoding, should throw error if char is not in the alphabet's set of symbols.
	return reinterpret(S, encoding)
	end

	function Base.convert(char::Type{Char}, symbol::S) where S <: BioSymbol
	encoding = reinterpret(UInt8, symbol)
	return lookup_char(alphabet(S), encoding)
	end

	function Base.convert(::Type{S}, char::Char) where S<:DNA #TODO: the body of this convert is the same as the constructor.
	encoding = lookup_encoding(alphabet(S), char)
	return reinterpret(S, encoding)
	end

	encoding = lookup_encoding(alphabet(DNA{TwoBit}), 'C')
	encoding = lookup_encoding(alphabet(DNA{TwoBit}), 'c')
	DNA{TwoBit}('c')

	convert(DNA{TwoBit}, 'A')

	convert(Char, DNA{TwoBit}('A'))

	collect(DNA{TwoBit}, ['G', 'A', 'T', 'C'])

	encoding = lookup_encoding(alphabet(DNA{4}), 'c')
	DNA{4}('c')

	convert(DNA{4}, 'A')

	convert(Char, DNA{4}('A'))

	collect(DNA{4}, ['G', 'A', 'T', 'C'])


	"Default outward facing DNA symbols with 4bit encoding."
	for (char, encoding) in definitions_dna_4bit
	var = Symbol("DNA_", char)
	@eval begin
	const $(var) = reinterpret(DNA{4}, $(encoding))
	end
	end

	da = DNA_A
	typeof(da)

	reinterpret(UInt8, DNA_A)
	reinterpret(UInt8, DNA_C)
	reinterpret(UInt8, DNA_T)
	reinterpret(UInt8, DNA_G)

	function Base.convert(::Type{SA}, s::SB) where {SA <: BioSymbol, SB <: BioSymbol}
	char_b = convert(Char, s) #Note: converting between encodings through common Char.
	return convert(SA, char_b)
	end


	d2 = convert(DNA{TwoBit}, DNA_A)
	d4 = convert(DNA{4}, DNA{TwoBit}('A'))

	bitstring(d2)
	bitstring(d4)


	"""
	Compress symbol encoding.
	Retrieve important bits from encoded byte.
	"""
	function compress(nt::S) where S <: BioSymbol

	bv = BitVector()
	bv.len = BitsPerElem(S)
	push!(bv.chunks, reinterpret(UInt8, nt))

	return bv
	end

	compress(DNA_A)
	compress(DNA{TwoBit}('A'))


	"""
	Illustrative container.
	This would be LongSequence, NTupleKMer, etc.
	"""
	struct CompressedSymbols{S<:BioSymbol} <: BioSequence
	bv::BitVector
	function CompressedSymbols{S}(nts::Vector{S}) where S <: BioSymbol
	return new(pack(CompressedSymbols{S}, nts::Vector{S})) # Note: this is where you could dispatch to different packers.
	end
	end

	function CompressedSymbols(nts::Vector{S}) where S <: BioSymbol
	return CompressedSymbols{S}(nts)
	end

	function CompressedSymbols{S}(nts) where S <: BioSymbol
	return CompressedSymbols{S}(collect(S, nts))
	end

	"Quick container trait."
	ElemOrder(::Type{CompressedSymbols}) = true # LeftToRight, RightToLeft.

	"""
	Pack symbols into a specified container type.
	"""
	function pack(::Type{CompressedSymbols{S}}, nts::Vector{S}) where S <: BioSymbol

	#TODO: explore IOBuffer for writing chunks.
	#TODO: explore the TanscodingStreams.jl paradigm. This packer/compressor looks like a codec.

	N = BitsPerElem(S)

	bv = BitVector(undef,length(nts) * N)

	nts = compress.(nts)

	if ElemOrder(CompressedSymbols) # TODO: dispatch to the best packing algorithm based on this trait.
	reverse!.(nts) #Note: reverse makes bits easier to reconcile at the moment.
	end

	i = 1
	for bits in nts
	bv[i:i+N-1] .= bits
	i = i + N
	end

	return bv
	end


	# A: 0b10
	# C: 0b01
	# G: 0b11
	# T: 0b00
	# A: 0b10
	packed2 = CompressedSymbols(collect(DNA{TwoBit}, [ 'A', 'C', 'G', 'T', 'A']))
	reshape(packed2.bv, 2, :)'


	# A: 0b0001
	# C: 0b0010
	# G: 0b0100
	# T: 0b1000
	# A: 0b0001
	packed4 = CompressedSymbols([DNA_A, DNA_C, DNA_G, DNA_T, DNA_A])
	reshape(packed4.bv, 4, :)'

	"""
	Move compressed intsencoding back into a byte.
	"""
	function Base.UInt8(bv::BitVector)

	length(bv) <= 8 \|\| error()

	p = Ptr{UInt8}(pointer(bv.chunks))
	return unsafe_load(p) #TODO: not sure whether the returned value is tracked for GC.
	end

	"""
	Unpack compressed symbols.
	"""
	function unpack(container::CompressedSymbols{S}) where S <: BioSymbol

	n = BitsPerElem(S)
	L = length(container.bv)

	bvs = [container.bv[lo:hi] for (lo, hi) in zip(1:n:L, n:n:L)]

	if ElemOrder(CompressedSymbols) # TODO: dispatch to the best packing algorithm based on this trait.
	reverse!.(bvs) #Note: reverse makes bits easier to reconcile at the moment.
	end

	ints = UInt8.(bvs) #TODO: move into a constructor?

	return S.(ints) #Note: going through the symbol constructor, which checks whther the encoding is in the Alphabet's set of Symbols.
	end

	@time unpack(packed2)
	@time unpack(packed4)

	@time packed = CompressedSymbols{DNA{TwoBit}}(DNA{4}.(['A','C','G','T','A']))
	@time unpack(packed)

	@time packed = CompressedSymbols{DNA{TwoBit}}("ACGTA")
	@time unpack(packed)

	@time packed = CompressedSymbols{DNA{4}}("ACGTA")
	@time unpack(packed)