pluser/entropy.jl

## entropy.jl
#-*- mode: julia-mode -*-

if abspath(PROGRAM_FILE) == @__FILE__
    using ArgParse
    using HDF5
    using CSV
end

using Memoize


#=
Measure the distance of specified vector.
data is the time series 1 dimensional data,
m is dimension of embedding dimension,
idx and jdx is the first element of vector to compair.
=#
function dist(data::AbstractVector{T}, m::Int, idx::Int, jdx::Int)::T where{T<:AbstractFloat}
    return maximum(abs.(data[idx:idx+m-1] .- data[jdx:jdx+m-1]))
end

"""
    within_array(data, dimension, tolerance, index)

Consider a vector sequence of m-dimensional embeddings
from 1-dimensional data. It determine whether a point
inside the radius r of the idxth point can be contained,
and return the result as an boolean array.
The result always contains one or more true values
because it includes itself.

The implementation uses the fact that as the dimension increases,
the distances of the points are getting wider.
"""
@memoize function within_array(data::AbstractVector{T}, m::Int, r::T, idx::Int)::Vector{Bool} where{T<:AbstractFloat}
    if m > 1
        N::Int = length(data) - m + 1
        need_calc = copy(within_array(data, m-1, r, idx))

        for i in 1:N
            if need_calc[i]
                need_calc[i] = abs(data[i+m-1] - data[idx+m-1]) <= r
            end
        end

        resize!(need_calc, N)
        return need_calc
    else
        return abs.(data .- data[idx]) .<= r
    end
end

#=
Count number of vectors which is matched with template.
As of including index vector itself, return value is always one or more.
data is the time series 1 dimensional data,
m is dimension of embedding dimension,
r is tolerance for counting.
=#
function count_matching(data::AbstractVector{<:AbstractFloat}, m::Int, r::AbstractFloat, idx::Int)::Int
    N::Int = length(data) - m + 1

    function naive_method()::Int
        counter::Int = 0
        for j in 1:N
            if dist(data, m, idx, j) <= r
                counter += 1
            end
        end
        return counter
    end

    function opt_method()::Int
        return count(within_array(data, m, r, idx))
    end

    #@assert naive_method() == opt_method()

    return opt_method()
end

#=
Sample Entropy.
The parameters L, m, and r must be fixed for each calculation.
L is the time series data, m is the length of
sequences to be compared, and r is the tolerance for
accepting matches.
Implement is based on https://doi.org/10.1152/ajpheart.2000.278.6.H2039
=#
function sampen(L::AbstractVector{<:AbstractFloat}, m::Int, r::AbstractFloat)
    N::Int = length(L)

    #tmpL = @view L[1:N-1]
    bimr_n = (i) -> begin
        count_matching(L, m, r, i) - 1
    end
    bmr_n = () -> begin
        sum::Int = 0
        for i in 1:N-m
            sum += bimr_n(i)
        end
        # the last vector should not be counted.
        sum -= 2*count_matching(L, m, r, N - m + 1)
        return sum
    end

    aimr_n = (i) -> count_matching(L, m + 1, r, i) - 1
    amr_n = () -> begin
        sum::Int = 0
        for i in 1:N-m
            sum += aimr_n(i)
        end
        return sum
    end

    return -log(amr_n()/bmr_n())
end

#=
Approximate Entropy.
The parameters N, m, and r must be fixed for each calculation.
N is the length of the time series, m is the length of
sequences to be compared, and r is the tolerance for
accepting matches.
Implement is based on https://doi.org/10.1007/BF01619355
=#
function apen(L::Array{<:AbstractFloat,1}, m::Int, r::AbstractFloat)
    N::Int = length(L)

    function cimr(m, idx)
        return count_matching(L, m, r, idx) / (N - m + 1)
    end

    function phi(m, r)
        sum::Float32 = 0
        for i in 1 : (N - m + 1)
            sum += log(cimr(m, i))
        end
        return sum / (N - m + 1)
    end

    return phi(m, r) - phi(m + 1, r)
end

#=
Kolmogorov Sinai Entropy.
KS-entropy is identical as taking limits of Approximate Entropy.
The parameters N, m, and r must be fixed for each calculation.
N is the length of the time series, m is the length of
sequences to be compared, and r is the tolerance for
accepting matches.
Implement is based on https://doi.org/10.1016/0370-1573(93)90012-3
=#
function ksen(L::Array{<:AbstractFloat,1}, m::Int, r::AbstractFloat)
    error("Not Implemented")
end


if abspath(PROGRAM_FILE) == @__FILE__

    function cmd_entrypoint()
        method_map = Dict([("sampen", sampen), ("apen", apen), ("ksen", ksen)])
        s = ArgParseSettings()
        s.description = "A calculator of Sample/Approximate/KS entropy"
        s.version = "1.0"

        add_arg_table(
        s,
        ["--input", "-i"],
        Dict(
            :help => "filename for the time series data.\nspecial arg '-' means read from stdin",
            :arg_type => String,
            :required => true
        ))

        add_arg_table(
        s,
        ["--method", "-w"],
        Dict(
            :help => "method to calculate entropy.\nit should be one of 'sampen', 'apen', 'ksen'.",
            :arg_type => String,
            :range_tester => (x -> x in keys(method_map)),
            :required => true
        ))

        add_arg_table(
        s,
        ["--dimension", "--dim", "-d", "-m"],
        Dict(
            :help => "dimension for embedding",
            :arg_type => Int,
            :required => true
        ))

        add_arg_table(
        s,
        ["--tolerance", "--tor", "-r"],
        Dict(
            :help => "tolerance for matching vertors",
            :arg_type => Float32,
            :required => true
        ))

        args = parse_args(s)

        if args["input"] != "-"
            # read from file
            filename = args["input"]
            h5open(filename, "r") do fd
                tsdata = read(fd, "tsdata")
            end
        else
            # read from stdin
            tsdata = parse.(Float32, split(readline(stdin), (' ', ',')))
        end

        func = method_map[args["method"]]
        dim = args["dimension"]
        tol = args["tolerance"]

        result = func(tsdata, dim, tol)
        println(result)
    end

    cmd_entrypoint()
end
	#-- mode: julia-mode --

	if abspath(PROGRAM_FILE) == @__FILE__
	using ArgParse
	using HDF5
	using CSV
	end

	using Memoize



	#=
	Measure the distance of specified vector.
	data is the time series 1 dimensional data,
	m is dimension of embedding dimension,
	idx and jdx is the first element of vector to compair.
	=#
	function dist(data::AbstractVector{T}, m::Int, idx::Int, jdx::Int)::T where{T<:AbstractFloat}
	return maximum(abs.(data[idx:idx+m-1] .- data[jdx:jdx+m-1]))
	end

	"""
	within_array(data, dimension, tolerance, index)

	Consider a vector sequence of m-dimensional embeddings
	from 1-dimensional data. It determine whether a point
	inside the radius r of the idxth point can be contained,
	and return the result as an boolean array.
	The result always contains one or more true values
	because it includes itself.

	The implementation uses the fact that as the dimension increases,
	the distances of the points are getting wider.
	"""
	@memoize function within_array(data::AbstractVector{T}, m::Int, r::T, idx::Int)::Vector{Bool} where{T<:AbstractFloat}
	if m > 1
	N::Int = length(data) - m + 1
	need_calc = copy(within_array(data, m-1, r, idx))

	for i in 1:N
	if need_calc[i]
	need_calc[i] = abs(data[i+m-1] - data[idx+m-1]) <= r
	end
	end

	resize!(need_calc, N)
	return need_calc
	else
	return abs.(data .- data[idx]) .<= r
	end
	end

	#=
	Count number of vectors which is matched with template.
	As of including index vector itself, return value is always one or more.
	data is the time series 1 dimensional data,
	m is dimension of embedding dimension,
	r is tolerance for counting.
	=#
	function count_matching(data::AbstractVector{<:AbstractFloat}, m::Int, r::AbstractFloat, idx::Int)::Int
	N::Int = length(data) - m + 1

	function naive_method()::Int
	counter::Int = 0
	for j in 1:N
	if dist(data, m, idx, j) <= r
	counter += 1
	end
	end
	return counter
	end

	function opt_method()::Int
	return count(within_array(data, m, r, idx))
	end

	#@assert naive_method() == opt_method()

	return opt_method()
	end

	#=
	Sample Entropy.
	The parameters L, m, and r must be fixed for each calculation.
	L is the time series data, m is the length of
	sequences to be compared, and r is the tolerance for
	accepting matches.
	Implement is based on https://doi.org/10.1152/ajpheart.2000.278.6.H2039
	=#
	function sampen(L::AbstractVector{<:AbstractFloat}, m::Int, r::AbstractFloat)
	N::Int = length(L)

	#tmpL = @view L[1:N-1]
	bimr_n = (i) -> begin
	count_matching(L, m, r, i) - 1
	end
	bmr_n = () -> begin
	sum::Int = 0
	for i in 1:N-m
	sum += bimr_n(i)
	end
	# the last vector should not be counted.
	sum -= 2*count_matching(L, m, r, N - m + 1)
	return sum
	end

	aimr_n = (i) -> count_matching(L, m + 1, r, i) - 1
	amr_n = () -> begin
	sum::Int = 0
	for i in 1:N-m
	sum += aimr_n(i)
	end
	return sum
	end

	return -log(amr_n()/bmr_n())
	end

	#=
	Approximate Entropy.
	The parameters N, m, and r must be fixed for each calculation.
	N is the length of the time series, m is the length of
	sequences to be compared, and r is the tolerance for
	accepting matches.
	Implement is based on https://doi.org/10.1007/BF01619355
	=#
	function apen(L::Array{<:AbstractFloat,1}, m::Int, r::AbstractFloat)
	N::Int = length(L)

	function cimr(m, idx)
	return count_matching(L, m, r, idx) / (N - m + 1)
	end

	function phi(m, r)
	sum::Float32 = 0
	for i in 1 : (N - m + 1)
	sum += log(cimr(m, i))
	end
	return sum / (N - m + 1)
	end

	return phi(m, r) - phi(m + 1, r)
	end

	#=
	Kolmogorov Sinai Entropy.
	KS-entropy is identical as taking limits of Approximate Entropy.
	The parameters N, m, and r must be fixed for each calculation.
	N is the length of the time series, m is the length of
	sequences to be compared, and r is the tolerance for
	accepting matches.
	Implement is based on https://doi.org/10.1016/0370-1573(93)90012-3
	=#
	function ksen(L::Array{<:AbstractFloat,1}, m::Int, r::AbstractFloat)
	error("Not Implemented")
	end


	if abspath(PROGRAM_FILE) == @__FILE__

	function cmd_entrypoint()
	method_map = Dict([("sampen", sampen), ("apen", apen), ("ksen", ksen)])
	s = ArgParseSettings()
	s.description = "A calculator of Sample/Approximate/KS entropy"
	s.version = "1.0"

	add_arg_table(
	s,
	["--input", "-i"],
	Dict(
	:help => "filename for the time series data.\nspecial arg '-' means read from stdin",
	:arg_type => String,
	:required => true
	))

	add_arg_table(
	s,
	["--method", "-w"],
	Dict(
	:help => "method to calculate entropy.\nit should be one of 'sampen', 'apen', 'ksen'.",
	:arg_type => String,
	:range_tester => (x -> x in keys(method_map)),
	:required => true
	))

	add_arg_table(
	s,
	["--dimension", "--dim", "-d", "-m"],
	Dict(
	:help => "dimension for embedding",
	:arg_type => Int,
	:required => true
	))

	add_arg_table(
	s,
	["--tolerance", "--tor", "-r"],
	Dict(
	:help => "tolerance for matching vertors",
	:arg_type => Float32,
	:required => true
	))

	args = parse_args(s)

	if args["input"] != "-"
	# read from file
	filename = args["input"]
	h5open(filename, "r") do fd
	tsdata = read(fd, "tsdata")
	end
	else
	# read from stdin
	tsdata = parse.(Float32, split(readline(stdin), (' ', ',')))
	end

	func = method_map[args["method"]]
	dim = args["dimension"]
	tol = args["tolerance"]

	result = func(tsdata, dim, tol)
	println(result)
	end

	cmd_entrypoint()
	end