#Multinomial naive Bayes in Julia, allowing for generic numeric types for
#the conditional probabilities. When using rational numbers, you can
#calculate exact probabilities without roundoff error.
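#
#A sketch of typical usage (see the worked test at the bottom of this file):
#
#    MNB = fit(MultinomialNaiveBayes, Rational, features, labels)
#    pr_true = predict(MNB, features)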
struct MultinomialNaiveBayes{T, V<:AbstractVector}
    feature_ratios::V #per-feature ratios P(feature | true) / P(feature | false)
    prior_ratio::T    #prior odds ratio P(true) / P(false)
end
"""
fit(MultinomialNaiveBayes, [T,] features, labels, α = 1) -> MNB
fits a `MultinomialNaiveBayes` classifier `MNB` using the
`features` matrix and `labels` vector of `Bool`s.
The number of columns in `features` is assumed to be the same as the
length of `labels`.
`T` is the working numeric type, which defaults to `Float32`.
Other numeric types like `T = Rational` are possible, which permit
testing exact values of conditional probabilities.
`α` is a additive smoothing parameter used for regularization,
with 1 being Laplace smoothing, 0.5 being Jeffrys smoothing,
and other values being the more general Lidstone smoothing.
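
The smoothed conditional probability of feature `j` in class `c` is
estimated as `(count_cj + α) / (total_c + α * n)`, where `count_cj` is the
total count of feature `j` over the samples in class `c`, `total_c` is the
total count of all features in class `c`, and `n` is the number of features.
The classifier stores the ratio of these estimates between the two classes.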
"""
function fit(::Type{MultinomialNaiveBayes{T, S}}, features, labels, α = 1) where
        {T<:Real, S<:AbstractVector{T}}
    m, n = size(features) #m samples (rows), n features (columns)
    feature_ratios = zeros(T, n) #conditional probability ratios

    #Calculate prior ratio
    counts = zeros(Int, 2)          #number of samples per class
    counts_category = zeros(Int, 2) #total feature counts per class
    for i in 1:m
        k = labels[i] ? 1 : 2
        counts[k] += 1
        counts_category[k] += sum(view(features, i, :))
    end
    prior_ratio = T(counts[1]) / T(counts[2])

    #Calculate feature ratios with additive (Lidstone) smoothing
    counts = zeros(Int, 2)
    for j in 1:n
        counts[1] = counts[2] = 0
        for i in 1:m
            k = labels[i] ? 1 : 2
            counts[k] += features[i, j]
        end
        feature_ratios[j] = (T(counts[1] + α) / T(counts_category[1] + α * n)) /
                            (T(counts[2] + α) / T(counts_category[2] + α * n))
    end

    return MultinomialNaiveBayes{T, S}(feature_ratios, prior_ratio)
end
#Default working numeric type
fit(::Type{MultinomialNaiveBayes}, features, labels, α = 1) =
    fit(MultinomialNaiveBayes, Float32, features, labels, α)

#Expand out the working numeric type into the parameters of MultinomialNaiveBayes
fit(::Type{MultinomialNaiveBayes}, T::Type{R}, features, labels, α = 1) where R<:Real =
    fit(MultinomialNaiveBayes{T, Vector{T}}, features, labels, α)
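
#For example, `fit(MultinomialNaiveBayes, features, labels)` and
#`fit(MultinomialNaiveBayes, Float32, features, labels)` fit the same
#Float32 model backed by a `Vector{Float32}` of feature ratios.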
"""
predict(MNB, [T,] features) -> pr_true
predicts the probabilities of the label being `true` or `false`
from the `MultinomialNaiveBayes` classifier `MNB` and the `features`.
The optional numeric type `T` specifies the working type to calculate
the return probabilities, which can differ from the numeric type used
to train the `MNB` model.
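
Internally, `predict` computes the posterior odds ratio
`pr = P(true | features) / P(false | features)` with `predictratio`
and returns the probability `pr / (1 + pr)`.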
"""
function predict(MNB::MultinomialNaiveBayes, T, features)
    pr = predictratio(MNB, T, features)
    return pr / (1 + pr)
end

#Posterior odds ratio P(true | features) / P(false | features)
function predictratio(MNB::MultinomialNaiveBayes, T, features)
    #Initialize the probability ratio with the prior
    pr = T(MNB.prior_ratio)
    for (j, x) in enumerate(features)
        if x != 0
            pr *= T(MNB.feature_ratios[j])^x
        end
    end
    return pr
end
"""
predictlogratio(MNB, [T,] features) -> log_ratio
predicts the log of the ratio of probabilities that the label is
`true` or `false` from the `MultinomialNaiveBayes` classifier `MNB`
and the `features`.
The optional numeric type `T` specifies the working type to calculate
the return probabilities, which can differ from the numeric type used
to train the `MNB` model.
This function will be slower and restricted to fewer numeric types, but
can offer better precision when extremely small or large ratios of
conditional probabilities exist.
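
Up to roundoff, `exp(predictlogratio(MNB, T, features))` equals
`predictratio(MNB, T, features)`, as checked in the test below.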
"""
function predictlogratio(MNB::MultinomialNaiveBayes, T, features)
    #Initialize the log probability ratio with the log prior
    logpr = T(log(MNB.prior_ratio))
    for (j, x) in enumerate(features)
        if x != 0
            logpr += x * T(log(MNB.feature_ratios[j]))
        end
    end
    return logpr
end
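
#A probability can be recovered from the log ratio via the logistic
#function, since pr/(1+pr) == 1/(1+exp(-log(pr))). A convenience sketch
#(the name `predictfromlogratio` is not part of the original API):
predictfromlogratio(MNB::MultinomialNaiveBayes, T, features) =
    1 / (1 + exp(-predictlogratio(MNB, T, features)))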
#The default working numeric type is the type used to train the classifier
for f in (:predict, :predictratio, :predictlogratio)
    @eval begin
        $f(MNB::MultinomialNaiveBayes{T}, features) where T =
            $f(MNB, T, features)
    end
end
##############################################################################
# A simple test
##############################################################################
#Example from
#https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html
using SparseArrays

featureset = sparse(
    #Chinese Beijing Shanghai Macao Japan Tokyo
    [2 1 0 0 0 0
     2 0 1 0 0 0
     1 0 0 1 0 0
     1 0 0 0 1 1])
labels = BitVector([true, true, true, false])

MNB = fit(MultinomialNaiveBayes, Rational, featureset, labels, 1)

#Exact posterior probability of the "yes" (true) label, worked by hand
upr_yes = 3//4 * (3//7)^3 * 1//14 * 1//14
upr_no  = 1//4 * (2//9)^3 * 2//9 * 2//9
pr_yes  = upr_yes // (upr_yes + upr_no)

#Test document: "Chinese Chinese Chinese Tokyo Japan"
testfeat = sparsevec([3, 0, 0, 0, 1, 1])
pred_yes = predict(MNB, Rational, testfeat)
@assert pred_yes == pr_yes
r = predictratio(MNB, Float32, testfeat)
lr = predictlogratio(MNB, Float32, testfeat)
@assert log(r) ≈ lr
@assert r/(1+r) ≈ pred_yes
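
#As a further sanity check (a sketch, not part of the original test), the
#default Float32 model should agree with the exact rational answer to
#within floating-point roundoff:
MNBf = fit(MultinomialNaiveBayes, featureset, labels, 1)
@assert predict(MNBf, testfeat) ≈ Float32(pr_yes)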