#Multinomial naive Bayes in Julia, allowing for generic numeric types for
#the conditional probabilities. When using rational numbers, you can
#calculate exact probabilities without roundoff error.
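#
#A sketch of typical usage (see the worked test at the bottom of this file):
#
#    MNB = fit(MultinomialNaiveBayes, Rational, features, labels)
#    pr_true = predict(MNB, features)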
struct MultinomialNaiveBayes{T, V<:AbstractVector}
    feature_ratios::V #per-feature ratios P(feature | true) / P(feature | false)
    prior_ratio::T    #prior odds ratio P(true) / P(false)
end
"""
fit(MultinomialNaiveBayes, [T,] features, labels, α = 1) -> MNB
fits a `MultinomialNaiveBayes` classifier `MNB` using the
`features` matrix and `labels` vector of `Bool`s.
The number of columns in `features` is assumed to be the same as the
length of `labels`.
`T` is the working numeric type, which defaults to `Float32`.
Other numeric types like `T = Rational` are possible, which permit
testing exact values of conditional probabilities.
`α` is a additive smoothing parameter used for regularization,
with 1 being Laplace smoothing, 0.5 being Jeffrys smoothing,
and other values being the more general Lidstone smoothing.
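
The smoothed conditional probability of feature `j` in class `c` is
estimated as `(count_cj + α) / (total_c + α * n)`, where `count_cj` is the
total count of feature `j` over the samples in class `c`, `total_c` is the
total count of all features in class `c`, and `n` is the number of features.
The classifier stores the ratio of these estimates between the two classes.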
"""
function fit(::Type{MultinomialNaiveBayes{T, S}}, features, labels, α = 1) where
        {T<:Real, S<:AbstractVector{T}}
    m, n = size(features) #m samples (rows), n features (columns)
    feature_ratios = zeros(T, n) #conditional probability ratios

    #Calculate prior ratio
    counts = zeros(Int, 2)          #number of samples per class
    counts_category = zeros(Int, 2) #total feature counts per class
    for i in 1:m
        k = labels[i] ? 1 : 2
        counts[k] += 1
        counts_category[k] += sum(view(features, i, :))
    end
    prior_ratio = T(counts[1]) / T(counts[2])

    #Calculate feature ratios with additive (Lidstone) smoothing
    counts = zeros(Int, 2)
    for j in 1:n
        counts[1] = counts[2] = 0
        for i in 1:m
            k = labels[i] ? 1 : 2
            counts[k] += features[i, j]
        end
        feature_ratios[j] = (T(counts[1] + α) / T(counts_category[1] + α * n)) /
                            (T(counts[2] + α) / T(counts_category[2] + α * n))
    end

    return MultinomialNaiveBayes{T, S}(feature_ratios, prior_ratio)
end
#Default working numeric type
fit(::Type{MultinomialNaiveBayes}, features, labels, α = 1) =
    fit(MultinomialNaiveBayes, Float32, features, labels, α)

#Expand out the working numeric type into the parameters of MultinomialNaiveBayes
fit(::Type{MultinomialNaiveBayes}, T::Type{R}, features, labels, α = 1) where R<:Real =
    fit(MultinomialNaiveBayes{T, Vector{T}}, features, labels, α)
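
#For example, `fit(MultinomialNaiveBayes, features, labels)` and
#`fit(MultinomialNaiveBayes, Float32, features, labels)` fit the same
#Float32 model backed by a `Vector{Float32}` of feature ratios.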
"""
predict(MNB, [T,] features) -> pr_true
predicts the probabilities of the label being `true` or `false`
from the `MultinomialNaiveBayes` classifier `MNB` and the `features`.
The optional numeric type `T` specifies the working type to calculate
the return probabilities, which can differ from the numeric type used
to train the `MNB` model.
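
Internally, `predict` computes the posterior odds ratio
`pr = P(true | features) / P(false | features)` with `predictratio`
and returns the probability `pr / (1 + pr)`.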
"""
function predict(MNB::MultinomialNaiveBayes, T, features)
    pr = predictratio(MNB, T, features)
    return pr / (1 + pr)
end

#Posterior odds ratio P(true | features) / P(false | features)
function predictratio(MNB::MultinomialNaiveBayes, T, features)
    #Initialize the probability ratio with the prior
    pr = T(MNB.prior_ratio)
    for (j, x) in enumerate(features)
        if x != 0
            pr *= T(MNB.feature_ratios[j])^x
        end
    end
    return pr
end
"""
predictlogratio(MNB, [T,] features) -> log_ratio
predicts the log of the ratio of probabilities that the label is
`true` or `false` from the `MultinomialNaiveBayes` classifier `MNB`
and the `features`.
The optional numeric type `T` specifies the working type to calculate
the return probabilities, which can differ from the numeric type used
to train the `MNB` model.
This function will be slower and restricted to fewer numeric types, but
can offer better precision when extremely small or large ratios of
conditional probabilities exist.
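
Up to roundoff, `exp(predictlogratio(MNB, T, features))` equals
`predictratio(MNB, T, features)`, as checked in the test below.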
"""
function predictlogratio(MNB::MultinomialNaiveBayes, T, features)
    #Initialize the log probability ratio with the log prior
    logpr = T(log(MNB.prior_ratio))
    for (j, x) in enumerate(features)
        if x != 0
            logpr += x * T(log(MNB.feature_ratios[j]))
        end
    end
    return logpr
end
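
#A probability can be recovered from the log ratio via the logistic
#function, since pr/(1+pr) == 1/(1+exp(-log(pr))). A convenience sketch
#(the name `predictfromlogratio` is not part of the original API):
predictfromlogratio(MNB::MultinomialNaiveBayes, T, features) =
    1 / (1 + exp(-predictlogratio(MNB, T, features)))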
#The default working numeric type is the type used to train the classifier
for f in (:predict, :predictratio, :predictlogratio)
    @eval begin
        $f(MNB::MultinomialNaiveBayes{T}, features) where T =
            $f(MNB, T, features)
    end
end
##############################################################################
# A simple test
##############################################################################
#Example from
#https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html
using SparseArrays

featureset = sparse(
    #Chinese Beijing Shanghai Macao Japan Tokyo
    [2 1 0 0 0 0
     2 0 1 0 0 0
     1 0 0 1 0 0
     1 0 0 0 1 1])
labels = BitVector([true, true, true, false])

MNB = fit(MultinomialNaiveBayes, Rational, featureset, labels, 1)

#Exact posterior probability of the "yes" (true) label, worked by hand
upr_yes = 3//4 * (3//7)^3 * 1//14 * 1//14
upr_no  = 1//4 * (2//9)^3 * 2//9 * 2//9
pr_yes  = upr_yes // (upr_yes + upr_no)

#Test document: "Chinese Chinese Chinese Tokyo Japan"
testfeat = sparsevec([3, 0, 0, 0, 1, 1])
pred_yes = predict(MNB, Rational, testfeat)
@assert pred_yes == pr_yes
r = predictratio(MNB, Float32, testfeat)
lr = predictlogratio(MNB, Float32, testfeat)
@assert log(r) ≈ lr
@assert r/(1+r) ≈ pred_yes
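
#As a further sanity check (a sketch, not part of the original test), the
#default Float32 model should agree with the exact rational answer to
#within floating-point roundoff:
MNBf = fit(MultinomialNaiveBayes, featureset, labels, 1)
@assert predict(MNBf, testfeat) ≈ Float32(pr_yes)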