Skip to content

Instantly share code, notes, and snippets.

@SeroviICAI
Created February 24, 2023 08:48
Show Gist options
  • Save SeroviICAI/6b43f572a03bad2edcc09f3680298efa to your computer and use it in GitHub Desktop.
Save SeroviICAI/6b43f572a03bad2edcc09f3680298efa to your computer and use it in GitHub Desktop.
Multinomial Naive Bayes classifier for text data with example on use.
import numpy as np
__all__ = ["MultinomialNaiveBayes"]
class MultinomialNaiveBayes:
    """
    Multinomial Naive Bayes classifier for text data.

    Parameters:
    ----------
    alpha: float, default=1.0
        Additive (Laplace) smoothing parameter; must be non-negative.
        0 disables smoothing. Without smoothing, a feature never seen in a
        class gets log-probability -inf, which can turn the scores of
        samples that do not contain that feature into NaN (0 * -inf).
    fit_prior: bool, default=True
        Whether to learn class prior probabilities from data or use uniform
        priors. Ignored when `class_prior` is given.
    class_prior: array-like of shape (n_classes,), default=None
        Prior probabilities of the classes, ordered like `classes_`
        (i.e. sorted unique labels). If None, the priors are chosen
        according to `fit_prior`.

    Attributes:
    ----------
    classes_: array of shape (n_classes,)
        Sorted unique class labels in the training data.
    class_log_prior / class_log_prior_: array of shape (n_classes,)
        Log prior probability of each class.
    feature_log_prob / feature_log_prob_: array of shape (n_classes, n_features)
        Log probability of each feature given each class.
    """

    def __init__(self, alpha=1.0, fit_prior=True, class_prior=None):
        self.alpha = alpha
        self.fit_prior = fit_prior
        self.class_prior = class_prior
        self.classes_ = None
        self.class_log_prior = None
        self.feature_log_prob = None

    @property
    def class_log_prior_(self):
        """Alias of `class_log_prior` under the documented trailing-underscore name."""
        return self.class_log_prior

    @property
    def feature_log_prob_(self):
        """Alias of `feature_log_prob` under the documented trailing-underscore name."""
        return self.feature_log_prob

    def fit(self, X, y):
        """
        Fit Naive Bayes classifier according to X, y.

        Parameters:
        ----------
        X: array-like of shape (n_samples, n_features)
            Training data (non-negative feature counts).
        y: array-like of shape (n_samples,)
            Target values. Any label type supported by `np.unique` works
            (integers, strings, ...).

        Returns:
        -------
        self: object
            Returns self.

        Raises:
        ------
        ValueError
            If `alpha` is negative, X is not 2-D, y is not 1-D, X and y
            disagree on the number of samples, or `class_prior` does not
            have one entry per class.
        """
        if self.alpha < 0:
            raise ValueError(f"alpha must be non-negative, got {self.alpha!r}")

        X, y = np.asarray(X), np.asarray(y)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features), got shape {X.shape}"
            )
        if y.ndim != 1:
            raise ValueError(f"y must be 1-D (n_samples,), got shape {y.shape}")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent numbers of samples: "
                f"{X.shape[0]} != {y.shape[0]}"
            )

        # Sorted unique labels; every per-class array below follows this order.
        self.classes_ = np.unique(y)
        self._update_feature_log_prob(X, y)
        self._update_class_log_prior(y)
        return self

    def predict(self, X):
        """
        Perform classification on an array of test vectors X.

        Parameters:
        ----------
        X: array-like of shape (n_samples, n_features)
            Test data.

        Returns:
        -------
        y_pred: array of shape (n_samples,)
            Predicted target values for X.
        """
        joint_log_likelihood = self._joint_log_likelihood(X)
        # The class with the highest unnormalized log posterior wins.
        return self.classes_[np.argmax(joint_log_likelihood, axis=1)]

    def predict_proba(self, X):
        """
        Return probability estimates for the test data X.

        Parameters:
        ----------
        X: array-like of shape (n_samples, n_features)
            Test data.

        Returns:
        -------
        y_prob: array of shape (n_samples, n_classes)
            Probability estimates; columns follow the order of `classes_`
            and each row sums to 1.
        """
        joint_log_likelihood = self._joint_log_likelihood(X)
        # Shift each row so its maximum is 0 before exponentiating. The shift
        # cancels in the normalization but prevents every class underflowing
        # to 0 (and the row becoming 0/0 = NaN) for samples with many counts.
        shifted = joint_log_likelihood - np.max(
            joint_log_likelihood, axis=1, keepdims=True
        )
        class_probs = np.exp(shifted)
        return class_probs / np.sum(class_probs, axis=1, keepdims=True)

    def _joint_log_likelihood(self, X):
        """
        Compute the unnormalized posterior log probability of X.

        Parameters:
        ----------
        X: array-like of shape (n_samples, n_features)
            Test data.

        Returns:
        -------
        joint_log_likelihood: array of shape (n_samples, n_classes)
            Unnormalized log probabilities.

        Raises:
        ------
        AttributeError
            If the classifier has not been fitted yet.
        """
        if self.feature_log_prob is None or self.class_log_prior is None:
            # Same exception type the unfitted model raised before, but with
            # a message that tells the caller what to do.
            raise AttributeError(
                "This MultinomialNaiveBayes instance is not fitted yet; "
                "call fit() before predicting."
            )
        X = np.asarray(X)
        # log P(c) + sum_j x_j * log P(feature_j | c), for all samples at once.
        return X @ self.feature_log_prob.T + self.class_log_prior

    def _update_feature_log_prob(self, X, y):
        """
        Estimate feature log probability from data.

        Parameters:
        ----------
        X: array-like of shape (n_samples, n_features)
            Training data.
        y: array-like of shape (n_samples,)
            Target values.

        Returns:
        -------
        None
        """
        X, y = np.asarray(X), np.asarray(y)
        n_features = X.shape[1]
        self.feature_log_prob = np.zeros((len(self.classes_), n_features))
        for i, c in enumerate(self.classes_):
            # Total count of every feature over the samples of class c.
            feature_counts = np.sum(X[y == c], axis=0)
            total_count = np.sum(feature_counts)
            # Smoothed relative frequency: alpha pseudo-counts per feature.
            self.feature_log_prob[i] = np.log(
                (feature_counts + self.alpha)
                / (total_count + self.alpha * n_features)
            )

    def _update_class_log_prior(self, y):
        """
        Estimate class log prior probability from data.

        Uses `class_prior` when it was supplied, otherwise the empirical
        class frequencies (`fit_prior=True`) or a uniform prior
        (`fit_prior=False`).

        Parameters:
        ----------
        y: array-like of shape (n_samples,)
            Target values.

        Returns:
        -------
        None

        Raises:
        ------
        ValueError
            If `class_prior` does not have shape (n_classes,).
        """
        n_classes = len(self.classes_)
        if self.class_prior is not None:
            prior = np.asarray(self.class_prior, dtype=float)
            if prior.shape != (n_classes,):
                raise ValueError(
                    f"class_prior must have shape ({n_classes},), "
                    f"got {prior.shape}"
                )
            self.class_log_prior = np.log(prior)
        elif self.fit_prior:
            y = np.asarray(y)
            # Count per entry of classes_ rather than np.bincount(y): bincount
            # only accepts non-negative integers and indexes by label value,
            # so labels such as [1, 2] or strings gave a misaligned result
            # or raised.
            class_counts = np.array(
                [np.count_nonzero(y == c) for c in self.classes_]
            )
            self.class_log_prior = np.log(class_counts / y.shape[0])
        else:
            self.class_log_prior = np.full(n_classes, -np.log(n_classes))
# Usage demo: fit the classifier on a toy count matrix, then label new rows.
# Each training row is a vector of four feature counts; the first four rows
# are labelled 0 and the last four are labelled 1.
X_train = np.array(
    [
        [1, 1, 0, 0],
        [1, 1, 0, 1],
        [0, 1, 1, 0],
        [0, 1, 1, 1],
        [1, 0, 0, 0],
        [1, 0, 0, 1],
        [0, 0, 1, 0],
        [0, 0, 1, 1],
    ]
)
y_train = np.array([0] * 4 + [1] * 4)

# fit() returns the estimator itself, so construction and training can chain.
clf = MultinomialNaiveBayes(alpha=1).fit(X_train, y_train)

# Three unseen rows to classify.
X_test = np.array(
    [
        [1, 0, 0, 0],
        [1, 1, 1, 1],
        [0, 0, 1, 0],
    ]
)
y_pred = clf.predict(X_test)
print(y_pred)  # Output: [1 0 1]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment