# An implementation of Gaussian naive Bayes with missing values.
# It has more or less the same interface as a sklearn classifier.
import collections
import math

import numpy as np


def get_prior(y, y_unique):
    # Class priors, ordered consistently with y_unique so that they can be
    # indexed by class position later on.
    y_counts = collections.Counter(y)
    return np.array([y_counts[key] / y.shape[0] for key in y_unique])
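
# For instance, with the toy labels used at the bottom of this file,
# get_prior(np.array([1, 0, 1, 0, 1, 0, 1, 0]), np.array([0, 1]))
# returns array([0.5, 0.5]): each class accounts for half of the samples.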


def get_means_and_variances(X, y, y_unique):
    # Per-class, per-feature means and unbiased variances, ignoring NaNs.
    n_features = X.shape[1]
    n_classes = y_unique.shape[0]
    means = np.ones((n_classes, n_features))
    variances = np.ones((n_classes, n_features))
    specific_input = dict()
    for out_class in y_unique:
        specific_input[out_class] = X[y == out_class]
    for i in range(n_classes):
        for j in range(n_features):
            temp = specific_input[y_unique[i]][:, j]
            temp = temp[~np.isnan(temp)]
            if temp.shape[0] <= 1:
                # Not enough observed values to estimate a variance.
                means[i, j] = 0
                variances[i, j] = 0
            else:
                means[i, j] = np.mean(temp)
                # Bessel's correction: equivalent to np.var(temp, ddof=1).
                variances[i, j] = np.var(temp) * (temp.shape[0] / (temp.shape[0] - 1))
    return means, variances
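
# For instance, the observed values [2, 4, 6] have mean 4, biased variance
# 8/3 and unbiased variance 4 (8/3 multiplied by 3/2), which is what the
# correction above computes.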


def get_gaussian(x, mean, variance):
    # Gaussian density N(x; mean, variance). A zero variance (not enough
    # observed values for this class/feature) contributes a neutral factor.
    if variance == 0:
        return 1.0
    return 1.0 / math.sqrt(2 * math.pi * variance) * \
        math.exp(-(x - mean) ** 2 / (2.0 * variance))
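
# Sanity check: at the mean of a unit-variance Gaussian the density is
# 1 / sqrt(2 * pi), so get_gaussian(0.0, 0.0, 1.0) is roughly 0.3989.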


def get_all_likelihoods(x, means, variances):
    # Class-conditional likelihood of one sample: the product of per-feature
    # Gaussian densities, skipping features that are missing (NaN).
    result = []
    for i in range(means.shape[0]):
        product = 1.0
        for j in range(means.shape[1]):
            if not np.isnan(x[j]):
                product *= get_gaussian(x[j], means[i, j], variances[i, j])
        result.append(product)
    return np.array(result)
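
# For a sample such as [2, nan, 4], only the first and third features enter
# the product; the missing value is simply left out rather than imputed.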


def get_all_posterior(likelihoods, prior):
    # Bayes' rule: the posterior is proportional to likelihood * prior,
    # normalized so that the probabilities sum to 1.
    total = 0
    res = []
    for i, likelihood in enumerate(likelihoods):
        temp = likelihood * prior[i]
        res.append(temp)
        total += temp
    res = np.array(res)
    if total != 0:
        res /= total
    return res
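
# For instance, likelihoods [0.2, 0.1] with a uniform prior [0.5, 0.5] give
# unnormalized values [0.1, 0.05] and posteriors [2/3, 1/3].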


class GaussianNBWithMissingValues:

    def __init__(self, **params):
        self.means = None
        self.variances = None
        self.prior = None
        self.y_unique = None
        if params:
            self.set_params(params)

    def fit(self, X, y):
        self.y_unique = np.unique(y)
        self.prior = get_prior(y, self.y_unique)
        self.means, self.variances = get_means_and_variances(X, y, self.y_unique)
        return self

    def predict_proba(self, X):
        likelihoods = np.apply_along_axis(
            lambda x: get_all_likelihoods(x, self.means, self.variances), 1, X)
        proba = np.apply_along_axis(
            lambda x: get_all_posterior(x, self.prior), 1, likelihoods)
        return proba

    def predict(self, X):
        proba = self.predict_proba(X)
        indexes = np.argmax(proba, axis=1)
        res = []
        for index in indexes:
            res.append(self.y_unique[index])
        return np.array(res)

    def score(self, X, y):
        predictions = self.predict(X)
        return sum(predictions == y) / y.shape[0]

    def get_params(self, deep=True):
        if self.means is None:
            return {"means": None,
                    "variances": None,
                    "prior": None,
                    "y_unique": None}
        return {"means": self.means.copy(),
                "variances": self.variances.copy(),
                "prior": self.prior.copy(),
                "y_unique": self.y_unique.copy()}

    def set_params(self, params):
        self.means = params["means"]
        self.variances = params["variances"]
        self.prior = params["prior"]
        self.y_unique = params["y_unique"]


if __name__ == "__main__":
    # A small example on how to use it.
    X = np.array(
        [
            [1, 2, 3],
            [2, np.nan, 4],
            [np.nan, 2, 6],
            [3, 7, np.nan],
            [4, 3, 3],
            [5, np.nan, 5],
            [np.nan, 4, 7],
            [6, 5, np.nan],
        ])
    y = np.array([1, 0, 1, 0, 1, 0, 1, 0])
    clf = GaussianNBWithMissingValues()
    clf.fit(X, y)
    print(clf.predict(X))
    print(clf.score(X, y))
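
    # A small extension of the example, sketching how the per-class
    # probabilities and a get_params/set_params round trip can be used
    # (the variable name clf2 is only illustrative).
    print(clf.predict_proba(X))
    clf2 = GaussianNBWithMissingValues(**clf.get_params())
    print(clf2.predict(X))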