Last active
June 20, 2019 13:52
-
-
Save Aunsiels/9afe9fa86c1f24d7abd97d017685baf3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# An implementation of the Gaussian naive Bayes classifier with missing values.
# It has more or less the same interface as a scikit-learn classifier.
import collections | |
import numpy as np | |
import math | |
def get_prior(y, y_unique):
    """Return a Counter mapping each class label to its empirical frequency in y.

    Parameters:
        y: (n_samples,) array of class labels.
        y_unique: array of the distinct labels whose counts get normalized.
    """
    n_samples = y.shape[0]
    frequencies = collections.Counter(y)
    for label in y_unique:
        frequencies[label] = frequencies[label] / n_samples
    return frequencies
def get_means_and_variances(X, y, y_unique):
    """Compute per-class, per-feature means and sample variances, ignoring NaNs.

    Parameters:
        X: (n_samples, n_features) array; NaN marks a missing value.
        y: (n_samples,) array of class labels.
        y_unique: (n_classes,) array of the distinct class labels.

    Returns:
        (means, variances): two (n_classes, n_features) arrays. A cell with
        fewer than two observed (non-NaN) values is left at mean 0 and
        variance 0 (get_gaussian treats variance 0 as "no information").
    """
    n_features = X.shape[1]
    n_classes = y_unique.shape[0]
    # Start from zeros: zero is the documented fallback for sparse cells,
    # so only cells with enough data need to be written.
    means = np.zeros((n_classes, n_features))
    variances = np.zeros((n_classes, n_features))
    # Rows of X belonging to each class, keyed by label.
    per_class = {label: X[y == label] for label in y_unique}
    for i in range(n_classes):
        rows = per_class[y_unique[i]]
        for j in range(n_features):
            observed = rows[:, j]
            observed = observed[~np.isnan(observed)]
            if observed.shape[0] > 1:
                means[i, j] = np.mean(observed)
                # ddof=1 is the unbiased sample variance, replacing the
                # original's manual n/(n-1) Bessel correction.
                variances[i, j] = np.var(observed, ddof=1)
    return means, variances
def get_gaussian(x, mean, variance):
    """Evaluate the N(mean, variance) probability density at x.

    `variance` is already sigma^2 (it comes from np.var in
    get_means_and_variances). The original squared it again, which computed
    1/sqrt(2*pi*sigma^4) * exp(-(x-mean)^2 / (2*sigma^4)) — a wrong density.

    Returns 1.0 for variance == 0 so a degenerate feature contributes a
    neutral factor to the likelihood product.
    """
    if variance == 0:
        return 1.0
    return math.exp(-(x - mean) ** 2 / (2.0 * variance)) / \
        math.sqrt(2 * math.pi * variance)
def get_all_likelihoods(x, means, variances):
    """Per-class likelihood of sample x under the naive (independence) model.

    Multiplies the per-feature Gaussian densities, skipping features whose
    value in x is NaN (missing). Returns an array of shape (n_classes,).
    """
    n_classes, n_features = means.shape
    # Indices of the features actually observed for this sample.
    observed = [j for j in range(n_features) if not np.isnan(x[j])]
    likelihoods = np.empty(n_classes)
    for c in range(n_classes):
        acc = 1.0
        for j in observed:
            acc *= get_gaussian(x[j], means[c, j], variances[c, j])
        likelihoods[c] = acc
    return likelihoods
def get_all_posterior(likelihoods, prior):
    """Turn per-class likelihoods and priors into posterior probabilities.

    Computes likelihood[i] * prior[i] for every class and normalizes by the
    total evidence. If the total is zero the unnormalized (all-zero) vector
    is returned unchanged.
    """
    weighted = np.array([lk * prior[i] for i, lk in enumerate(likelihoods)])
    evidence = weighted.sum()
    if evidence != 0:
        weighted /= evidence
    return weighted
class GaussianNBWithMissingValues:
    """Gaussian naive Bayes classifier that tolerates NaN (missing) features.

    Mirrors the basic scikit-learn estimator interface:
    fit / predict / predict_proba / score / get_params / set_params.
    """

    def __init__(self, **params):
        # Either restore a previously exported state, or start untrained.
        if params:
            self.set_params(params)
        else:
            self.means = None      # (n_classes, n_features) per-class means
            self.variances = None  # (n_classes, n_features) per-class variances
            self.prior = None      # (n_classes,) class priors, ordered like y_unique
            self.y_unique = None   # sorted distinct class labels

    def fit(self, X, y):
        """Estimate priors, means and variances from (X, y); return self."""
        self.y_unique = np.unique(y)
        # get_prior returns a Counter keyed by class LABEL, but
        # get_all_posterior indexes the prior POSITIONALLY. The original
        # stored the Counter directly, which silently yielded prior 0 for any
        # label set other than 0..k-1 (Counter returns 0 for missing keys).
        # Convert to an array ordered like y_unique so indexing is correct.
        prior_counts = get_prior(y, self.y_unique)
        self.prior = np.array([prior_counts[label] for label in self.y_unique])
        self.means, self.variances = get_means_and_variances(X, y, self.y_unique)
        return self

    def predict_proba(self, X):
        """Return an (n_samples, n_classes) array of posterior probabilities."""
        likelihoods = np.apply_along_axis(
            lambda x: get_all_likelihoods(x, self.means, self.variances), 1, X)
        return np.apply_along_axis(
            lambda lk: get_all_posterior(lk, self.prior), 1, likelihoods)

    def predict(self, X):
        """Return the most probable class label for each row of X."""
        proba = self.predict_proba(X)
        indexes = np.argmax(proba, axis=1)
        return np.array([self.y_unique[index] for index in indexes])

    def score(self, X, y):
        """Return the classification accuracy on (X, y)."""
        predictions = self.predict(X)
        return sum(predictions == y) / y.shape[0]

    def get_params(self, deep=True):
        """Export the fitted state (the key is historically named 'variance')."""
        if self.means is None:
            return {"means": None,
                    "variance": None,
                    "prior": None,
                    "y_unique": None
                    }
        return {"means": self.means.copy(),
                "variance": self.variances.copy(),
                "prior": self.prior.copy(),
                "y_unique": self.y_unique.copy()
                }

    def set_params(self, params):
        """Restore state previously produced by get_params."""
        self.means = params["means"]
        # Fixed: the original assigned to self.variance (typo), so restoring
        # a model never actually updated self.variances.
        self.variances = params["variance"]
        self.prior = params["prior"]
        self.y_unique = params["y_unique"]
if __name__ == "__main__":  # fixed: original tested `__init__ == "__main__"`
    # A small example of how to use the classifier.
    X = np.array(
        [
            [1, 2, 3],
            [2, np.nan, 4],
            [np.nan, 2, 6],
            [3, 7, np.nan],
            [4, 3, 3],
            [5, np.nan, 5],
            [np.nan, 4, 7],
            [6, 5, np.nan],
        ])
    y = np.array([1, 0, 1, 0, 1, 0, 1, 0])
    # Fixed: the original used `clf` without ever constructing it.
    clf = GaussianNBWithMissingValues()
    clf.fit(X, y)
    print(clf.predict(X))
    print(clf.score(X, y))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment