Last active
June 20, 2019 13:52
-
-
Save Aunsiels/9afe9fa86c1f24d7abd97d017685baf3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# An implementation of the Gaussian naive Bayes classifier with missing values.
# It has more or less the same interface as a scikit-learn classifier.
import collections | |
import numpy as np | |
import math | |
def get_prior(y, y_unique):
    """Return a Counter mapping each class label to its empirical frequency in y.

    Parameters:
        y: (n_samples,) array of class labels.
        y_unique: array of the distinct labels whose counts get normalized.
    """
    n_samples = y.shape[0]
    frequencies = collections.Counter(y)
    for label in y_unique:
        frequencies[label] = frequencies[label] / n_samples
    return frequencies
def get_means_and_variances(X, y, y_unique):
    """Compute per-class, per-feature means and sample variances, ignoring NaNs.

    Parameters:
        X: (n_samples, n_features) array; NaN marks a missing value.
        y: (n_samples,) array of class labels.
        y_unique: (n_classes,) array of the distinct class labels.

    Returns:
        (means, variances): two (n_classes, n_features) arrays. A cell with
        fewer than two observed (non-NaN) values is left at mean 0 and
        variance 0 (get_gaussian treats variance 0 as "no information").
    """
    n_features = X.shape[1]
    n_classes = y_unique.shape[0]
    # Start from zeros: zero is the documented fallback for sparse cells,
    # so only cells with enough data need to be written.
    means = np.zeros((n_classes, n_features))
    variances = np.zeros((n_classes, n_features))
    # Rows of X belonging to each class, keyed by label.
    per_class = {label: X[y == label] for label in y_unique}
    for i in range(n_classes):
        rows = per_class[y_unique[i]]
        for j in range(n_features):
            observed = rows[:, j]
            observed = observed[~np.isnan(observed)]
            if observed.shape[0] > 1:
                means[i, j] = np.mean(observed)
                # ddof=1 is the unbiased sample variance, replacing the
                # original's manual n/(n-1) Bessel correction.
                variances[i, j] = np.var(observed, ddof=1)
    return means, variances
def get_gaussian(x, mean, variance):
    """Evaluate the N(mean, variance) probability density at x.

    `variance` is already sigma^2 (it comes from np.var in
    get_means_and_variances). The original squared it again, which computed
    1/sqrt(2*pi*sigma^4) * exp(-(x-mean)^2 / (2*sigma^4)) — a wrong density.

    Returns 1.0 for variance == 0 so a degenerate feature contributes a
    neutral factor to the likelihood product.
    """
    if variance == 0:
        return 1.0
    return math.exp(-(x - mean) ** 2 / (2.0 * variance)) / \
        math.sqrt(2 * math.pi * variance)
def get_all_likelihoods(x, means, variances):
    """Per-class likelihood of sample x under the naive (independence) model.

    Multiplies the per-feature Gaussian densities, skipping features whose
    value in x is NaN (missing). Returns an array of shape (n_classes,).
    """
    n_classes, n_features = means.shape
    # Indices of the features actually observed for this sample.
    observed = [j for j in range(n_features) if not np.isnan(x[j])]
    likelihoods = np.empty(n_classes)
    for c in range(n_classes):
        acc = 1.0
        for j in observed:
            acc *= get_gaussian(x[j], means[c, j], variances[c, j])
        likelihoods[c] = acc
    return likelihoods
def get_all_posterior(likelihoods, prior):
    """Turn per-class likelihoods and priors into posterior probabilities.

    Computes likelihood[i] * prior[i] for every class and normalizes by the
    total evidence. If the total is zero the unnormalized (all-zero) vector
    is returned unchanged.
    """
    weighted = np.array([lk * prior[i] for i, lk in enumerate(likelihoods)])
    evidence = weighted.sum()
    if evidence != 0:
        weighted /= evidence
    return weighted
class GaussianNBWithMissingValues:
    """Gaussian naive Bayes classifier that tolerates NaN (missing) features.

    Mirrors the basic scikit-learn estimator interface:
    fit / predict / predict_proba / score / get_params / set_params.
    """

    def __init__(self, **params):
        # Either restore a previously exported state, or start untrained.
        if params:
            self.set_params(params)
        else:
            self.means = None      # (n_classes, n_features) per-class means
            self.variances = None  # (n_classes, n_features) per-class variances
            self.prior = None      # (n_classes,) class priors, ordered like y_unique
            self.y_unique = None   # sorted distinct class labels

    def fit(self, X, y):
        """Estimate priors, means and variances from (X, y); return self."""
        self.y_unique = np.unique(y)
        # get_prior returns a Counter keyed by class LABEL, but
        # get_all_posterior indexes the prior POSITIONALLY. The original
        # stored the Counter directly, which silently yielded prior 0 for any
        # label set other than 0..k-1 (Counter returns 0 for missing keys).
        # Convert to an array ordered like y_unique so indexing is correct.
        prior_counts = get_prior(y, self.y_unique)
        self.prior = np.array([prior_counts[label] for label in self.y_unique])
        self.means, self.variances = get_means_and_variances(X, y, self.y_unique)
        return self

    def predict_proba(self, X):
        """Return an (n_samples, n_classes) array of posterior probabilities."""
        likelihoods = np.apply_along_axis(
            lambda x: get_all_likelihoods(x, self.means, self.variances), 1, X)
        return np.apply_along_axis(
            lambda lk: get_all_posterior(lk, self.prior), 1, likelihoods)

    def predict(self, X):
        """Return the most probable class label for each row of X."""
        proba = self.predict_proba(X)
        indexes = np.argmax(proba, axis=1)
        return np.array([self.y_unique[index] for index in indexes])

    def score(self, X, y):
        """Return the classification accuracy on (X, y)."""
        predictions = self.predict(X)
        return sum(predictions == y) / y.shape[0]

    def get_params(self, deep=True):
        """Export the fitted state (the key is historically named 'variance')."""
        if self.means is None:
            return {"means": None,
                    "variance": None,
                    "prior": None,
                    "y_unique": None
                    }
        return {"means": self.means.copy(),
                "variance": self.variances.copy(),
                "prior": self.prior.copy(),
                "y_unique": self.y_unique.copy()
                }

    def set_params(self, params):
        """Restore state previously produced by get_params."""
        self.means = params["means"]
        # Fixed: the original assigned to self.variance (typo), so restoring
        # a model never actually updated self.variances.
        self.variances = params["variance"]
        self.prior = params["prior"]
        self.y_unique = params["y_unique"]
if __name__ == "__main__":  # fixed: original tested `__init__ == "__main__"`
    # A small example of how to use the classifier.
    X = np.array(
        [
            [1, 2, 3],
            [2, np.nan, 4],
            [np.nan, 2, 6],
            [3, 7, np.nan],
            [4, 3, 3],
            [5, np.nan, 5],
            [np.nan, 4, 7],
            [6, 5, np.nan],
        ])
    y = np.array([1, 0, 1, 0, 1, 0, 1, 0])
    # Fixed: the original used `clf` without ever constructing it.
    clf = GaussianNBWithMissingValues()
    clf.fit(X, y)
    print(clf.predict(X))
    print(clf.score(X, y))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment