# Gaussian Bayes classifier for the UCI pendigits handwritten-digit dataset.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from scipy.stats import multivariate_normal
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
def plot_confusion_matrix(cm, class_names, normalize=False, annotate=True, title='Confusion Matrix'):
    if normalize:
        # convert raw counts to per-class rates (each row sums to 1)
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.figure(figsize=(10, 10))
    plt.title(title)
    sn.set(font_scale=1.4)  # label size
    fmt = '.2f' if normalize else 'd'
    sn.heatmap(cm, annot=annotate, fmt=fmt, annot_kws={"size": 12}, cmap="YlGnBu",
               xticklabels=class_names, yticklabels=class_names)
    plt.show()
def sigma_pool(inp_list, labels, lam):
    # pooled (within-class) covariance: average of the per-class covariances,
    # weighted by each class's degrees of freedom (n_i - 1)
    s_p = [((labels == i).sum() - 1) * inp_list[i] for i in range(class_num)]
    s_p = np.sum(s_p, axis=0) / (len(labels) - class_num)
    # shrink each class covariance toward the pooled one by factor lam
    return [(1 - lam) * item + lam * s_p for item in inp_list]
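
# Quick sanity check of the shrinkage behaviour (an illustrative sketch, not
# part of the original gist): with lam=0 the per-class covariances come back
# unchanged, and with lam=1 every class gets the pooled covariance. Here each
# toy class has 3 samples, so the pooled matrix is (2*I + 2*2I)/4 = 1.5*I.
#
#   class_num = 2
#   toy_labels = np.array([0, 0, 0, 1, 1, 1])
#   toy_covs = [np.eye(2), 2 * np.eye(2)]
#   print(sigma_pool(toy_covs, toy_labels, 0.0))  # -> [eye(2), 2*eye(2)]
#   print(sigma_pool(toy_covs, toy_labels, 1.0))  # -> both 1.5*eye(2)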
def estimate(data, labels, smooth=False):
    # per-class maximum-likelihood estimates of the Gaussian parameters
    mus = [None] * class_num
    sigmas = [None] * class_num
    split_data = [data[np.where(labels == l)] for l in range(class_num)]
    for i, class_data in enumerate(split_data):
        mus[i] = np.mean(class_data, axis=0)
        sigmas[i] = np.cov(class_data.transpose())
    if smooth:
        # regularize each covariance by shrinking it toward the pooled one
        sigmas = sigma_pool(sigmas, labels, 0.1)
    return mus, sigmas
def classify(data):
    # Bayes decision rule: pick argmax_i p(x | class i) * P(class i)
    # (univariate analogue: scipy.stats.norm(0, 1).pdf)
    posteriors = [multivariate_normal.pdf(data, mus[i], sigmas[i]) * priors[i]
                  for i in range(class_num)]
    posteriors = np.transpose(posteriors)
    data_labels = np.argmax(posteriors, axis=1)
    return data_labels
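
# For numerical robustness one might work in log space instead (a sketch, not
# part of the original gist): multivariate_normal.logpdf avoids underflow when
# the 16-dimensional densities become very small, and np.log(priors) turns the
# product into a sum. classify_log is a hypothetical drop-in variant of
# classify and relies on the same mus/sigmas/priors globals.
def classify_log(data):
    log_posteriors = [multivariate_normal.logpdf(data, mus[i], sigmas[i])
                      + np.log(priors[i]) for i in range(class_num)]
    return np.argmax(np.transpose(log_posteriors), axis=1)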
def get_priors(labels):
    # empirical class priors: fraction of training samples in each class
    priors = []
    n = len(labels)
    for i in range(class_num):
        priors.append(float((labels == i).sum()) / n)
    return priors
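
# An equivalent vectorized one-liner (a sketch using numpy, not in the
# original gist), assuming the labels are non-negative integers:
#
#   priors = np.bincount(labels, minlength=class_num) / len(labels)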
def normalize(train, test):
    # scale features to [0, 1]; fit the scaler on the training data only,
    # then apply the same transform to the test data to avoid leakage
    min_max_scaler = preprocessing.MinMaxScaler()
    return min_max_scaler.fit_transform(train), min_max_scaler.transform(test)
data = pd.read_csv("pendigits.tra", header=None)  # UCI files have no header row
train_data = data.iloc[:, 0:16]   # 16 features per sample
train_labels = data.iloc[:, 16]   # digit label in the last column
data = pd.read_csv("pendigits.tes", header=None)
test_data = data.iloc[:, 0:16]
test_labels = data.iloc[:, 16]
class_num = 10  # digits 0-9

# scale both splits to [0, 1] with a scaler fit on the training set only
train_data, test_data = normalize(train_data, test_data)
print("splitting the data for any class..")
mus, sigmas = estimate(train_data, train_labels, smooth=False)
print("data splitted.")
# priors based on data counts
priors = get_priors(train_labels)
# posterior probabilities for all classes
print("classifying...")
predicted_labels = classify(test_data)
# accuracy on the test set
acc = accuracy_score(test_labels, predicted_labels)
print(acc)
class_names = sorted(set(test_labels))
print(class_names)
cm = confusion_matrix(test_labels, predicted_labels, labels=class_names)
cr = classification_report(test_labels, predicted_labels)
plot_confusion_matrix(cm, class_names, normalize=True)
print(cr)
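
# The gist defines sigma_pool but never exercises it; a short follow-up (an
# illustrative sketch, not part of the original run) re-estimates the
# parameters with pooled-covariance smoothing and reports accuracy on the
# same test split for comparison.
mus, sigmas = estimate(train_data, train_labels, smooth=True)
smoothed_acc = accuracy_score(test_labels, classify(test_data))
print("accuracy with pooled-covariance smoothing:", smoothed_acc)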