Last active
December 19, 2019 21:08
-
-
Save puraminy/c1b0a96218eed1bd2bfc7aedbcff0961 to your computer and use it in GitHub Desktop.
Gaussian Bayes classifier (per-class multivariate normal) for the Pen-Digits dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from scipy.stats import multivariate_normal | |
from sklearn.feature_selection import VarianceThreshold | |
from sklearn.metrics import accuracy_score | |
import seaborn as sn | |
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score | |
import matplotlib.pyplot as plt | |
from sklearn import preprocessing | |
def plot_confusion_matrix(cm, class_names, normalize=False, annotate=True, title='Confusion Matrix'):
    """Render a confusion matrix as a seaborn heatmap.

    Parameters
    ----------
    cm : array of shape (n_classes, n_classes)
        Confusion matrix (counts).
    class_names : sequence
        Tick labels for both axes.
    normalize : bool
        If True, row-normalize so each row sums to 1 (per-true-class rates).
    annotate : bool
        Whether to write the cell values inside the heatmap.
    title : str
        Figure title.
    """
    if normalize:
        # Divide each row by its total; newaxis keeps broadcasting row-wise.
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.figure(figsize=(10, 10))
    plt.title(title)
    sn.set(font_scale=1.4)  # for label size
    # Renamed from `format` to avoid shadowing the builtin; counts print as
    # integers, normalized rates with two decimals.
    fmt = '.2f' if normalize else 'd'
    sn.heatmap(cm, annot=annotate, fmt=fmt, annot_kws={"size": 12}, cmap="YlGnBu",
               xticklabels=class_names, yticklabels=class_names)
    plt.show()
def sigma_pool(inp_list, lam, labels=None, n_classes=None):
    """Shrink each per-class covariance toward the pooled covariance.

    The pooled covariance is the standard within-class estimate
    sum_i (n_i - 1) * Sigma_i / (N - K). Each returned matrix is the
    convex combination (1 - lam) * Sigma_i + lam * Sigma_pooled.

    Parameters
    ----------
    inp_list : list of (d, d) arrays
        Per-class covariance matrices, indexed by class id.
    lam : float in [0, 1]
        Shrinkage weight toward the pooled covariance.
    labels : array-like, optional
        Class label per training sample; defaults to the module-level
        `train_labels` (original behavior).
    n_classes : int, optional
        Number of classes; defaults to the module-level `class_num`.

    Returns
    -------
    list of (d, d) arrays, one smoothed covariance per class.
    """
    y = train_labels if labels is None else np.asarray(labels)
    k = class_num if n_classes is None else n_classes
    # Weight each covariance by its class sample count minus one (unbiased).
    weighted = [((y == i).sum() - 1) * inp_list[i] for i in range(k)]
    pooled = np.sum(weighted, axis=0) / (len(y) - k)
    return [(1 - lam) * sigma + lam * pooled for sigma in inp_list]
def estimate(data, labels, smooth=False, n_classes=None):
    """Estimate per-class Gaussian parameters (mean vector, covariance matrix).

    Parameters
    ----------
    data : array of shape (n_samples, n_features)
    labels : array-like of int
        Class label per sample, values in range(n_classes).
    smooth : bool
        If True, shrink each covariance toward the pooled covariance via
        `sigma_pool` (which reads the module-level `train_labels`).
    n_classes : int, optional
        Number of classes; defaults to the module-level `class_num`
        (original behavior).

    Returns
    -------
    (mus, sigmas) : two lists indexed by class id.
    """
    k = class_num if n_classes is None else n_classes
    # Build the lists by appending instead of pre-allocating with
    # `[[]] * k`, which aliases one shared list object.
    mus = []
    sigmas = []
    for c in range(k):
        class_data = data[np.where(labels == c)]
        mus.append(np.mean(class_data, axis=0))
        # np.cov expects variables in rows, hence the transpose.
        sigmas.append(np.cov(class_data.transpose()))
    if smooth:
        sigmas = sigma_pool(sigmas, 0.1)
    return mus, sigmas
def classify(data, class_mus=None, class_sigmas=None, class_priors=None, n_classes=None):
    """Assign each sample to the class with the highest (unnormalized) posterior.

    The posterior for class i is N(x; mu_i, Sigma_i) * prior_i; the shared
    evidence term is omitted since argmax is unaffected.

    Parameters
    ----------
    data : array of shape (n_samples, n_features)
    class_mus, class_sigmas, class_priors : sequences, optional
        Per-class parameters; each defaults to the module-level
        `mus` / `sigmas` / `priors` (original behavior).
    n_classes : int, optional
        Number of classes; defaults to the module-level `class_num`.

    Returns
    -------
    array of shape (n_samples,) with the predicted class index per sample.
    """
    m = mus if class_mus is None else class_mus
    s = sigmas if class_sigmas is None else class_sigmas
    p = priors if class_priors is None else class_priors
    k = class_num if n_classes is None else n_classes
    # One likelihood*prior vector per class; transpose to (n_samples, k).
    posteriors = np.transpose(
        [multivariate_normal.pdf(data, m[i], s[i]) * p[i] for i in range(k)]
    )
    return np.argmax(posteriors, axis=1)
def get_priors(labels, n_classes=None):
    """Compute empirical class priors: the fraction of samples per class.

    Parameters
    ----------
    labels : array-like of int
        Class label per sample, values in range(n_classes).
    n_classes : int, optional
        Number of classes; defaults to the module-level `class_num`
        (original behavior).

    Returns
    -------
    list of float of length n_classes, summing to 1 when every label
    falls in range(n_classes).
    """
    k = class_num if n_classes is None else n_classes
    n = len(labels)
    return [float((labels == i).sum()) / n for i in range(k)]
def normalize(data):
    """Min-max scale every feature of `data` into the [0, 1] range."""
    scaler = preprocessing.MinMaxScaler()
    return scaler.fit_transform(data)
import pandas as pd

# Load the training and test splits of the UCI Pen-Digits dataset.
# BUG FIX: the original read "pendigits.tra" twice, so the classifier was
# evaluated on its own training data; the test split is "pendigits.tes".
# BUG FIX: the original sliced iloc[:, 0:15], silently dropping feature
# column 15 — the 16 features live in columns 0-15, the label in column 16.
data = pd.read_csv("pendigits.tra")
train_data = data.iloc[:, 0:16]
train_labels = data.iloc[:, 16]
data = pd.read_csv("pendigits.tes")
test_data = data.iloc[:, 0:16]
test_labels = data.iloc[:, 16]
class_num = 10  # digits 0-9

# Scale features to [0, 1]. Fit the scaler on the training data only and
# reuse it on the test data; the original fit an independent scaler per
# split, so the two splits were not on the same scale.
scaler = preprocessing.MinMaxScaler()
train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)

print("splitting the data for any class..")
mus, sigmas = estimate(train_data, train_labels, smooth=False)
print("data splitted.")
# priors based on data counts
priors = get_priors(train_labels)
# probablities for all classes
print("classifying...")
predicted_labels = classify(test_data)
# acc
acc = accuracy_score(test_labels, predicted_labels)
print(acc)
# sorted() gives a deterministic label order; set iteration order is not
# guaranteed.
class_names = sorted(set(test_labels))
print(class_names)
cm = confusion_matrix(test_labels, predicted_labels, labels=class_names)
cr = classification_report(test_labels, predicted_labels)
plot_confusion_matrix(cm, class_names, True)
print(cr)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment