Last active
December 24, 2019 20:27
-
-
Save puraminy/6b2b1e6be77fd28531b0f85b485d01c0 to your computer and use it in GitHub Desktop.
LDA (Linear Discriminant Analysis)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from sklearn.metrics import confusion_matrix | |
from sklearn.metrics import accuracy_score | |
from sklearn import preprocessing | |
from sklearn.neighbors import KNeighborsClassifier | |
from sklearn.neighbors import RadiusNeighborsClassifier | |
from sklearn.naive_bayes import GaussianNB | |
import matplotlib.pyplot as plt | |
# Load the FMNIST train/test splits (features and labels) from Google Drive.
path = 'drive/My Drive/FMNIST/'
x_train, y_train, x_test, y_test = (
    pd.read_csv(path + name)
    for name in ('Train_Data.csv', 'Train_Labels.csv',
                 'Test_Data.csv', 'Test_Labels.csv')
)
# print(x_train.shape)
def preprocess(data, scaler=None):
    """Scale feature values column-wise into the [0, 1] range.

    Args:
        data: 2-D array-like of raw feature values.
        scaler: optional pre-fitted ``MinMaxScaler``. When provided, it is
            applied with ``transform`` only, so a test split can reuse the
            min/max statistics fitted on the training split. When ``None``
            (the default, preserving the original behavior), a fresh scaler
            is fitted on ``data`` itself.

    Returns:
        numpy array with ``data`` min-max scaled to [0, 1].
    """
    if scaler is not None:
        # Reuse training-set statistics -- keeps train/test scales consistent.
        return scaler.transform(data)
    # NOTE(review): the call sites currently fit a separate scaler on the
    # test set; pass the training scaler via ``scaler=`` to avoid that
    # inconsistency.
    min_max_scaler = preprocessing.MinMaxScaler()
    return min_max_scaler.fit_transform(data)
# Scale both splits to [0, 1].
# NOTE(review): each call fits its OWN MinMaxScaler, so the test set is
# scaled by its own min/max rather than the training set's -- confirm this
# is intended before comparing classifier scores.
x_train = preprocess(x_train)
x_test = preprocess(x_test)
# Rebuild a labeled DataFrame: 182 feature columns plus a 'class' column.
X = pd.DataFrame(x_train)
y = pd.DataFrame(y_train)
#print(X.shape)
#print(y.shape)
X.columns = range(182)  # 182 = feature count, hard-coded throughout this script
X['class'] = y.to_numpy()
# X.head()
df = X
# Number of distinct classes in the label column.
class_num = len(df.groupby('class'))
# print(class_num)
# print(x_train.shape)
import warnings
# Silence pandas/numpy warnings raised by the mean over the 'class' column.
warnings.filterwarnings('ignore')
# Per-class feature means: one column per class label.
class_means = pd.DataFrame(columns=range(class_num))
for c, rows in df.groupby('class'):
    class_means[c] = rows.mean()
# Drop the mean of the 'class' column itself, leaving 182 feature means.
class_means = class_means.drop(['class'])
class_means  # notebook-style display of the means table
# Calculating within-class scatter matrix:
#   S_W = sum_c sum_{x in class c} (x - m_c)(x - m_c)^T
# Vectorized as centered^T @ centered per class, which equals the sum of
# per-row outer products but replaces the original per-sample Python loop
# with one BLAS call per class (same result up to float rounding).
n_features = df.shape[1] - 1  # feature columns, excluding 'class' (182 here)
sw = np.zeros((n_features, n_features))
for c, rows in df.groupby('class'):
    feats = rows.drop(['class'], axis=1).to_numpy()
    centered = feats - class_means[c].to_numpy()  # subtract the class mean m_c
    sw += centered.T.dot(centered)
# Overall (grand) mean of every column across all classes.
f_means = df.mean()
# print(f_means)
# Keep only the 182 feature means; drop the mean of the label column.
f_means = f_means.drop(['class'])
# Calculating Between scatter matrix:
#   S_B = sum_c n_c (m_c - m)(m_c - m)^T, n_c = samples in class c
sb = np.zeros((182,182))
for c in class_means:
    # Number of samples belonging to class c.
    n = len(df.loc[df['class']==c].index)
    # Class mean and grand mean as column vectors (182 x 1).
    mc, m = class_means[c].values.reshape(182,1), f_means.values.reshape(182,1)
    sb += n *(mc - m).dot((mc - m).T)
# print("SB=",sb)
# Eigendecomposition of S_W^{-1} S_B, whose leading eigenvectors are the LDA
# discriminant directions. np.linalg.solve(sw, sb) computes inv(sw) @ sb
# without forming the explicit inverse -- numerically more stable and cheaper
# than np.linalg.inv(sw).dot(sb).
# NOTE(review): the product is non-symmetric, so eig may return complex
# values; downstream code takes the real part before using the vectors.
eigen_values, eigen_vectors = np.linalg.eig(np.linalg.solve(sw, sb))
# Eigenvalues sorted in descending order (negate, sort ascending, negate).
sorted_values = -np.sort(-eigen_values)
# Plot the sorted eigenvalue spectrum to eyeball how many components matter.
fig = plt.figure(figsize=(10, 8))
plt.plot(sorted_values)
plt.savefig("eigen_values.png")
def top_features(C, values, vectors, sel_features):
    """Project data onto the leading eigenvectors.

    C: (n_samples, n_features) data matrix.
    values / vectors: eigendecomposition output, with eigenvectors stored as
    the columns of ``vectors``.
    sel_features: how many top-eigenvalue directions to keep.

    Returns the (n_samples, sel_features) projection of C.
    """
    # Rank eigenvalues from largest to smallest, keep the leading indexes.
    ranking = np.argsort(-values)
    leading = ranking[:sel_features]
    # Take the matching eigenvector columns, discarding any imaginary residue
    # left over from the non-symmetric eigendecomposition.
    basis = np.real(vectors[:, leading])
    # Project the data onto the selected basis.
    return C.dot(basis)
import matplotlib as mpl  # NOTE(review): unused below -- candidate for removal
# Drop the label column from the DataFrame view of the training data.
# NOTE(review): the projection below uses the raw x_train array, not this X,
# so the drop's result is only needed by the commented sklearn comparison.
X= X.drop(['class'], axis=1)
# Project the training data onto the top-2 discriminant directions for plotting.
X_lda = top_features(x_train, eigen_values, eigen_vectors, 2)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Encode labels as 0..k-1 integers so they can drive the scatter colormap.
y = le.fit_transform(y_train)
# Uncomment to compare it with the LDA package of sklearn
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# lda = LinearDiscriminantAnalysis()
# X_lda = lda.fit_transform(X, y_train)
# 2-D scatter of the projected data, colored by class.
fig = plt.figure(figsize=(10,8))
plt.xlabel('LD1')
plt.ylabel('LD2')
plt.scatter(
    X_lda[:,0].real,
    X_lda[:,1].real,
    c=y,
    cmap='rainbow',
    alpha=0.7)
fig.savefig('lda.png')
# evaluate classifier
def eval(classifier, x_train, y_train, x_test, y_test):
    """Fit ``classifier`` on the training split and return its test accuracy.

    Args:
        classifier: any sklearn-style estimator with fit/predict.
        x_train, y_train: training features and labels.
        x_test, y_test: held-out features and labels.

    Returns:
        Test-set accuracy rounded to 4 decimal places.

    NOTE(review): the name shadows the builtin ``eval``; it is kept because
    the call sites below use it, but ``evaluate`` would be a cleaner name.
    """
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    # Use builtin round(): recent scikit-learn returns a plain Python float
    # from accuracy_score, which has no .round() method (the original
    # ``accuracy_score(...).round(4)`` relied on getting a numpy scalar).
    return round(accuracy_score(y_test, y_pred), 4)
print("=================================== Without LDA")
# Baseline: Gaussian Naive Bayes on the full 182-feature scaled data.
classifier = GaussianNB()
acc = eval(classifier, x_train, y_train, x_test, y_test)
print("acc:", acc)
acc_list = []
print("evaluation using different numbers of LDA components....")
# Sweep the number of retained eigenvectors (1..181), projecting both splits
# with the same eigenbasis and re-evaluating the classifier each time.
for sel_features in range(1,182):
    # print(sel_features, end=" ")
    # print(f"=================================== With LDA with top {sel_features} eigen vectors")
    LDA_train = top_features(x_train, eigen_values, eigen_vectors, sel_features)
    LDA_test = top_features(x_test, eigen_values, eigen_vectors, sel_features)
    acc = eval(classifier, LDA_train, y_train, LDA_test, y_test)
    acc_list.append(acc)
# print("acc 25:", acc_list[25])
# Plot accuracy as a function of the number of LDA components kept.
fig = plt.figure(figsize=(10,8))
plt.plot(acc_list)
plt.ylabel("Accuracy")
plt.xlabel("Number of LDA Eigen vectors")
plt.savefig("acc.png")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment