Skip to content

Instantly share code, notes, and snippets.

@puraminy
Last active December 24, 2019 20:27
Show Gist options
  • Save puraminy/6b2b1e6be77fd28531b0f85b485d01c0 to your computer and use it in GitHub Desktop.
Save puraminy/6b2b1e6be77fd28531b0f85b485d01c0 to your computer and use it in GitHub Desktop.
LDA (Linear Discriminant Analysis)
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
# Load the Fashion-MNIST feature and label CSVs from Google Drive.
path = 'drive/My Drive/FMNIST/'
x_train, y_train, x_test, y_test = (
    pd.read_csv(path + name)
    for name in ('Train_Data.csv', 'Train_Labels.csv',
                 'Test_Data.csv', 'Test_Labels.csv')
)
def preprocess(data, scaler=None):
    """Min-max scale *data* into the range [0, 1].

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Raw feature matrix.
    scaler : fitted ``MinMaxScaler``, optional
        If given, only ``transform`` is applied.  Pass the scaler fitted
        on the training split when scaling the test split, so test-set
        statistics do not leak into the scaling.  When omitted a fresh
        scaler is fitted on *data* (the original behaviour).

    Returns
    -------
    numpy.ndarray
        The scaled feature matrix.
    """
    if scaler is None:
        # Original behaviour: fit a new scaler on the given data.
        scaler = preprocessing.MinMaxScaler()
        return scaler.fit_transform(data)
    return scaler.transform(data)
# Scale each split to [0, 1].
# NOTE(review): each split is scaled with its own fitted scaler, as in
# the original notebook; strictly, the test split should reuse the
# scaler fitted on the training split — confirm intent.
x_train = preprocess(x_train)
x_test = preprocess(x_test)

# Assemble one DataFrame with an extra 'class' label column so the
# scatter matrices below can be computed per class via groupby.
X = pd.DataFrame(x_train)
y = pd.DataFrame(y_train)
# Derive the feature count from the data instead of hard-coding 182.
X.columns = range(X.shape[1])
X['class'] = y.to_numpy()
df = X
# Number of distinct class labels present.
class_num = df['class'].nunique()
import warnings

# Silence the pandas/numpy warnings emitted while taking column means.
warnings.filterwarnings('ignore')

# One column per class label, holding that class's per-feature mean.
class_means = pd.DataFrame(columns=range(class_num))
for label, group in df.groupby('class'):
    class_means[label] = group.mean()
# The 'class' row is the mean of the label itself, not of a feature.
class_means = class_means.drop(['class'])
class_means
# Within-class scatter matrix:
#   S_W = sum_c sum_{x in class c} (x - m_c)(x - m_c)^T
# The original double Python loop (iterrows + one outer product per
# sample) is replaced by a single vectorised product per class: with
# X_c the centred class matrix, sum_x (x - m_c)(x - m_c)^T == X_c^T X_c.
n_features = df.shape[1] - 1  # every column except the 'class' label
sw = np.zeros((n_features, n_features))
for c, rows in df.groupby('class'):
    centered = rows.drop(['class'], axis=1).to_numpy() - class_means[c].to_numpy()
    sw += centered.T.dot(centered)
# Grand per-feature mean over all samples.
f_means = df.mean()
# print(f_means)
f_means = f_means.drop(['class'])  # the label mean is not a feature

# Between-class scatter matrix:
#   S_B = sum_c n_c (m_c - m)(m_c - m)^T
n_features = len(f_means)  # derive dimensionality instead of hard-coding 182
sb = np.zeros((n_features, n_features))
m = f_means.values.reshape(n_features, 1)  # loop-invariant: hoisted
for c in class_means:
    n = len(df.loc[df['class'] == c].index)  # samples in class c
    mc = class_means[c].values.reshape(n_features, 1)
    sb += n * (mc - m).dot((mc - m).T)
# print("SB=",sb)
# Eigenvectors of S_W^{-1} S_B are the LDA projection directions;
# their eigenvalues measure class separation along each direction.
eigen_values, eigen_vectors = np.linalg.eig(np.linalg.inv(sw).dot(sb))

# Eigenvalues in descending order.
sorted_values = np.sort(eigen_values)[::-1]

# Scree-style plot of the sorted eigenvalue spectrum.
fig = plt.figure(figsize=(10, 8))
plt.plot(sorted_values)
plt.savefig("eigen_values.png")
def top_features(C, values, vectors, sel_features):
    """Project the data matrix onto the leading eigenvectors.

    ``C`` is the (n_samples, n_features) data matrix, ``values`` the
    eigenvalues and ``vectors`` the matching eigenvectors (one per
    column); ``sel_features`` is how many top components to keep.
    Returns the projected data of shape (n_samples, sel_features).
    """
    # Rank eigenvectors by eigenvalue, largest first, keep the top ones.
    ranking = np.argsort(-values)[:sel_features]
    # Only the real part of the selected basis is used for projection.
    basis = np.real(vectors[:, ranking])
    return C.dot(basis)
import matplotlib as mpl

# Drop the label column; only the features are projected.
X = X.drop(['class'], axis=1)

# Two-dimensional LDA embedding of the training data for visualisation.
X_lda = top_features(x_train, eigen_values, eigen_vectors, 2)

from sklearn.preprocessing import LabelEncoder

# Encode class labels as integers so they can drive the colour map.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_train)

# Uncomment to compare it with the LDA package of sklearn
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# lda = LinearDiscriminantAnalysis()
# X_lda = lda.fit_transform(X, y_train)

# Scatter plot of the first two discriminants, coloured by class.
fig = plt.figure(figsize=(10, 8))
plt.scatter(X_lda[:, 0].real,
            X_lda[:, 1].real,
            c=y,
            cmap='rainbow',
            alpha=0.7)
plt.xlabel('LD1')
plt.ylabel('LD2')
fig.savefig('lda.png')
# NOTE(review): this shadows the builtin eval(); the name is kept for
# backward compatibility with the callers below.
def eval(classifier, x_train, y_train, x_test, y_test):
    """Fit *classifier* on the training split and return test accuracy.

    Parameters
    ----------
    classifier : sklearn-style estimator exposing ``fit`` and ``predict``
    x_train, y_train : training features and labels
    x_test, y_test : test features and labels

    Returns
    -------
    float
        Test-set accuracy, rounded to 4 decimal places.
    """
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    # Builtin round() works whether accuracy_score returns a plain
    # Python float or a NumPy scalar (the original `.round(4)` assumed
    # a NumPy scalar).
    return round(float(accuracy_score(y_test, y_pred)), 4)
# Baseline: Gaussian Naive Bayes on the raw (un-projected) features.
print("=================================== Without LDA")
classifier = GaussianNB()
acc = eval(classifier, x_train, y_train, x_test, y_test)
print("acc:", acc)

# Sweep the number of retained LDA components and record the accuracy
# of Gaussian Naive Bayes on each projection.
acc_list = []
print("evaluation using different numbers of LDA components....")
# Derive the feature count from the data rather than hard-coding 182.
n_features = x_train.shape[1]
for sel_features in range(1, n_features):
    LDA_train = top_features(x_train, eigen_values, eigen_vectors, sel_features)
    LDA_test = top_features(x_test, eigen_values, eigen_vectors, sel_features)
    acc_list.append(eval(classifier, LDA_train, y_train, LDA_test, y_test))

# Accuracy as a function of the number of LDA directions kept.
fig = plt.figure(figsize=(10, 8))
plt.plot(acc_list)
plt.ylabel("Accuracy")
plt.xlabel("Number of LDA Eigen vectors")
plt.savefig("acc.png")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment