Last active
December 24, 2019 18:28
-
-
Save puraminy/b6808c9ec8ff9a1be715b4572321f162 to your computer and use it in GitHub Desktop.
PCA for feature selection
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from sklearn.metrics import confusion_matrix | |
from sklearn.metrics import accuracy_score | |
from sklearn import preprocessing | |
from sklearn.neighbors import KNeighborsClassifier | |
from sklearn.neighbors import RadiusNeighborsClassifier | |
from sklearn.naive_bayes import GaussianNB | |
import matplotlib.pyplot as plt | |
# Directory that holds the four CSV tables (features and labels for the
# training and the test split).
path = 'drive/My Drive/FMNIST/'
x_train = pd.read_csv(path + 'Train_Data.csv')
y_train = pd.read_csv(path + 'Train_Labels.csv')
x_test = pd.read_csv(path + 'Test_Data.csv')
y_test = pd.read_csv(path + 'Test_Labels.csv')
print(x_train.shape)

# Optional cap on the number of rows kept from every table (set it to a small
# positive integer for a quick trial run).  None keeps all rows; the former
# value of -1 silently discarded the last sample of each table, because
# iloc[:-1] stops one row before the end.
r = None
x_train = x_train.iloc[:r]
y_train = y_train.iloc[:r]
x_test = x_test.iloc[:r]
y_test = y_test.iloc[:r]
def preprocess(data, scaler=None):
    """Scale every feature (column) of ``data`` with a min-max scaler.

    Args:
        data: 2-D array-like, one sample per row and one feature per column.
        scaler: Optional, already fitted scaler object exposing
            ``transform``.  When omitted, a fresh ``MinMaxScaler`` is fitted
            on ``data`` itself, so every value of the result lies in [0, 1].

    Returns:
        The scaled data as returned by the scaler's ``transform``.
    """
    # Work on a plain array so the scaler matches columns by position
    # rather than by DataFrame label.
    data = np.asarray(data)
    if scaler is None:
        scaler = preprocessing.MinMaxScaler().fit(data)
    return scaler.transform(data)


# Learn the per-feature min/max from the training set only and apply that
# same mapping to the test set.  Fitting a second scaler on the test set
# would put the two sets on different scales and leak test statistics into
# the preprocessing.
train_scaler = preprocessing.MinMaxScaler().fit(np.asarray(x_train))
x_train = preprocess(x_train, train_scaler)
x_test = preprocess(x_test, train_scaler)
from numpy import array | |
from numpy import mean | |
from numpy import cov | |
from numpy.linalg import eig | |
def myPCA(A):
    """Center ``A`` column-wise and eigendecompose its covariance matrix.

    Args:
        A: 2-D data of shape (n_samples, n_features).

    Returns:
        A tuple ``(C, values, vectors, M)``:
        ``C`` is ``A`` with the column means removed, ``values`` are the
        real eigenvalues of the feature covariance matrix in ascending
        order, ``vectors`` holds the matching unit eigenvectors as COLUMNS
        (``vectors[:, i]`` belongs to ``values[i]``), and ``M`` is the
        vector of column means that was subtracted.
    """
    # mean of each column (feature)
    M = np.mean(A, axis=0)
    # center columns by subtracting column means
    C = A - M
    # covariance between features; atleast_2d covers the single-feature
    # case, where cov() collapses to a 0-d array
    V = np.atleast_2d(np.cov(C.T))
    # A covariance matrix is symmetric, so use the symmetric solver: unlike
    # the general eig(), it always yields real eigenvalues/eigenvectors and
    # cannot introduce spurious complex parts through round-off.
    values, vectors = np.linalg.eigh(V)
    return C, values, vectors, M
# Run PCA on the scaled training data, then plot the eigenvalue spectrum
# from largest to smallest and save it to disk.
norm_data, values, vectors, train_means = myPCA(x_train)
sorted_values = values[np.argsort(-values)]
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(sorted_values)
fig.savefig("eigen_values.png")
def top_features(C, values, vectors, sel_features):
    """Project centered data onto its leading principal components.

    Args:
        C: Centered data of shape (n_samples, n_features).
        values: 1-D sequence of eigenvalues of the feature covariance matrix.
        vectors: Matrix whose COLUMNS are the eigenvectors, in the layout
            returned by ``numpy.linalg.eig`` / ``eigh`` (``vectors[:, i]``
            belongs to ``values[i]``).
        sel_features: Number of leading components to keep.

    Returns:
        Array with one row per sample and at most ``sel_features`` columns;
        column ``j`` holds the coordinates along the eigenvector with the
        ``j``-th largest eigenvalue.
    """
    values = np.asarray(values)
    vectors = np.asarray(vectors)
    # indexes of the eigenvalues, largest first
    order = np.argsort(-values)
    keep = order[:sel_features]
    # Eigenvectors live in the columns, so pick columns.  Picking rows, as
    # the previous version did, mixes coordinates of different eigenvectors
    # and does not give the principal directions.
    components = vectors[:, keep]
    # coordinates of every sample along the selected directions
    return C.dot(components)
# evaluate classifier
def eval(classifier, x_train, y_train, x_test, y_test):
    """Fit ``classifier`` on the training split and score it on the test split.

    NOTE: the name shadows the built-in ``eval``; it is kept because the
    rest of the script calls the function under this name.

    Args:
        classifier: Estimator object providing ``fit(X, y)`` and ``predict(X)``.
        x_train: Training features.
        y_train: Training labels.
        x_test: Test features.
        y_test: True test labels.

    Returns:
        Accuracy on the test split, rounded to 4 decimal places.
    """
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    # Built-in round() accepts both NumPy scalars and plain Python floats;
    # the .round() method exists only on NumPy scalars and raises
    # AttributeError when accuracy_score returns a plain float.
    acc = round(accuracy_score(y_test, y_pred), 4)
    return acc
# Baseline: Gaussian naive Bayes trained on all scaled features, no PCA.
# The same classifier object is reused by the PCA experiments further down.
classifier = GaussianNB()
print("=================================== Without PCA")
acc = eval(classifier, x_train, y_train, x_test, y_test)
print("acc:", acc)
# Center the test set with the TRAINING means so that both sets are
# expressed relative to the same origin before projection.
norm_test = x_test - train_means

# Largest number of principal components tried in the sweep below.
max_components = 181

print("evaluation using different numbers of pca components....")
# Project both sets once onto the max_components leading eigenvectors.
# top_features orders its output columns by decreasing eigenvalue, so the
# projection onto the top k components is simply the first k columns; this
# avoids redoing the full matrix product on every iteration.
pca_train_all = top_features(norm_data, values, vectors, max_components)
pca_test_all = top_features(norm_test, values, vectors, max_components)

# acc_list[i] is the test accuracy obtained with i + 1 components.
acc_list = []
for sel_features in range(1, max_components + 1):
    pca_train = pca_train_all[:, :sel_features]
    pca_test = pca_test_all[:, :sel_features]
    acc = eval(classifier, pca_train, y_train, pca_test, y_test)
    acc_list.append(acc)
# print("acc 25:", acc_list[25]) | |
# Accuracy as a function of the number of PCA components kept.
fig, ax = plt.subplots(figsize=(10, 8))
# acc_list[i] was measured with i + 1 components, so the x values start at 1;
# plotting the bare list would shift the whole curve one component left.
ax.plot(range(1, len(acc_list) + 1), acc_list)
ax.set_ylabel("Accuracy")
ax.set_xlabel("Number of PCA Eigen vectors")
fig.savefig("acc.png")
from sklearn.preprocessing import LabelEncoder

# Scatter plot of the training samples on their first two PCA coordinates,
# colored by the integer-encoded training label.
le = LabelEncoder()
y = le.fit_transform(y_train)
pca_train = top_features(norm_data, values, vectors, 2)

fig, ax = plt.subplots(figsize=(10, 8))
ax.set_xlabel('PCA1')
ax.set_ylabel('PCA2')
first_axis = pca_train[:, 0]
second_axis = pca_train[:, 1]
ax.scatter(first_axis, second_axis, c=y, cmap='rainbow', alpha=0.7)
fig.savefig('pca.png')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment