PCA for feature selection
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
# data path (assumes the FMNIST CSVs are on a mounted Google Drive, e.g. in Colab)
path = 'drive/My Drive/FMNIST/'
x_train = pd.read_csv(path+'Train_Data.csv')
y_train = pd.read_csv(path+'Train_Labels.csv')
x_test = pd.read_csv(path+'Test_Data.csv')
y_test = pd.read_csv(path+'Test_Labels.csv')
print(x_train.shape)

r = -1  # keep all rows except the last; set r to a small positive
        # number to experiment on a subset of the data
x_train = x_train.iloc[:r]
y_train = y_train.iloc[:r]
x_test = x_test.iloc[:r]
y_test = y_test.iloc[:r]
# sklearn expects 1-D label arrays, so flatten the label DataFrames
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()
def preprocess(train, test):
    # scale each feature to [0, 1]; fit the scaler on the training data
    # only and reuse it on the test data so no test statistics leak in
    min_max_scaler = preprocessing.MinMaxScaler()
    train_minmax = min_max_scaler.fit_transform(train)
    test_minmax = min_max_scaler.transform(test)
    return train_minmax, test_minmax

x_train, x_test = preprocess(x_train, x_test)
def myPCA(A):
    # calculate the mean of each column (feature)
    M = np.mean(A, axis=0)
    # center the columns by subtracting the column means
    C = A - M
    # covariance matrix of the centered data
    V = np.cov(C.T)
    # eigendecomposition; eigh suits the symmetric covariance matrix
    # and guarantees real eigenvalues and eigenvectors
    values, vectors = np.linalg.eigh(V)
    return C, values, vectors, M
# sort the eigenvalues in descending order and plot them
norm_data, values, vectors, train_means = myPCA(x_train)
sorted_values = np.sort(values)[::-1]
fig = plt.figure(figsize=(10, 8))
plt.plot(sorted_values)
plt.ylabel("Eigenvalue")
plt.xlabel("Component index")
plt.savefig("eigen_values.png")
def top_features(C, values, vectors, sel_features):
    # sort the eigenvalue indexes in descending order
    sorted_indexes = np.argsort(-values)
    # keep the indexes of the top sel_features eigenvalues
    sel_indexes = sorted_indexes[:sel_features]
    # select the corresponding eigenvectors; numpy returns eigenvectors
    # as columns, so index columns rather than rows
    sel_vectors = vectors[:, sel_indexes]
    # project the centered data onto the selected eigenvectors
    P = C.dot(sel_vectors)
    return P
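# Sanity check (an assumption, not part of the original gist): the
# projection should match sklearn's PCA up to the sign of each
# component, since eigenvectors are only defined up to sign.
from sklearn.decomposition import PCA
sk_proj = PCA(n_components=5).fit_transform(x_train)
my_proj = top_features(norm_data, values, vectors, 5)
print("matches sklearn PCA:",
      np.allclose(np.abs(sk_proj), np.abs(my_proj), atol=1e-5))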
# evaluate a classifier: fit on the training set, score on the test set
def evaluate(classifier, x_train, y_train, x_test, y_test):
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    acc = round(accuracy_score(y_test, y_pred), 4)
    return acc
print("=================================== Without PCA")
classifier = GaussianNB()
acc = eval(classifier, x_train, y_train, x_test, y_test)
print("acc:", acc)
# center the test data with the training means, not its own means
norm_test = x_test - train_means
acc_list = []
print("evaluation using different numbers of PCA components....")
for sel_features in range(1, 182):
    pca_train = top_features(norm_data, values, vectors, sel_features)
    pca_test = top_features(norm_test, values, vectors, sel_features)
    acc = evaluate(classifier, pca_train, y_train, pca_test, y_test)
    acc_list.append(acc)
# print("acc 25:", acc_list[25])
fig = plt.figure(figsize=(10,8))
plt.plot(acc_list)
plt.ylabel("Accuracy")
plt.xlabel("Number of PCA Eigen vectors")
plt.savefig("acc.png")
# encode the labels as consecutive integers for the scatter-plot colors
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y_train)
# scatter plot of the training data projected onto the top two components
pca_train = top_features(norm_data, values, vectors, 2)
fig = plt.figure(figsize=(10, 8))
plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.scatter(
    pca_train[:, 0],
    pca_train[:, 1],
    c=y,
    cmap='rainbow',
    alpha=0.7)
fig.savefig('pca.png')