puraminy/pca_feature_selection.py

## pca_feature_selection.py
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt

path = 'drive/My Drive/FMNIST/'
x_train = pd.read_csv(path+'Train_Data.csv')
y_train = pd.read_csv(path+'Train_Labels.csv')

x_test = pd.read_csv(path+'Test_Data.csv')
y_test =  pd.read_csv(path+'Test_Labels.csv')
print(x_train.shape)

# x_train.shape

r = -1
x_train = x_train.iloc[:r]
y_train = y_train.iloc[:r]
x_test = x_test.iloc[:r]
y_test = y_test.iloc[:r]

def preprocess(data):
    # in this part we scale data between [0.1]
    min_max_scaler = preprocessing.MinMaxScaler()
    x_train_minmax = min_max_scaler.fit_transform(data)
    return  x_train_minmax

x_train = preprocess(x_train)
x_test = preprocess(x_test)

from numpy import array
from numpy import mean
from numpy import cov
from numpy.linalg import eig

def myPCA(A):
  # calculate the mean of each column
  M = mean(A.T, axis=1)
  # center columns by subtracting column means
  C = A - M
  # calculate covariance matrix of centered matrix
  V = cov(C.T)
  # eigendecomposition of covariance matrix
  values, vectors = eig(V)
  return C, values, vectors, M

# sorting
norm_data, values, vectors, train_means = myPCA(x_train)
sorted_values = -np.sort(-values)

# plot sorted eigen values
fig = plt.figure(figsize=(10,8))
plt.plot(sorted_values)
plt.savefig("eigen_values.png")

def top_features(C, values, vectors, sel_features):
  # sort eigen values indexes in descending order
  sorted_indexes=np.argsort(-values)
  # select top indexes (sel_features)
  sel_indexes = sorted_indexes[:sel_features]
  # select correspoidings eigen vectors
  sel_vectors=vectors[sel_indexes, :]
  # project data onto selected eigen vectors
  P = C.dot(sel_vectors.T)
  return P

# evalute classifer
def eval(classifier, x_train, y_train, x_test, y_test):
  classifier.fit(x_train, y_train)
  y_pred = classifier.predict(x_test)
  acc = accuracy_score(y_test, y_pred).round(4)
  return acc

print("=================================== Without PCA")
classifier = GaussianNB()
acc = eval(classifier, x_train, y_train, x_test, y_test)
print("acc:", acc)

norm_test = x_test - train_means
acc_list = []
print("evaluation using different numbers of pca components....")
for sel_features in range(1,182):
  # print(sel_features, end=" ")
  # print(f"=================================== With PCA  with top {sel_features} eigen vectors")
  pca_train = top_features(norm_data, values, vectors, sel_features)
  pca_test = top_features(norm_test, values, vectors, sel_features)
  acc = eval(classifier, pca_train, y_train, pca_test, y_test)
  acc_list.append(acc)

# print("acc 25:", acc_list[25])

fig = plt.figure(figsize=(10,8))
plt.plot(acc_list)
plt.ylabel("Accuracy")
plt.xlabel("Number of PCA Eigen vectors")
plt.savefig("acc.png")

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y_train)

pca_train = top_features(norm_data, values, vectors, 2)
fig = plt.figure(figsize=(10,8))
plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.scatter(
    pca_train[:,0],
    pca_train[:,1],
    c=y,
    cmap='rainbow',
    alpha=0.7)

fig.savefig('pca.png')
	import numpy as np
	import pandas as pd
	from sklearn.metrics import confusion_matrix
	from sklearn.metrics import accuracy_score
	from sklearn import preprocessing
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.neighbors import RadiusNeighborsClassifier
	from sklearn.naive_bayes import GaussianNB
	import matplotlib.pyplot as plt

	path = 'drive/My Drive/FMNIST/'
	x_train = pd.read_csv(path+'Train_Data.csv')
	y_train = pd.read_csv(path+'Train_Labels.csv')

	x_test = pd.read_csv(path+'Test_Data.csv')
	y_test = pd.read_csv(path+'Test_Labels.csv')
	print(x_train.shape)

	# x_train.shape

	r = -1
	x_train = x_train.iloc[:r]
	y_train = y_train.iloc[:r]
	x_test = x_test.iloc[:r]
	y_test = y_test.iloc[:r]

	def preprocess(data):
	# in this part we scale data between [0.1]
	min_max_scaler = preprocessing.MinMaxScaler()
	x_train_minmax = min_max_scaler.fit_transform(data)
	return x_train_minmax

	x_train = preprocess(x_train)
	x_test = preprocess(x_test)

	from numpy import array
	from numpy import mean
	from numpy import cov
	from numpy.linalg import eig

	def myPCA(A):
	# calculate the mean of each column
	M = mean(A.T, axis=1)
	# center columns by subtracting column means
	C = A - M
	# calculate covariance matrix of centered matrix
	V = cov(C.T)
	# eigendecomposition of covariance matrix
	values, vectors = eig(V)
	return C, values, vectors, M

	# sorting
	norm_data, values, vectors, train_means = myPCA(x_train)
	sorted_values = -np.sort(-values)

	# plot sorted eigen values
	fig = plt.figure(figsize=(10,8))
	plt.plot(sorted_values)
	plt.savefig("eigen_values.png")

	def top_features(C, values, vectors, sel_features):
	# sort eigen values indexes in descending order
	sorted_indexes=np.argsort(-values)
	# select top indexes (sel_features)
	sel_indexes = sorted_indexes[:sel_features]
	# select correspoidings eigen vectors
	sel_vectors=vectors[sel_indexes, :]
	# project data onto selected eigen vectors
	P = C.dot(sel_vectors.T)
	return P

	# evalute classifer
	def eval(classifier, x_train, y_train, x_test, y_test):
	classifier.fit(x_train, y_train)
	y_pred = classifier.predict(x_test)
	acc = accuracy_score(y_test, y_pred).round(4)
	return acc

	print("=================================== Without PCA")
	classifier = GaussianNB()
	acc = eval(classifier, x_train, y_train, x_test, y_test)
	print("acc:", acc)

	norm_test = x_test - train_means
	acc_list = []
	print("evaluation using different numbers of pca components....")
	for sel_features in range(1,182):
	# print(sel_features, end=" ")
	# print(f"=================================== With PCA with top {sel_features} eigen vectors")
	pca_train = top_features(norm_data, values, vectors, sel_features)
	pca_test = top_features(norm_test, values, vectors, sel_features)
	acc = eval(classifier, pca_train, y_train, pca_test, y_test)
	acc_list.append(acc)

	# print("acc 25:", acc_list[25])

	fig = plt.figure(figsize=(10,8))
	plt.plot(acc_list)
	plt.ylabel("Accuracy")
	plt.xlabel("Number of PCA Eigen vectors")
	plt.savefig("acc.png")

	from sklearn.preprocessing import LabelEncoder
	le = LabelEncoder()
	y = le.fit_transform(y_train)

	pca_train = top_features(norm_data, values, vectors, 2)
	fig = plt.figure(figsize=(10,8))
	plt.xlabel('PCA1')
	plt.ylabel('PCA2')
	plt.scatter(
	pca_train[:,0],
	pca_train[:,1],
	c=y,
	cmap='rainbow',
	alpha=0.7)

	fig.savefig('pca.png')