Skip to content

Instantly share code, notes, and snippets.

@lorenzodisidoro
Last active December 2, 2020 11:09
Show Gist options
  • Save lorenzodisidoro/4b300da693a71373837f2b8c9b2a3752 to your computer and use it in GitHub Desktop.
Save lorenzodisidoro/4b300da693a71373837f2b8c9b2a3752 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform
from scipy.linalg import eigh
from numpy import exp
## Import sample dataset
from sklearn.datasets import make_circles
X, y = make_circles(n_samples=1000, random_state=123, noise=0.1, factor=0.2)
print("Initial number of dataset feautures: ", len(X.T)) # should be 13
# Calculate pairwise squared Euclidean distances
# in the MxN dimensional dataset.
sq_dists = pdist(X, 'sqeuclidean')
# Convert pairwise distances into a square matrix
mat_sq_dists = squareform(sq_dists)
# Compute the symmetric kernel matrix
gamma=15
K = exp(-gamma * mat_sq_dists)
# Center the kernel matrix
N = K.shape[0]
one_n = np.ones((N, N)) / N
K = K - one_n.dot(K) - K.dot(one_n) + one_n.dot(K).dot(one_n)
# Obtaining eigenpairs from the centered kernel matrix
# scipy.linalg.eigh returns them in ascending order
eigvals, eigvecs = eigh(K)
eigvals, eigvecs = eigvals[::-1], eigvecs[:, ::-1]
# Collect the top k eigenvectors (projected examples)
n_components=2 # num of eigenvectors to return
X_pca = np.column_stack([eigvecs[:, i] for i in range(n_components)])
print("Kernel PCA: Number of selected dataset feautures: ", len(X_pca.T)) # should be 2
print(X_pca)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(7, 3))
ax[0].scatter(X[y == 0, 0], X[y == 0, 1], color='red', marker='+', alpha=0.5)
ax[0].scatter(X[y == 1, 0], X[y == 1, 1], color='blue', marker='x', alpha=0.5)
ax[1].scatter(X_pca[y == 0, 0], X_pca[y == 0, 1], color='red', marker='+', alpha=0.5)
ax[1].scatter(X_pca[y == 1, 0], X_pca[y == 1, 1], color='blue', marker='x', alpha=0.5)
ax[0].set_ylim([-2, 2])
ax[0].set_yticks([])
ax[0].set_xlabel('Initial Dataset')
ax[1].set_xlabel('PC1')
ax[1].set_ylabel('PC2')
plt.tight_layout()
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment