sklearn PCA requires normalization
# modified example of
# http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA.transform
import numpy as np
from sklearn.decomposition import PCA
# Just the same PCA as the example
pca = PCA(n_components=2, svd_solver='full')
# same input as example
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, -1], [2, -1], [3, -2]])
X
array([[-1, -1],
       [-2, -1],
       [-3, -2],
       [ 1, -1],
       [ 2, -1],
       [ 3, -2]])
pca.fit_transform(X)
array([[ 1.        , -0.33333333],
       [ 2.        , -0.33333333],
       [ 3.        ,  0.66666667],
       [-1.        , -0.33333333],
       [-2.        , -0.33333333],
       [-3.        ,  0.66666667]])
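# (not part of the original gist) one way to see what PCA picked up here is to
# inspect the fitted attributes; both exist on any fitted sklearn PCA object
print(pca.explained_variance_ratio_)  # fraction of variance carried by each component
print(pca.components_)                # principal axes, one row per component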
# duplicate the last column
X = np.array([[-1, -1, -1], [-2, -1, -1], [-3, -2, -2], [1, -1, -1], [2, -1, -1], [3, -2, -2]])
X
array([[-1, -1, -1],
       [-2, -1, -1],
       [-3, -2, -2],
       [ 1, -1, -1],
       [ 2, -1, -1],
       [ 3, -2, -2]])
pca.fit_transform(X)
array([[ 1.        , -0.47140452],
       [ 2.        , -0.47140452],
       [ 3.        ,  0.94280904],
       [-1.        , -0.47140452],
       [-2.        , -0.47140452],
       [-3.        ,  0.94280904]])
# the above yields a different answer: the first score column is unchanged, but the
# second is scaled by sqrt(2), since duplicating the feature doubles its contribution to the variance
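# (sketch, not in the original gist) the sqrt(2) scaling observed above can be
# checked numerically; absolute values are compared because PCA signs are arbitrary
X2 = np.array([[-1, -1], [-2, -1], [-3, -2], [1, -1], [2, -1], [3, -2]])
X3 = np.array([[-1, -1, -1], [-2, -1, -1], [-3, -2, -2], [1, -1, -1], [2, -1, -1], [3, -2, -2]])
T2 = pca.fit_transform(X2)
T3 = pca.fit_transform(X3)
print(np.allclose(np.abs(T3[:, 0]), np.abs(T2[:, 0])))               # first column unchanged
print(np.allclose(np.abs(T3[:, 1]), np.sqrt(2) * np.abs(T2[:, 1])))  # second column scaled by sqrt(2)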
########################
# add normalization
from sklearn import preprocessing
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, -1], [2, -1], [3, -2]])
preprocessing.normalize(pca.fit_transform(X), norm='l2', axis=0)
array([[ 0.18898224, -0.28867513],
       [ 0.37796447, -0.28867513],
       [ 0.56694671,  0.57735027],
       [-0.18898224, -0.28867513],
       [-0.37796447, -0.28867513],
       [-0.56694671,  0.57735027]])
# duplicate last column
X = np.array([[-1, -1, -1], [-2, -1, -1], [-3, -2, -2], [1, -1, -1], [2, -1, -1], [3, -2, -2]])
preprocessing.normalize(pca.fit_transform(X), norm='l2', axis=0)
array([[ 0.18898224, -0.28867513],
       [ 0.37796447, -0.28867513],
       [ 0.56694671,  0.57735027],
       [-0.18898224, -0.28867513],
       [-0.37796447, -0.28867513],
       [-0.56694671,  0.57735027]])
# same answer for duplicated and non-duplicated case
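# (not part of the original gist) a more common fix is to standardize the *input*
# features before PCA instead of normalizing the PCA scores afterwards;
# StandardScaler is a standard sklearn.preprocessing component
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)
pca.fit_transform(X_std)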
######################
# duplicate last column and scale it
X = np.array([[-1, -1, -1000], [-2, -1, -1000], [-3, -2, -2000], [1, -1, -1000], [2, -1, -1000], [3, -2, -2000]])
X
array([[   -1,    -1, -1000],
       [   -2,    -1, -1000],
       [   -3,    -2, -2000],
       [    1,    -1, -1000],
       [    2,    -1, -1000],
       [    3,    -2, -2000]])
preprocessing.normalize(pca.fit_transform(X), norm='l2', axis=0)
array([[-0.28867513,  0.18898224],
       [-0.28867513,  0.37796447],
       [ 0.57735027,  0.56694671],
       [-0.28867513, -0.18898224],
       [-0.28867513, -0.37796447],
       [ 0.57735027, -0.56694671]])
# the columns are swapped: the third column (scale ~1000) now dominates the total variance,
# so what was previously the second principal component comes out first
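# (not in the original) the swap can be confirmed on the PCA fitted just above:
# the loadings show the large third feature dominating the first component
print(pca.components_)
print(pca.explained_variance_ratio_)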
# pre-normalize and post-normalize
preprocessing.normalize(pca.fit_transform(preprocessing.normalize(X,norm='l2',axis=0)), norm='l2', axis=0)
array([[ 0.18898224, -0.28867513],
       [ 0.37796447, -0.28867513],
       [ 0.56694671,  0.57735027],
       [-0.18898224, -0.28867513],
       [-0.37796447, -0.28867513],
       [-0.56694671,  0.57735027]])
# got same answer again
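# (sketch, not in the original gist) the pre-normalization step can be chained with PCA
# in a Pipeline so the same preprocessing is reused at transform time; note that
# sklearn's Normalizer works on rows, so a FunctionTransformer is used here to
# reproduce the column-wise (axis=0) normalization used above
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
col_normalize = FunctionTransformer(lambda A: preprocessing.normalize(A, norm='l2', axis=0))
pipe = make_pipeline(col_normalize, PCA(n_components=2, svd_solver='full'))
pipe.fit_transform(X)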
########################
# add scaling
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, -1], [2, -1], [3, -2]])
min_max_scaler.fit_transform(pca.fit_transform(X))
array([[  6.66666667e-01,   2.77555756e-16],
       [  8.33333333e-01,   0.00000000e+00],
       [  1.00000000e+00,   1.00000000e+00],
       [  3.33333333e-01,   1.11022302e-16],
       [  1.66666667e-01,   1.11022302e-16],
       [  0.00000000e+00,   1.00000000e+00]])
# duplicate last column
X = np.array([[-1, -1, -1], [-2, -1, -1], [-3, -2, -2], [1, -1, -1], [2, -1, -1], [3, -2, -2]])
min_max_scaler.fit_transform(pca.fit_transform(X))
array([[  6.66666667e-01,   1.11022302e-16],
       [  8.33333333e-01,   0.00000000e+00],
       [  1.00000000e+00,   1.00000000e+00],
       [  3.33333333e-01,   5.55111512e-17],
       [  1.66666667e-01,   5.55111512e-17],
       [  0.00000000e+00,   1.00000000e+00]])
# essentially the same answer for the duplicated and non-duplicated case
# (the differences are floating-point noise near zero)
# pre and post scaling
min_max_scaler.fit_transform(pca.fit_transform(min_max_scaler.fit_transform(X)))
array([[  1.11022302e-16,   6.66666667e-01],
       [  0.00000000e+00,   8.33333333e-01],
       [  1.00000000e+00,   1.00000000e+00],
       [  1.11022302e-16,   3.33333333e-01],
       [  1.11022302e-16,   1.66666667e-01],
       [  1.00000000e+00,   0.00000000e+00]])
# the score columns swap order because pre-scaling changes which direction carries
# the most variance, but otherwise the answer is roughly the same
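# (hypothetical helper, not in the original) PCA scores are only defined up to the
# sign and ordering of the components, so two pca.fit_transform(...) results can be
# compared column-by-column with a simple greedy check:
def same_up_to_sign_and_order(A, B, tol=1e-8):
    """True if every column of A matches some distinct column of B up to sign."""
    used = set()
    for i in range(A.shape[1]):
        match = [j for j in range(B.shape[1]) if j not in used
                 and (np.allclose(A[:, i], B[:, j], atol=tol)
                      or np.allclose(A[:, i], -B[:, j], atol=tol))]
        if not match:
            return False
        used.add(match[0])
    return True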