sklearn PCA requires normalization
# modified example of
# http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA.transform
import numpy as np
from sklearn.decomposition import PCA
# Just the same PCA as the example
pca = PCA(n_components=2, svd_solver='full')
# same input as example
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, -1], [2, -1], [3, -2]])
X
array([[-1, -1],
       [-2, -1],
       [-3, -2],
       [ 1, -1],
       [ 2, -1],
       [ 3, -2]])
pca.fit_transform(X)
array([[ 1.        , -0.33333333],
       [ 2.        , -0.33333333],
       [ 3.        ,  0.66666667],
       [-1.        , -0.33333333],
       [-2.        , -0.33333333],
       [-3.        ,  0.66666667]])
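# (not part of the original gist) one way to see what PCA picked up here is to
# inspect the fitted attributes; both exist on any fitted sklearn PCA object
print(pca.explained_variance_ratio_)  # fraction of variance carried by each component
print(pca.components_)                # principal axes, one row per component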
# duplicate the last column
X = np.array([[-1, -1, -1], [-2, -1, -1], [-3, -2, -2], [1, -1, -1], [2, -1, -1], [3, -2, -2]])
X
array([[-1, -1, -1],
       [-2, -1, -1],
       [-3, -2, -2],
       [ 1, -1, -1],
       [ 2, -1, -1],
       [ 3, -2, -2]])
pca.fit_transform(X)
array([[ 1.        , -0.47140452],
       [ 2.        , -0.47140452],
       [ 3.        ,  0.94280904],
       [-1.        , -0.47140452],
       [-2.        , -0.47140452],
       [-3.        ,  0.94280904]])
# the above yields a different answer: the first score column is unchanged, but the
# second is scaled by sqrt(2), since duplicating the feature doubles its contribution to the variance
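# (sketch, not in the original gist) the sqrt(2) scaling observed above can be
# checked numerically; absolute values are compared because PCA signs are arbitrary
X2 = np.array([[-1, -1], [-2, -1], [-3, -2], [1, -1], [2, -1], [3, -2]])
X3 = np.array([[-1, -1, -1], [-2, -1, -1], [-3, -2, -2], [1, -1, -1], [2, -1, -1], [3, -2, -2]])
T2 = pca.fit_transform(X2)
T3 = pca.fit_transform(X3)
print(np.allclose(np.abs(T3[:, 0]), np.abs(T2[:, 0])))               # first column unchanged
print(np.allclose(np.abs(T3[:, 1]), np.sqrt(2) * np.abs(T2[:, 1])))  # second column scaled by sqrt(2)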
########################
# add normalization
from sklearn import preprocessing
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, -1], [2, -1], [3, -2]])
preprocessing.normalize(pca.fit_transform(X), norm='l2', axis=0)
array([[ 0.18898224, -0.28867513],
       [ 0.37796447, -0.28867513],
       [ 0.56694671,  0.57735027],
       [-0.18898224, -0.28867513],
       [-0.37796447, -0.28867513],
       [-0.56694671,  0.57735027]])
# duplicate last column
X = np.array([[-1, -1, -1], [-2, -1, -1], [-3, -2, -2], [1, -1, -1], [2, -1, -1], [3, -2, -2]])
preprocessing.normalize(pca.fit_transform(X), norm='l2', axis=0)
array([[ 0.18898224, -0.28867513],
       [ 0.37796447, -0.28867513],
       [ 0.56694671,  0.57735027],
       [-0.18898224, -0.28867513],
       [-0.37796447, -0.28867513],
       [-0.56694671,  0.57735027]])
# same answer for duplicated and non-duplicated case
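# (not part of the original gist) a more common fix is to standardize the *input*
# features before PCA instead of normalizing the PCA scores afterwards;
# StandardScaler is a standard sklearn.preprocessing component
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)
pca.fit_transform(X_std)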
######################
# duplicate last column and scale it
X = np.array([[-1, -1, -1000], [-2, -1, -1000], [-3, -2, -2000], [1, -1, -1000], [2, -1, -1000], [3, -2, -2000]])
X
array([[   -1,    -1, -1000],
       [   -2,    -1, -1000],
       [   -3,    -2, -2000],
       [    1,    -1, -1000],
       [    2,    -1, -1000],
       [    3,    -2, -2000]])
preprocessing.normalize(pca.fit_transform(X), norm='l2', axis=0)
array([[-0.28867513,  0.18898224],
       [-0.28867513,  0.37796447],
       [ 0.57735027,  0.56694671],
       [-0.28867513, -0.18898224],
       [-0.28867513, -0.37796447],
       [ 0.57735027, -0.56694671]])
# the columns are swapped: the third column (scale ~1000) now dominates the total variance,
# so what was previously the second principal component comes out first
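# (not in the original) the swap can be confirmed on the PCA fitted just above:
# the loadings show the large third feature dominating the first component
print(pca.components_)
print(pca.explained_variance_ratio_)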
# pre-normalize and post-normalize
preprocessing.normalize(pca.fit_transform(preprocessing.normalize(X,norm='l2',axis=0)), norm='l2', axis=0)
array([[ 0.18898224, -0.28867513],
       [ 0.37796447, -0.28867513],
       [ 0.56694671,  0.57735027],
       [-0.18898224, -0.28867513],
       [-0.37796447, -0.28867513],
       [-0.56694671,  0.57735027]])
# got same answer again
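# (sketch, not in the original gist) the pre-normalization step can be chained with PCA
# in a Pipeline so the same preprocessing is reused at transform time; note that
# sklearn's Normalizer works on rows, so a FunctionTransformer is used here to
# reproduce the column-wise (axis=0) normalization used above
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
col_normalize = FunctionTransformer(lambda A: preprocessing.normalize(A, norm='l2', axis=0))
pipe = make_pipeline(col_normalize, PCA(n_components=2, svd_solver='full'))
pipe.fit_transform(X)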
########################
# add scaling
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, -1], [2, -1], [3, -2]])
min_max_scaler.fit_transform(pca.fit_transform(X))
array([[  6.66666667e-01,   2.77555756e-16],
       [  8.33333333e-01,   0.00000000e+00],
       [  1.00000000e+00,   1.00000000e+00],
       [  3.33333333e-01,   1.11022302e-16],
       [  1.66666667e-01,   1.11022302e-16],
       [  0.00000000e+00,   1.00000000e+00]])
# duplicate last column
X = np.array([[-1, -1, -1], [-2, -1, -1], [-3, -2, -2], [1, -1, -1], [2, -1, -1], [3, -2, -2]])
min_max_scaler.fit_transform(pca.fit_transform(X))
array([[  6.66666667e-01,   1.11022302e-16],
       [  8.33333333e-01,   0.00000000e+00],
       [  1.00000000e+00,   1.00000000e+00],
       [  3.33333333e-01,   5.55111512e-17],
       [  1.66666667e-01,   5.55111512e-17],
       [  0.00000000e+00,   1.00000000e+00]])
# essentially the same answer for the duplicated and non-duplicated case
# (the differences are floating-point noise near zero)
# pre and post scaling
min_max_scaler.fit_transform(pca.fit_transform(min_max_scaler.fit_transform(X)))
array([[  1.11022302e-16,   6.66666667e-01],
       [  0.00000000e+00,   8.33333333e-01],
       [  1.00000000e+00,   1.00000000e+00],
       [  1.11022302e-16,   3.33333333e-01],
       [  1.11022302e-16,   1.66666667e-01],
       [  1.00000000e+00,   0.00000000e+00]])
# the score columns swap order because pre-scaling changes which direction carries
# the most variance, but otherwise the answer is roughly the same
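# (hypothetical helper, not in the original) PCA scores are only defined up to the
# sign and ordering of the components, so two pca.fit_transform(...) results can be
# compared column-by-column with a simple greedy check:
def same_up_to_sign_and_order(A, B, tol=1e-8):
    """True if every column of A matches some distinct column of B up to sign."""
    used = set()
    for i in range(A.shape[1]):
        match = [j for j in range(B.shape[1]) if j not in used
                 and (np.allclose(A[:, i], B[:, j], atol=tol)
                      or np.allclose(A[:, i], -B[:, j], atol=tol))]
        if not match:
            return False
        used.add(match[0])
    return True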