Last active
June 21, 2018 16:51
-
-
Save bellbind/ede1466d17647a0133b3625afbf665c5 to your computer and use it in GitHub Desktop.
[python3][pandas][numpy][sklearn]PCA(Principal Component Analysis) with python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ブランド名 | プロっぽい | カッコいい | 好き | かわいい | 一般的 | ミーハー | 初心者 | ダサイ | 嫌い | わからない | 欲しい | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
ロシニョール | 33 | 15 | 38 | 11 | 71 | 94 | 7 | 4 | 8 | 5 | 37 | |
ヤマハ | 10 | 6 | 24 | 18 | 75 | 46 | 29 | 5 | 5 | 12 | 10 | |
オガサカ | 67 | 5 | 18 | 1 | 32 | 4 | 12 | 25 | 24 | 11 | 14 | |
カザマ | 25 | 5 | 12 | 2 | 33 | 4 | 18 | 40 | 23 | 31 | 8 | |
アトミック | 52 | 32 | 24 | 5 | 32 | 32 | 3 | 11 | 5 | 17 | 35 | |
ニシザワ | 23 | 7 | 12 | 1 | 8 | 1 | 3 | 32 | 15 | 28 | 11 | |
クナイスル | 32 | 13 | 12 | 0 | 37 | 2 | 6 | 23 | 16 | 33 | 9 | |
K2 | 48 | 26 | 24 | 3 | 20 | 32 | 2 | 10 | 21 | 20 | 23 | |
フィッシャー | 27 | 14 | 18 | 2 | 20 | 7 | 3 | 9 | 12 | 37 | 11 | |
ブリザード | 41 | 25 | 20 | 4 | 24 | 8 | 2 | 12 | 12 | 28 | 17 | |
ミズノ | 5 | 3 | 1 | 0 | 48 | 6 | 27 | 74 | 27 | 30 | 3 | |
オーリン | 19 | 17 | 22 | 13 | 22 | 37 | 8 | 17 | 10 | 21 | 16 | |
スワロー | 5 | 1 | 0 | 2 | 18 | 1 | 43 | 69 | 26 | 27 | 2 | |
ケスレー | 47 | 19 | 12 | 1 | 17 | 2 | 2 | 14 | 11 | 44 | 14 | |
エラン | 36 | 8 | 13 | 2 | 7 | 11 | 0 | 10 | 12 | 36 | 8 | |
フォルクル | 32 | 7 | 10 | 0 | 1 | 4 | 0 | 11 | 8 | 28 | 8 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# PCA (principal component analysis) with numpy
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas

# data from book ISBN:978-4-480-09861-0
frame = pandas.read_csv("./data.csv", index_col=0)
ids = frame.index        # row labels (brand names)
props = frame.columns    # column labels (survey properties)
M = frame.values
n = len(frame)  # number of rows

# center the data column-wise
X = M - M.mean(axis=0)
# Cxx: variance-covariance matrix (biased estimator: divide by n)
Cxx = (X.T @ X) / n
# Or, simply use np.cov() as:
#Cxx = np.cov(M, rowvar=False, bias=True)

# Use eigh (not eig): Cxx is symmetric, and eigh guarantees real
# eigenvalues/eigenvectors and is numerically more stable.
# NOTE: eigh returns eigenvalues in ascending order, so reorder below.
eigvals, eigvecs = np.linalg.eigh(Cxx)
eigidx = eigvals.argsort()[::-1]  # descending-order index
eigvals, eigvecs = eigvals[eigidx], eigvecs[:, eigidx]
rates = eigvals / eigvals.sum()   # explained-variance ratio per component
print(f"2 explained variance rate: {rates[:2].sum()}")

# component vectors (NOTE: negated to match the plot layout in the book;
# eigenvector sign is arbitrary, so this does not change the decomposition)
v1 = -eigvecs[:, 0]
v2 = -eigvecs[:, 1]
# component scores: data projected onto the principal axes
f1 = X @ v1
f2 = X @ v2

# plot (CJK-capable font needed for the Japanese labels)
mpl.rc("font", family="Noto Sans CJK JP")
fig, (s1, s2) = plt.subplots(1, 2, figsize=(16, 8))


def plot(s, xs, ys, labels):
    """Scatter-plot (xs, ys) on axes *s* with origin lines and a text
    label annotating each point."""
    s.axhline(color="gray")
    s.axvline(color="gray")
    s.scatter(xs, ys)
    for label, x, y in zip(labels, xs, ys):
        s.annotate(label, (x, y))


plot(s1, v1, v2, props)  # loading plot (variables)
plot(s2, f1, f2, ids)    # score plot (observations)
fig.savefig("pca-numpy-result.png")
plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# PCA (principal component analysis) with sklearn
import sklearn.decomposition  # pip install scipy scikit-learn
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas

# data from book ISBN:978-4-480-09861-0
frame = pandas.read_csv("./data.csv", index_col=0)
ids = frame.index        # row labels (brand names)
props = frame.columns    # column labels (survey properties)
M = frame.values

# fit a 2-component PCA; sklearn centers the data internally
pca = sklearn.decomposition.PCA(n_components=2)
pca.fit(M)
# component vectors (principal axes in feature space)
v1, v2 = pca.components_
# component scores: data projected onto the principal axes
trans = pca.transform(M)
f1 = trans[:, 0]
f2 = trans[:, 1]

# plot (CJK-capable font needed for the Japanese labels)
mpl.rc("font", family="Noto Sans CJK JP")
fig, (s1, s2) = plt.subplots(1, 2, figsize=(16, 8))


def plot(s, xs, ys, labels):
    """Scatter-plot (xs, ys) on axes *s* with origin lines and a text
    label annotating each point."""
    s.axhline(color="gray")
    s.axvline(color="gray")
    s.scatter(xs, ys)
    for label, x, y in zip(labels, xs, ys):
        s.annotate(label, (x, y))


plot(s1, v1, v2, props)  # loading plot (variables)
plot(s2, f1, f2, ids)    # score plot (observations)
fig.savefig("pca-sklearn-result.png")
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The values in data.csv are referred from: https://www.amazon.co.jp/dp/4480098615
Note on Font