Last active
August 22, 2018 12:37
-
-
Save kohiro37/d18507ab949542b27ac6655aa443c024 to your computer and use it in GitHub Desktop.
Comparison between Euclidean distance and Cosine similarity
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from sklearn.datasets import load_iris | |
from sklearn.preprocessing import MinMaxScaler | |
from scipy.spatial.distance import pdist | |
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster | |
get_ipython().run_line_magic('matplotlib', 'inline') | |
import matplotlib.pyplot as plt | |
def iris_df():
    """Return the scikit-learn iris dataset as a min-max scaled DataFrame.

    The four feature columns and the ``target`` column are placed side by
    side in one frame, and every column (including ``target``) is rescaled
    by ``MinMaxScaler`` with its default settings.

    Returns:
        A ``pandas.DataFrame`` whose columns are the iris feature names
        followed by ``'target'``, keeping the row index of the unscaled
        frame.
    """
    # Load the iris dataset bundled with scikit-learn.
    dataset = load_iris()

    # Combine features and target into a single pandas DataFrame.
    column_names = dataset['feature_names'] + ['target']
    raw = pd.DataFrame(
        data=np.c_[dataset['data'], dataset['target']],
        columns=column_names,
    )

    # Normalise. The scaler hands back a bare ndarray, so the column names
    # and the index are re-attached when the result is wrapped again.
    scaled = MinMaxScaler().fit_transform(raw)
    return pd.DataFrame(scaled, columns=raw.columns, index=raw.index)
def draw_dendrogram(Z, labels):
    """Draw and show a dendrogram for the linkage matrix ``Z``.

    Args:
        Z: Linkage matrix, passed straight to scipy's ``dendrogram``.
        labels: Labels for the leaves, forwarded as ``labels=``.
    """
    # Create the figure for the dendrogram.
    plt.figure(figsize=(14, 5))
    plt.ylabel('Distance')

    leaf_options = {
        'leaf_rotation': 90.,
        'leaf_font_size': 8.,
        'labels': labels,
    }
    dendrogram(Z, **leaf_options)
    plt.show()
def draw_scatter(df):
    """Scatter-plot the 3rd column against the 4th, one series per cluster.

    Rows are grouped by the ``'fcluster'`` column, and each group is plotted
    as its own marker-only series so that matplotlib gives every cluster a
    different colour.

    Args:
        df: DataFrame with at least four columns and an ``'fcluster'``
            column holding the flat-cluster label of each row.
    """
    x_col = df.columns[2]
    y_col = df.columns[3]

    # Plot each flat cluster separately so that it gets its own colour.
    for _, members in df.groupby('fcluster'):
        plt.plot(members[x_col], members[y_col],
                 marker='o', linestyle='', ms=4)

    plt.grid(True)
    plt.show()
if __name__ == '__main__':
    # Compare hierarchical clustering of the iris petal measurements under
    # two pairwise metrics, showing a dendrogram and a cluster scatter plot
    # for each.
    df = iris_df()
    print(df)

    # Metrics handed to pdist. Note that pdist's 'cosine' is the cosine
    # *distance* (1 - cosine similarity), not the similarity itself.
    metric_t = ('euclidean', 'cosine')
    # Distance threshold at which the tree is cut, one per metric.
    th_t = (4.0, 0.6)

    for metric, th in zip(metric_t, th_t):
        # Hierarchical clustering on petal length (cm) and petal width (cm).
        # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values``
        # returns the same ndarray on both old and new pandas versions.
        features = df[df.columns[2:4]].values
        # NOTE(review): the scipy docs state that Ward linkage is correctly
        # defined only for Euclidean pairwise distances; the cosine run is
        # kept as in the original comparison -- confirm this is intended.
        Z = linkage(pdist(features, metric=metric), method='ward')
        draw_dendrogram(Z, df.index.values)

        # Build flat clusters by cutting the tree at the distance threshold.
        # The 'fcluster' column is overwritten on every iteration.
        df['fcluster'] = fcluster(Z, th, criterion='distance')
        draw_scatter(df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment