Skip to content

Instantly share code, notes, and snippets.

@kohiro37
Last active August 22, 2018 12:37
Show Gist options
  • Save kohiro37/d18507ab949542b27ac6655aa443c024 to your computer and use it in GitHub Desktop.
Save kohiro37/d18507ab949542b27ac6655aa443c024 to your computer and use it in GitHub Desktop.
Comparison between Euclidean distance and Cosine similarity
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
def iris_df():
    """Return the iris dataset as a min-max scaled DataFrame.

    The feature columns and the ``target`` column are loaded from
    scikit-learn, placed side by side, and passed through
    ``MinMaxScaler`` with its default settings. Every column is
    rescaled, ``target`` included.

    Returns:
        pandas.DataFrame: the scaled values, carrying the same column
        names and row index as the unscaled frame.
    """
    dataset = load_iris()

    # Features and class label side by side, one row per sample.
    raw = pd.DataFrame(
        data=np.c_[dataset['data'], dataset['target']],
        columns=dataset['feature_names'] + ['target'],
    )

    # fit_transform hands back a bare array, so the column names and the
    # index are re-attached when the result is wrapped in a DataFrame.
    scaled = MinMaxScaler().fit_transform(raw)
    return pd.DataFrame(scaled, columns=raw.columns, index=raw.index)
def draw_dendrogram(Z, labels):
    """Draw and show the dendrogram for a linkage matrix.

    Args:
        Z: linkage matrix, forwarded unchanged to SciPy's ``dendrogram``.
        labels: leaf labels, forwarded as ``dendrogram``'s ``labels``.
    """
    plt.figure(figsize=(14, 5))
    plt.ylabel('Distance')

    # Rotated, small leaf labels so that many leaves stay legible.
    leaf_style = {'leaf_rotation': 90., 'leaf_font_size': 8.}
    dendrogram(Z, labels=labels, **leaf_style)

    plt.show()
def draw_scatter(df):
    """Scatter-plot the third against the fourth column, one series per cluster.

    Args:
        df: DataFrame with an ``fcluster`` column. Rows are grouped by
            that column, and ``df.columns[2]`` / ``df.columns[3]``
            supply the x / y values.
    """
    x_col, y_col = df.columns[2], df.columns[3]

    # One plot call per cluster, so each cluster is drawn as its own series.
    for _, members in df.groupby('fcluster'):
        plt.plot(members[x_col], members[y_col],
                 marker='o', linestyle='', ms=4)

    plt.grid(True)
    plt.show()
if __name__ == '__main__':
    # Compare hierarchical clustering of the iris petal measurements under
    # two pairwise metrics, showing a dendrogram and a cluster-coloured
    # scatter plot for each.
    df = iris_df()
    print(df)

    # pdist metrics to compare: Euclidean distance and cosine distance.
    metric_t = ('euclidean', 'cosine')
    # Distance threshold used to cut each tree (paired with metric_t).
    th_t = (4.0, 0.6)

    for metric, th in zip(metric_t, th_t):
        # Cluster on columns 2-3: petal length (cm) and petal width (cm).
        # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is
        # the supported way to get the underlying ndarray.
        features = df[df.columns[2:4]].to_numpy()

        # NOTE(review): SciPy documents the 'ward' method as correctly
        # defined only for Euclidean distances, so the cosine run should
        # be read as an illustrative comparison.
        Z = linkage(pdist(features, metric=metric), method='ward')
        draw_dendrogram(Z, df.index.values)

        # Cut the tree at the threshold to obtain flat cluster labels.
        # The column is overwritten on each pass of the loop.
        df['fcluster'] = fcluster(Z, th, criterion='distance')
        draw_scatter(df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment