Skip to content

Instantly share code, notes, and snippets.

@umaz
Created February 5, 2019 12:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save umaz/1849c7a09cf6aeb48dce1426cbe62542 to your computer and use it in GitHub Desktop.
Save umaz/1849c7a09cf6aeb48dce1426cbe62542 to your computer and use it in GitHub Desktop.
階層型クラスタリング
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, set_link_color_palette
# Print floating-point array values with two decimal places.
np.set_printoptions(precision=2)
# 1. Input documents.
docs = np.array([
    # Fill in the documents here: one string per document, with its
    # morphemes (tokens) separated by spaces.
])
# A hierarchy needs at least two observations. Fail early with a clear
# message instead of an unrelated-looking error from deep inside the
# vectorizer or linkage() when the placeholder above has not been filled in.
if len(docs) < 2:
    raise ValueError(
        'docs must contain at least two documents to build a cluster hierarchy'
    )
# 2. Turn the documents into TF-IDF vectors and build the Ward linkage.
vectorizer = TfidfVectorizer(use_idf=True)
# fit_transform() returns a sparse matrix; convert it to a dense array
# before handing it to linkage().
vecs = vectorizer.fit_transform(docs).toarray()
z = linkage(vecs, method='ward')
# 3. Set up the figure: canvas size, default font, title and axis labels.
plt.figure(figsize=(100, 50))
plt.rcParams.update({
    'font.family': 'Times New Roman',
    'font.size': 10,
})
plt.title(label='Hierarchical Clustering Dendrogram', fontsize=20)
plt.xlabel(xlabel='Observation Points', fontsize=10)
plt.ylabel(ylabel='Distance', fontsize=10)
# 4. Draw the dendrogram.
# Linkage distance at which the tree is cut. The same value drives both the
# dendrogram colouring and the flat cluster assignment below, so the colours
# in the plot always match the printed cluster numbers.
distance_threshold = 2.1
# Colours cycled through for the clusters that form below the threshold.
set_link_color_palette(['purple', 'lawngreen', 'green', 'blue', 'orange', 'red'])
dendrogram(
    z,
    leaf_font_size=10,  # font size of the leaf labels on the x axis
    # Each subtree merged below this distance is drawn in its own colour.
    color_threshold=distance_threshold,
    # Links at or above the threshold are drawn in black.
    above_threshold_color='black',
)
# 5. Print the cluster number assigned to each document.
# Cut the tree by linkage distance. To request a fixed number of clusters
# instead, use e.g. fcluster(z, 10, criterion='maxclust').
group = fcluster(z, distance_threshold, criterion='distance')
print(group)
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment