Created
February 5, 2019 12:49
-
-
Save umaz/1849c7a09cf6aeb48dce1426cbe62542 to your computer and use it in GitHub Desktop.
階層型クラスタリング
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Hierarchical (Ward) clustering of TF-IDF document vectors.
#
# The script vectorizes whitespace-tokenized documents with TF-IDF, builds a
# Ward linkage tree, draws the dendrogram, and prints the flat cluster label
# assigned to each document when the tree is cut at CUT_DISTANCE.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, set_link_color_palette

np.set_printoptions(precision=2)

# Linkage distance at which the tree is cut. The same value drives both the
# dendrogram colouring and the flat cluster assignment so the plot and the
# printed labels always agree.
CUT_DISTANCE = 2.1

docs = np.array([
    # Array of documents; each document is one string whose morphemes are
    # separated by spaces.
])

# Clustering needs at least two observations; fail early with a clear message
# instead of an unrelated error from the vectorizer or the linkage routine.
if len(docs) < 2:
    raise ValueError(
        'docs must contain at least two documents to build a linkage tree'
    )

# 1. Vectorize the documents. linkage() works on a dense array, so the sparse
#    TF-IDF matrix is converted with toarray().
vectorizer = TfidfVectorizer(use_idf=True)
vecs = vectorizer.fit_transform(docs).toarray()

# 2. Build the hierarchical clustering tree with Ward's method.
z = linkage(vecs, method='ward')

# 3. Configure the figure. Font settings are applied before any text is
#    created so that every label picks them up.
plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['font.size'] = 10
plt.figure(figsize=(100, 50))  # size is in inches: a very large canvas
plt.title('Hierarchical Clustering Dendrogram', fontsize=20)
plt.xlabel('Observation Points', fontsize=10)
plt.ylabel('Distance', fontsize=10)

# 4. Draw the dendrogram. The palette below is cycled through for the
#    clusters that form below the colour threshold.
set_link_color_palette(['purple', 'lawngreen', 'green', 'blue', 'orange', 'red'])
dendrogram(
    z,
    leaf_font_size=10,               # font size of the leaf (x-axis) labels
    color_threshold=CUT_DISTANCE,    # subtrees merged below this distance share a colour
    above_threshold_color='black',   # links at or above the threshold are drawn in black
)

# 5. Print the flat cluster label of every document.
group = fcluster(z, CUT_DISTANCE, criterion='distance')  # cut by linkage distance
# To request a fixed number of clusters instead, use criterion='maxclust',
# e.g. fcluster(z, 10, criterion='maxclust').
print(group)

plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment