Created
January 3, 2016 02:52
-
-
Save satzz/1e2b43999fd1a63977db to your computer and use it in GitHub Desktop.
k-means法で文書のクラスタリング ref: http://qiita.com/satzz/items/a3a3986750c52fd360d0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from sklearn.cluster import KMeans |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Morphological analyser used to tokenise the documents.
# The `-d` option selects the dictionary directory; this path points at a
# local mecab-ipadic-neologd install and must exist on the machine running
# the script.
m = MeCab.Tagger(' -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Single-argument print(...) emits the same text under Python 2 and Python 3.
print('# LSI Model')

# The LSI space is given three more dimensions than the number of clusters
# requested from k-means later on.
dimension = num_topics + 3

# Train on every bag-of-words vector. A concrete list (rather than a dict
# view) behaves the same under Python 2 and Python 3.
lsi_model = gensim.models.LsiModel(list(bow_docs.values()), num_topics=dimension)

# Project each document into the LSI space, keyed by its position in `files`
# so later stages can look documents up by index.
lsi_docs = {
    i: lsi_model[bow_docs[docname]]
    for i, docname in enumerate(files)
}
def vec2dense(vec, num_terms):
    """Return a single sparse vector as a dense list of length ``num_terms``.

    ``vec`` is wrapped in a one-document corpus and handed to
    ``gensim.matutils.corpus2dense``; the resulting matrix is transposed so
    that its first row is that document, and the row is returned as a plain
    list.

    Args:
        vec: One document in the sparse form accepted by ``corpus2dense``
            (presumably a sequence of ``(index, value)`` pairs -- confirm
            against the gensim documentation).
        num_terms: Dimensionality of the dense output vector.

    Returns:
        A list with ``num_terms`` numeric entries.
    """
    dense_matrix = gensim.matutils.corpus2dense([vec], num_terms=num_terms)
    return list(dense_matrix.T[0])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
print('# Clustering')

# Densify each document's LSI vector, in the same order as `files`.
data_all = [vec2dense(lsi_docs[i], dimension) for i, _ in enumerate(files)]

# Scale every vector to unit length before clustering. An all-zero vector is
# left untouched: dividing by its zero norm would produce NaNs, which KMeans
# rejects.
normalized = []
for vec in data_all:
    dense = np.asarray(vec)
    norm = np.linalg.norm(dense)
    normalized.append(dense / norm if norm > 0 else dense)

# One cluster label per document, aligned with `files`.
# NOTE(review): no random_state is passed, so the label numbering may differ
# between runs.
result = KMeans(n_clusters=num_topics).fit_predict(normalized)

# %-formatting produces the same "name cluster N" line under Python 2 and 3.
for docname, label in zip(files, result):
    print('%s cluster %s' % (docname, label))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# MORPHOLOGICAL ANALYSIS | |
number of features 1561 | |
# BAG OF WORDS | |
# LSI Model | |
# Clustering | |
birth-1 cluster 3 | |
birth-2 cluster 3 | |
birth-3 cluster 3 | |
birth-4 cluster 3 | |
birth-5 cluster 3 | |
birth-6 cluster 3 | |
birth-7 cluster 3 | |
birth-8 cluster 3 | |
design-1 cluster 2 | |
design-2 cluster 2 | |
design-3 cluster 2 | |
design-4 cluster 2 | |
design-5 cluster 2 | |
design-6 cluster 2 | |
design-7 cluster 2 | |
design-8 cluster 2 | |
ekiden-1 cluster 6 | |
ekiden-2 cluster 6 | |
ekiden-3 cluster 6 | |
ekiden-4 cluster 6 | |
ekiden-5 cluster 6 | |
ekiden-6 cluster 6 | |
ekiden-7 cluster 6 | |
ekiden-8 cluster 6 | |
fe-1 cluster 5 | |
fe-2 cluster 5 | |
fe-3 cluster 5 | |
fe-4 cluster 5 | |
fe-5 cluster 5 | |
fe-6 cluster 5 | |
fe-7 cluster 5 | |
fe-8 cluster 5 | |
ikukyu-1 cluster 9 | |
ikukyu-2 cluster 9 | |
ikukyu-3 cluster 9 | |
ikukyu-4 cluster 9 | |
ikukyu-5 cluster 9 | |
ikukyu-6 cluster 9 | |
ikukyu-7 cluster 9 | |
ikukyu-8 cluster 9 | |
riken-1 cluster 4 | |
riken-2 cluster 4 | |
riken-3 cluster 4 | |
riken-4 cluster 4 | |
riken-5 cluster 4 | |
riken-6 cluster 4 | |
riken-7 cluster 4 | |
riken-8 cluster 4 | |
starwars-1 cluster 1 | |
starwars-2 cluster 1 | |
starwars-3 cluster 1 | |
starwars-4 cluster 1 | |
starwars-5 cluster 1 | |
starwars-6 cluster 1 | |
starwars-7 cluster 1 | |
starwars-8 cluster 1 | |
takahama-1 cluster 7 | |
takahama-2 cluster 7 | |
takahama-3 cluster 7 | |
takahama-4 cluster 7 | |
takahama-5 cluster 7 | |
takahama-6 cluster 7 | |
takahama-7 cluster 7 | |
takahama-8 cluster 7 | |
thief-1 cluster 8 | |
thief-2 cluster 8 | |
thief-3 cluster 8 | |
thief-4 cluster 8 | |
thief-5 cluster 8 | |
thief-6 cluster 8 | |
thief-7 cluster 8 | |
thief-8 cluster 8 | |
tunnel-1 cluster 0 | |
tunnel-2 cluster 0 | |
tunnel-3 cluster 0 | |
tunnel-4 cluster 0 | |
tunnel-5 cluster 0 | |
tunnel-6 cluster 0 | |
tunnel-7 cluster 0 | |
tunnel-8 cluster 0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment