Skip to content

Instantly share code, notes, and snippets.

@satzz
Created January 3, 2016 02:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save satzz/1e2b43999fd1a63977db to your computer and use it in GitHub Desktop.
Save satzz/1e2b43999fd1a63977db to your computer and use it in GitHub Desktop.
k-means法で文書のクラスタリング ref: http://qiita.com/satzz/items/a3a3986750c52fd360d0
import numpy as np
from sklearn.cluster import KMeans
# MeCab tagger configured with the mecab-ipadic-neologd dictionary directory.
m = MeCab.Tagger(' -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('# LSI Model')
# The LSI space gets three more dimensions than the number of clusters
# (num_topics is also what KMeans is asked for later in the script).
dimension = num_topics + 3
lsi_model = gensim.models.LsiModel(bow_docs.values(), num_topics=dimension)

# Project every document's bag-of-words vector into the LSI space, keyed by
# the document's position in `files`.
lsi_docs = {}
for i, docname in enumerate(files):
    vec = bow_docs[docname]
    lsi_docs[i] = lsi_model[vec]
def vec2dense(vec, num_terms):
    """Return the sparse vector *vec* as a dense list of *num_terms* values.

    *vec* is wrapped in a one-document corpus so that
    ``gensim.matutils.corpus2dense`` can expand it; the single resulting
    column is then returned as a plain list.
    """
    dense_matrix = gensim.matutils.corpus2dense([vec], num_terms=num_terms)
    # The matrix has one column per document; only one was passed in.
    only_column = dense_matrix[:, 0]
    return list(only_column)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('# Clustering')

# One dense LSI vector per document, in the same order as `files`.
data_all = [vec2dense(lsi_docs[i], dimension) for i, _ in enumerate(files)]

# Scale every vector to unit length before clustering.
normalized = []
for vec in data_all:
    arr = np.asarray(vec)
    norm = np.linalg.norm(arr)
    # An all-zero vector would turn into NaNs when divided by its zero norm,
    # and KMeans refuses NaN input; leave such vectors unscaled instead.
    normalized.append(arr / norm if norm > 0 else arr)

result = KMeans(n_clusters=num_topics).fit_predict(normalized)

# Report the cluster label assigned to each document.
for docname, label in zip(files, result):
    # %-formatting yields the same "name cluster N" line on Python 2 and 3.
    print('%s cluster %s' % (docname, label))
# MORPHOLOGICAL ANALYSIS
number of features 1561
# BAG OF WORDS
# LSI Model
# Clustering
birth-1 cluster 3
birth-2 cluster 3
birth-3 cluster 3
birth-4 cluster 3
birth-5 cluster 3
birth-6 cluster 3
birth-7 cluster 3
birth-8 cluster 3
design-1 cluster 2
design-2 cluster 2
design-3 cluster 2
design-4 cluster 2
design-5 cluster 2
design-6 cluster 2
design-7 cluster 2
design-8 cluster 2
ekiden-1 cluster 6
ekiden-2 cluster 6
ekiden-3 cluster 6
ekiden-4 cluster 6
ekiden-5 cluster 6
ekiden-6 cluster 6
ekiden-7 cluster 6
ekiden-8 cluster 6
fe-1 cluster 5
fe-2 cluster 5
fe-3 cluster 5
fe-4 cluster 5
fe-5 cluster 5
fe-6 cluster 5
fe-7 cluster 5
fe-8 cluster 5
ikukyu-1 cluster 9
ikukyu-2 cluster 9
ikukyu-3 cluster 9
ikukyu-4 cluster 9
ikukyu-5 cluster 9
ikukyu-6 cluster 9
ikukyu-7 cluster 9
ikukyu-8 cluster 9
riken-1 cluster 4
riken-2 cluster 4
riken-3 cluster 4
riken-4 cluster 4
riken-5 cluster 4
riken-6 cluster 4
riken-7 cluster 4
riken-8 cluster 4
starwars-1 cluster 1
starwars-2 cluster 1
starwars-3 cluster 1
starwars-4 cluster 1
starwars-5 cluster 1
starwars-6 cluster 1
starwars-7 cluster 1
starwars-8 cluster 1
takahama-1 cluster 7
takahama-2 cluster 7
takahama-3 cluster 7
takahama-4 cluster 7
takahama-5 cluster 7
takahama-6 cluster 7
takahama-7 cluster 7
takahama-8 cluster 7
thief-1 cluster 8
thief-2 cluster 8
thief-3 cluster 8
thief-4 cluster 8
thief-5 cluster 8
thief-6 cluster 8
thief-7 cluster 8
thief-8 cluster 8
tunnel-1 cluster 0
tunnel-2 cluster 0
tunnel-3 cluster 0
tunnel-4 cluster 0
tunnel-5 cluster 0
tunnel-6 cluster 0
tunnel-7 cluster 0
tunnel-8 cluster 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment