Last active
January 2, 2016 13:15
-
-
Save satzz/009f60f9a304cd81de29 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import os | |
from os.path import join, dirname | |
import MeCab | |
import gensim | |
import numpy as np | |
from sklearn.cluster import KMeans | |
# One label per news topic in the test corpus; each comment names the
# story the label refers to.
labels = [
    'birth',     # birth rate rose for the first time in 5 years
    'ekiden',    # New Year ekiden (corporate relay race) coverage
    'tunnel',    # mortar peeling in the Kimitsu tunnel
    'ikukyu',    # Diet member taking childcare leave
    'fe',        # correction of hijiki iron content in the Standard Tables of Food Composition
    'takahama',  # Takahama nuclear plant stories
    'thief',     # arrest of Kenichi Takahashi of King of Comedy
    'starwars',  # Star Wars (The Force Awakens) coverage
    'design',    # National Stadium design controversy
    'riken',     # RIKEN winning naming rights for a new element
]
# Cluster count for K-means: one cluster per labelled topic.
num_topics = len(labels)
# MeCab tagger backed by the neologd dictionary (better coverage of
# recent proper nouns). NOTE(review): swap the argument for '' to use
# the stock IPA dictionary instead.
m = MeCab.Tagger(' -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')

# Articles are expected in an 'articles' directory next to this script.
article_path = join(dirname(__file__), 'articles')
for root, dirs, files in os.walk(article_path): | |
print '# MORPHOLOGICAL ANALYSIS' | |
docs = {} | |
for docname in files: | |
docs[docname] = [] | |
f = open(join(article_path,docname)) | |
lines = f.readlines() | |
for text in lines: | |
res = m.parseToNode(text) | |
while res: | |
arr = res.feature.split(",") | |
word = arr[6] | |
docs[docname].append(word) | |
res = res.next | |
dct = gensim.corpora.Dictionary(docs.values()) | |
dct.filter_extremes(no_below=2, no_above=0.1) | |
filtered = dct.token2id.keys() | |
print 'number of features', len(filtered) | |
# for key in filtered: | |
# print key | |
print "# BAG OF WORDS" | |
bow_docs = {} | |
for docname in files: | |
bow_docs[docname] = dct.doc2bow(docs[docname]) | |
print '# LSI Model' | |
dimension = num_topics+3 | |
lsi_model = gensim.models.LsiModel(bow_docs.values(), num_topics=dimension) | |
lsi_docs = {} | |
for i, docname in enumerate(files): | |
vec = bow_docs[docname] | |
lsi_docs[i] = lsi_model[vec] | |
def vec2dense(vec, num_terms): | |
return list(gensim.matutils.corpus2dense([vec], num_terms=num_terms).T[0]) | |
print '# Clustering' | |
data_all = [vec2dense(lsi_docs[i],dimension) for i, docname in enumerate(files)] | |
normalized = [vec/np.linalg.norm(vec) for vec in data_all] | |
result = KMeans(n_clusters=num_topics).fit_predict(normalized) | |
for i,docname in enumerate(files): | |
print docname,'cluster',result[i] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment