# -*- coding: utf-8 -*-
import os
from os.path import join, dirname
import MeCab
import gensim
import numpy as np
from sklearn.cluster import KMeans
labels = [
    'birth',    # birth rate up for the first time in five years
    'ekiden',   # New Year ekiden (road relay race)
    'tunnel',   # mortar peeling off in the Kimitsu tunnel
    'ikukyu',   # Diet member taking childcare leave
    'fe',       # correction of hijiki's iron content in the Standard Tables of Food Composition in Japan
    'takahama', # Takahama nuclear power plant
    'thief',    # arrest of Kenichi Takahashi of King of Comedy
    'starwars', # Star Wars (The Force Awakens)
    'design',   # design of the new National Stadium
    'riken',    # RIKEN winning naming rights for a new element
]
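
# Note: these labels are only used to set the number of clusters below;
# the clustering itself is unsupervised and never sees them.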
num_topics = len(labels)
# Tokenizer backed by the mecab-ipadic-neologd dictionary (the path is install-specific)
m = MeCab.Tagger(' -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')
# m = MeCab.Tagger('')  # fallback: MeCab's default dictionary
article_path = join(dirname(__file__), 'articles')
for root, dirs, files in os.walk(article_path):
    print '# MORPHOLOGICAL ANALYSIS'
    docs = {}
    for docname in files:
        docs[docname] = []
        f = open(join(article_path, docname))
        lines = f.readlines()
        for text in lines:
            res = m.parseToNode(text)
            while res:
                arr = res.feature.split(",")
                word = arr[6]  # 7th CSV field: the token's base form (lemma)
                docs[docname].append(word)
                res = res.next
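
    # For reference, res.feature is a CSV string; with ipadic-style
    # dictionaries the 7th field is the lemma. An illustrative example
    # (not from a real run):
    #   動詞,自立,*,*,五段・ラ行,基本形,走る,ハシル,ハシル  ->  走る
    # BOS/EOS nodes and unknown words carry '*' in that field; since '*'
    # then appears in every document, filter_extremes below drops it.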
    # Keep tokens that appear in at least 2 documents but in no more than
    # 10% of all documents, trimming both rare noise and ubiquitous words.
    dct = gensim.corpora.Dictionary(docs.values())
    dct.filter_extremes(no_below=2, no_above=0.1)
    filtered = dct.token2id.keys()
    print 'number of features', len(filtered)
    # for key in filtered:
    #     print key
print "# BAG OF WORDS"
bow_docs = {}
for docname in files:
bow_docs[docname] = dct.doc2bow(docs[docname])
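
    # doc2bow returns a sparse vector of (token_id, count) pairs, e.g.
    # [(12, 3), (40, 1), ...] (ids here are illustrative, not from a real run).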
    print '# LSI Model'
    # Project the bag-of-words vectors into a latent space slightly larger
    # than the number of expected clusters.
    dimension = num_topics + 3
    lsi_model = gensim.models.LsiModel(bow_docs.values(), num_topics=dimension)
    lsi_docs = {}
    for i, docname in enumerate(files):
        vec = bow_docs[docname]
        lsi_docs[i] = lsi_model[vec]
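
    # To eyeball what each latent dimension picked up, gensim's LsiModel
    # offers show_topics(); a sketch (assuming this gensim version exposes
    # the num_topics/num_words keywords):
    #   for topic in lsi_model.show_topics(num_topics=dimension, num_words=5):
    #       print topic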
    def vec2dense(vec, num_terms):
        # Convert a sparse gensim vector into a plain dense list of floats.
        return list(gensim.matutils.corpus2dense([vec], num_terms=num_terms).T[0])
    print '# Clustering'
    data_all = [vec2dense(lsi_docs[i], dimension) for i, docname in enumerate(files)]
    # Normalize to unit length so that k-means on Euclidean distance behaves
    # like clustering on cosine similarity.
    normalized = [vec / np.linalg.norm(vec) for vec in data_all]
    result = KMeans(n_clusters=num_topics).fit_predict(normalized)
    for i, docname in enumerate(files):
        print docname, 'cluster', result[i]
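
    # A possible follow-up (hypothetical: assumes each article filename starts
    # with its topic label, e.g. 'birth-1.txt'): score the clustering against
    # the intended topics with the adjusted Rand index.
    #   from sklearn.metrics import adjusted_rand_score
    #   truth = [next(l for l in labels if docname.startswith(l)) for docname in files]
    #   print 'ARI', adjusted_rand_score(truth, result)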