Skip to content

Instantly share code, notes, and snippets.

@tos-kamiya
Created April 3, 2016 08:55
Show Gist options
  • Save tos-kamiya/16ac33386c6b22be8f04a5eab366a148 to your computer and use it in GitHub Desktop.
Save tos-kamiya/16ac33386c6b22be8f04a5eab366a148 to your computer and use it in GitHub Desktop.
習作: MeCabで分かち書きしてScikit-Learnでk-means法によるクラスタリング
# coding: utf-8
# ref: http://tt-house.com/2014/11/scikit-learn-text-clustering-python.html
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
import MeCab
# Shared tagger used when the caller does not pass one; created on first use
# so importing this module does not construct a MeCab.Tagger.
_DEFAULT_TAGGER = None


def _get_default_tagger():
    """Return the shared ``MeCab.Tagger("-Ochasen")``, creating it once."""
    global _DEFAULT_TAGGER
    if _DEFAULT_TAGGER is None:
        _DEFAULT_TAGGER = MeCab.Tagger("-Ochasen")
    return _DEFAULT_TAGGER


def to_wakachigaki(text, tagger=None):
    """Split *text* into words with MeCab and join them with single spaces.

    Args:
        text: The sentence to segment.
        tagger: Optional object providing ``parseToNode(text)``. When omitted,
            one shared ``MeCab.Tagger("-Ochasen")`` is reused across calls
            instead of constructing a new tagger for every sentence.

    Returns:
        The surface forms of the nodes produced by ``parseToNode``, in order,
        separated by a single space. Nodes whose surface is empty are skipped,
        so the result is ``''`` when no node has a surface.
    """
    if tagger is None:
        tagger = _get_default_tagger()

    surfaces = []
    node = tagger.parseToNode(text)
    # Walk the linked list of nodes until ``next`` yields a falsy value.
    while node:
        if node.surface:
            surfaces.append(node.surface)
        node = node.next
    return ' '.join(surfaces)
def main(n_components=4, n_clusters=4):
    """Cluster a few Japanese sentences with TF-IDF, LSA and k-means.

    Every sample sentence is segmented with ``to_wakachigaki``, vectorized
    with ``TfidfVectorizer``, reduced with ``TruncatedSVD``, rescaled with
    ``Normalizer`` and clustered with ``KMeans``. The cluster label assigned
    to each sentence is printed.

    Args:
        n_components: Number of dimensions kept by the ``TruncatedSVD`` step.
        n_clusters: Number of clusters requested from ``KMeans``. It is passed
            explicitly; without it ``KMeans`` uses its own default instead of
            the intended number of groups.
    """
    # Input texts to be clustered.
    sentences = [
        '私負けましたわ。',
        '隣の客はよく柿食う客だ。',
        '庭には二羽鶏がいる。',
        "私の名前はボブです。",
        "こんにちは世界。",
        "私の名前はアリスです。",
        "ガルパンはいいぞ。",
        "京都市において、雷注意報が発令されました。",
        "私の名前はゴエモンです。",
    ]
    documents = [to_wakachigaki(sentence) for sentence in sentences]

    # NOTE(review): TfidfVectorizer's default token_pattern keeps only tokens
    # of two or more word characters, so one-character words are dropped from
    # the vocabulary -- confirm this is acceptable for Japanese input.
    vectorizer = TfidfVectorizer(use_idf=True)
    features = vectorizer.fit_transform(documents)

    # Dimensionality of the LSA space (this is NOT the number of clusters).
    lsa = TruncatedSVD(n_components)
    features = lsa.fit_transform(features)
    features = Normalizer(copy=False).fit_transform(features)

    km = KMeans(
        n_clusters=n_clusters,  # number of groups to form
        init='k-means++',
    )
    km.fit(features)
    print(km.labels_)
# Run the clustering demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment