Skip to content

Instantly share code, notes, and snippets.

@uiur
Created November 7, 2015 06:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save uiur/7863c035aef576f39a29 to your computer and use it in GitHub Desktop.
Save uiur/7863c035aef576f39a29 to your computer and use it in GitHub Desktop.
LDA topic modeling of the livedoor news corpus
# coding: utf-8
import glob
import MeCab
import gensim
from gensim import corpora, matutils
import numpy as np
import codecs
import sys
# Route everything written to stdout through a UTF-8 encoding writer.
sys.stdout = codecs.getwriter('utf_8')(sys.stdout)

# Shared MeCab tagger; the -d option selects the dictionary directory.
mecab = MeCab.Tagger('mecabrc -d /usr/lib/mecab/dic/mecab-ipadic-neologd')

# Stop words, one per line of ./stopwords.txt. The file is read inside a
# ``with`` block so the handle is closed as soon as the list is built.
# NOTE: splitting on "\n" keeps an empty string as the last element when the
# file ends with a newline.
with open('./stopwords.txt') as stopwords_file:
    stopwords = stopwords_file.read().split("\n")
def tokenize(text):
    """Split ``text`` into a list of surface-form tokens.

    The text is analysed with the module-level ``mecab`` tagger and the
    resulting node list is walked from first to last. A node's surface is
    kept unless one of the following holds:

    * the first feature field is '記号' ("symbol"),
    * the second feature field is '数' ("number"),
    * the surface is an empty string,
    * the surface appears in the module-level ``stopwords``.

    Returns the kept surfaces in their original order.
    """
    words = []
    node = mecab.parseToNode(text)
    while node:
        surface = node.surface
        feature = node.feature.split(',')
        # Advance first so every skip below can simply ``continue``.
        node = node.next

        # Sentence-boundary (BOS/EOS) nodes have an empty surface; without
        # this check they would leak into the result as '' tokens.
        if not surface:
            continue
        # Presumably IPADIC part-of-speech fields -- drop symbols and numbers.
        if feature[0] == '記号' or feature[1] == '数':
            continue
        if surface in stopwords:
            continue
        words.append(surface)
    return words
# Load every text file one directory level below ./data, dropping the first
# two lines of each file (presumably per-article header lines -- confirm
# against the corpus layout). Each file is read inside ``with`` so its handle
# is closed before the next one is opened.
documents = []
for path in glob.glob('./data/**/*.txt'):
    with open(path) as document_file:
        documents.append("\n".join(document_file.read().split("\n")[2:]))

# One token list per document.
texts = [tokenize(document) for document in documents]

# Build the token <-> id mapping, persist it, and read it back from disk.
dictionary = corpora.Dictionary(texts)
dictionary.save('./test.dict')
dictionary = corpora.Dictionary.load('./test.dict')

# Bag-of-words representation of every document, also written to disk in
# Matrix Market format.
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('./test.mm', corpus)

# Fit a 10-topic LDA model and print the second element of every entry
# returned by show_topics(-1).
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)
for topic in lda.show_topics(-1):
    # Parenthesised single argument: parses identically as a Python 2 print
    # statement and as a Python 3 print() call.
    print(topic[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment