Instantly share code, notes, and snippets.

Embed
What would you like to do?
gensimに日本語Wikipediaを取り込むためのスクリプト
# coding: utf-8
"""USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX
"""
import logging
import os.path
import sys
import gensim.corpora.wikicorpus as wikicorpus
from gensim.corpora import Dictionary, MmCorpus, WikiCorpus
from gensim.models import TfidfModel
from gensim.utils import to_unicode
import MeCab
# Vocabulary pruning: the dump is first scanned for all distinct word types
# (~7M). Types that appear in more than 10% of articles are removed and, from
# the rest, only the DEFAULT_DICT_SIZE most frequent types are kept.
DEFAULT_DICT_SIZE = 100000
# Module-level MeCab tagger shared by every tokenize_ja() call.
tagger = MeCab.Tagger()
# Parse an empty string once up front; the result is discarded.
# NOTE(review): presumably the usual mecab-python3 workaround that keeps
# parseToNode() surfaces from coming back garbled -- confirm before removing.
tagger.parse('')
def tokenize_ja(text):
    """Yield the lower-cased surface form of each noun MeCab finds in `text`.

    `text` is first converted with `to_unicode` (UTF-8, undecodable bytes
    ignored) and then walked node by node through the module-level `tagger`.
    Only nodes whose first comma-separated feature field equals '名詞'
    ("noun") are yielded; every other node is skipped.
    """
    decoded = to_unicode(text, encoding='utf8', errors='ignore')
    morpheme = tagger.parseToNode(decoded)
    while morpheme:
        leading_feature = morpheme.feature.split(',', 1)[0]
        if leading_feature == '名詞':
            yield morpheme.surface.lower()
        # advance along MeCab's linked list of nodes
        morpheme = morpheme.next
def tokenize(content, token_min_len=2, token_max_len=15, lower=True):
    """Tokenize `content` into a list of Japanese noun tokens.

    Tokens come from `tokenize_ja` and are kept only when their length lies
    in [token_min_len, token_max_len] and they do not start with '_'.

    Parameters
    ----------
    content : str or bytes
        Text to tokenize.
    token_min_len : int, optional
        Minimum token length to keep (default 2, the previous fixed value).
    token_max_len : int, optional
        Maximum token length to keep (default 15, the previous fixed value).
    lower : bool, optional
        Accepted so the function can be called with the four-argument
        ``tokenizer_func(text, token_min_len, token_max_len, lower)``
        interface; it has no effect here because `tokenize_ja` always
        yields lower-cased tokens.

    Returns
    -------
    list
        The surviving tokens, each passed through `to_unicode`.
    """
    return [
        to_unicode(token) for token in tokenize_ja(content)
        if token_min_len <= len(token) <= token_max_len
        and not token.startswith('_')
    ]
if __name__ == '__main__':
    # Adapted from gensim's make_wikicorpus script:
    # https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/scripts/make_wikicorpus.py
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    # Both WIKI_XML_DUMP and OUTPUT_PREFIX are required; otherwise print the
    # module docstring (the usage text) and exit with a non-zero status.
    if len(sys.argv) < 3:
        print(__doc__ % locals())
        sys.exit(1)
    src, dst = sys.argv[1:3]

    # Replace gensim's tokenizer with the MeCab-based one before the corpus
    # is constructed.
    # NOTE(review): on gensim releases where WikiCorpus accepts a
    # `tokenizer_func` argument this module-level patch may not take effect --
    # confirm against the installed version.
    wikicorpus.tokenize = tokenize
    wiki = WikiCorpus(src)

    # Prune the vocabulary down to the most frequent words.
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)

    # Persist the bag-of-words corpus (term-document frequency matrix) and
    # the dictionary.
    bow_path = dst + '_bow.mm'
    wordids_path = dst + '_wordids.txt.bz2'
    MmCorpus.serialize(bow_path, wiki, progress_cnt=10000, metadata=True)
    wiki.dictionary.save_as_text(wordids_path)

    # Reload the id->word mapping from disk and drop the WikiCorpus object;
    # this seems to use less memory than keeping wiki.dictionary around.
    dictionary = Dictionary.load_from_text(wordids_path)
    del wiki

    # Re-open the serialized corpus and fit the TF-IDF model on it.
    mm = MmCorpus(bow_path)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(dst + '.tfidf_model')

    # Store the TF-IDF vectors in Matrix Market format as well.
    MmCorpus.serialize(dst + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info('finished running %s', program)
@takuti

This comment has been minimized.

Owner

takuti commented Jul 21, 2017

# Usage example: print the dominant LDA topic for a few article titles.
# NOTE(review): `dst`, `prefix`, `LdaModel`, `MmCorpus` and `unpickle` are not
# defined in this snippet -- presumably `dst` is the path of a saved LDA model
# and `prefix` is the OUTPUT_PREFIX used when building the corpus; confirm.
lda = LdaModel.load(dst)

# load the TF-IDF vectors of all Wikipedia articles
tfidf = MmCorpus(prefix + '_tfidf.mm')

# build a title -> document index mapping from the pickled corpus metadata
docno2metadata = unpickle(prefix + '_bow.mm.metadata.cpickle')
title2docno = {tup_title[1]: int(docno) for docno, tup_title in docno2metadata.items()}

# The third title was an empty string; the printed results list '海' in that
# position, so it is restored here.
titles = ['ビール', 'カブトムシ', '海', '夏祭り']
for title in titles:
    topics = lda[tfidf[title2docno[title]]]
    # keep the id of the topic with the highest probability for this article
    topic = max(topics, key=lambda t: t[1])[0]
    print('=== %s (topic %d) ===' % (title, topic))
    # show the ten highest-weighted words of that topic
    for word, p_word in lda.show_topic(topic, topn=10):
        print('%.5f\t%s' % (p_word, word))
=== ビール (topic 99) ===
0.04528 植物
0.02466 料理
0.02348 栽培
0.01843 品種
0.01610 ビール
0.01584 醸造
0.01410 ワイン
0.01373 kt
0.01318 生産
0.01272 農業
=== カブトムシ (topic 46) ===
0.00462 顕微鏡
0.00352 地震
0.00339 '''()
0.00303 障害
0.00268 生育
0.00248 哲学
0.00238 発生
0.00236 意味
0.00230 効果
0.00224 患者
=== 海 (topic 32) ===
0.02139 フェリー
0.01960 航路
0.01791 就航
0.01597 運航
0.01130 建造
0.01113 船舶
0.00976 諸島
0.00939 海洋
0.00835 造船
0.00803 ハワイ
=== 夏祭り (topic 62) ===
0.01825 寺院
0.01744 日蓮宗
0.01113 神社
0.00987 文化財
0.00772 大字
0.00706 古墳
0.00676 共編
0.00670 辞典
0.00647 学区
0.00625 角川
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment