Skip to content

Instantly share code, notes, and snippets.

@yuku
Created June 22, 2011 15:40
Show Gist options
  • Star 10 You must be signed in to star a gist
  • Fork 11 You must be signed in to fork a gist
  • Save yuku/1040366 to your computer and use it in GitHub Desktop.
Save yuku/1040366 to your computer and use it in GitHub Desktop.
gensimに日本語Wikipediaを取り込むためのスクリプト
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import sys
import os.path
import bz2
from gensim.corpora import WikiCorpus
from gensim.corpora.wikicorpus import filterWiki
import MeCab
# Module-level logger; INFO level so that the corpus statistics logged at the
# end of an iteration over the dump are emitted.
logger = logging.getLogger('jawikicorpus')
logger.setLevel(logging.INFO)
# Single shared MeCab tagger (default settings) used by jatokenize().
tagger = MeCab.Tagger()
# Value passed as keep_words when no third command-line argument is given.
DEFAULT_DICT_SIZE = 100000
# Articles whose filtered text is not longer than this many characters are
# skipped during iteration.
ARTICLE_MIN_CHARS = 500
def jatokenize(text):
    """Yield the lowercased surface form of every noun MeCab finds in *text*.

    text -- string to analyse; it is encoded to UTF-8 before being handed to
            the module-level MeCab ``tagger``.

    Only nodes whose first comma-separated feature field equals the noun
    part-of-speech tag are yielded; every other node is skipped.
    """
    # Keep the encoded buffer bound to a local for the whole traversal.
    # MeCab nodes reference the input buffer instead of copying it, so passing
    # an unnamed temporary lets it be collected while we are still walking
    # the node list, which can corrupt node.surface.
    encoded = text.encode('utf-8')
    # .next skips the first node of the result (presumably MeCab's
    # beginning-of-sentence marker, which carries no token).
    node = tagger.parseToNode(encoded).next
    while node:
        if node.feature.split(',')[0] == '名詞':
            yield node.surface.lower()
        node = node.next
def tokenize(content):
    """Return the tokens jatokenize() yields for *content* as a list.

    Tokens that begin with an underscore are left out.
    """
    kept = []
    for word in jatokenize(content):
        if word.startswith('_'):
            continue
        kept.append(word)
    return kept
class JaWikiCorpus(WikiCorpus):
    """WikiCorpus variant whose articles are tokenized with the module's
    MeCab-based ``tokenize`` function."""

    def getArticles(self, return_raw=False):
        """Iterate over the articles of the bzip2-compressed dump ``self.fname``.

        return_raw -- when true, yield each article's filtered text; otherwise
                      yield the list of tokens produced by ``tokenize``.

        Articles whose filtered text is not longer than ARTICLE_MIN_CHARS are
        skipped. Once the dump has been read to the end, the totals are logged
        and the number of yielded articles is stored in ``self.numDocs``.
        """
        articles, articles_all = 0, 0
        positions = 0
        intext = False
        # Start with an empty buffer so that a stray '</text>' seen before any
        # '<text' hits the `if not lines` guard instead of raising NameError.
        lines = []
        dump = bz2.BZ2File(self.fname)
        try:
            for line in dump:
                # Match the opening tag whatever its indentation; comparing
                # against a fixed number of leading spaces misses the tag as
                # soon as the dump is indented differently.
                if line.lstrip().startswith('<text'):
                    intext = True
                    line = line[line.find('>') + 1:]
                    lines = [line]
                elif intext:
                    lines.append(line)
                pos = line.find('</text>')  # can be on the same line as <text>
                if pos < 0:
                    continue
                articles_all += 1
                intext = False
                if not lines:
                    continue
                lines[-1] = line[:pos]
                text = filterWiki(''.join(lines))
                # Drop the finished article so that it cannot be emitted a
                # second time by an unmatched closing tag further down.
                lines = []
                if len(text) > ARTICLE_MIN_CHARS:  # article redirects are pruned here
                    articles += 1
                    if return_raw:
                        result = text
                    else:
                        result = tokenize(text)  # text into tokens here
                        positions += len(result)
                    yield result
        finally:
            # Runs on exhaustion, on error and when the generator is closed
            # early, so the file handle is never leaked.
            dump.close()

        logger.info("finished iterating over Wikipedia corpus of %i documents with %i positions"
                    " (total %i articles before pruning)",
                    articles, positions, articles_all)
        self.numDocs = articles  # cache corpus length
if __name__ == '__main__':
    # Command-line entry point:
    #   1. convert the Wikipedia dump into a bag-of-words corpus on disk,
    #   2. reload that corpus and write a TF-IDF weighted copy next to it.
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running %s" % ' '.join(sys.argv))

    program = os.path.basename(sys.argv[0])
    if len(sys.argv) < 3:
        # This module has no docstring, so formatting globals()['__doc__']
        # raised TypeError (None % dict) instead of showing any help.
        # Print an explicit usage line instead.
        print("Usage: %s WIKI_XML_DUMP.bz2 OUTPUT_PREFIX [VOCABULARY_SIZE]" % program)
        sys.exit(1)

    # Named so that the builtin input() is not shadowed.
    dump_path, output_prefix = sys.argv[1:3]
    # Optional third argument overrides the default vocabulary size.
    keep_words = int(sys.argv[3]) if len(sys.argv) > 3 else DEFAULT_DICT_SIZE

    wiki = JaWikiCorpus(dump_path, keep_words=keep_words)
    wiki.saveAsText(output_prefix)
    # Drop the corpus object before the saved files are loaded again.
    del wiki

    # Reload the dictionary and bag-of-words matrix (presumably the files
    # written by saveAsText above) and store a TF-IDF transformed corpus.
    from gensim.corpora import MmCorpus
    id2token = JaWikiCorpus.loadDictionary(output_prefix + '_wordids.txt')
    mm = MmCorpus(output_prefix + '_bow.mm')

    from gensim.models import TfidfModel
    tfidf = TfidfModel(mm, id2word=id2token, normalize=True)
    MmCorpus.saveCorpus(output_prefix + '_tfidf.mm', tfidf[mm], progressCnt=10000)

    logging.info("finished running %s" % program)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment