Processing the Japanese Wikipedia with gensim and MeCab
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Copyright (C) 2012 Lars Buitinck <larsmans@gmail.com>
# Copyright (C) 2018 Shinsuke Nagasawa <nagasawa@barbwire.co.jp>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
""" | |
USAGE: python make_wikicorpus_ja.py WIKI_XML_DUMP OUTPUT_PREFIX | |
Convert articles from a Wikipedia dump to (sparse) vectors. The input is a | |
bz2-compressed dump of Wikipedia articles, in XML format. | |
This actually creates three files: | |
* `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids | |
* `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in | |
Matrix Matrix format | |
* `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation | |
* `OUTPUT_PREFIX.tfidf_model`: TF-IDF model dump | |
The output Matrix Market files can then be compressed (e.g., by bzip2) to save | |
disk space; gensim's corpus iterators can work with compressed input, too. | |
`VOCABULARY_SIZE` controls how many of the most frequent words to keep (after | |
removing tokens that appear in more than 10%% of all documents). Defaults to | |
100,000. | |
If you have the `pattern` package installed, this script will use a fancy | |
lemmatization to get a lemma of each token (instead of plain alphabetic | |
tokenizer). The package is available at https://github.com/clips/pattern . | |
Example: | |
python make_wikicorpus_ja.py ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki | |
""" | |
import logging
import os.path
import sys

import MeCab
import gensim.corpora.wikicorpus as wikicorpus
from builtins import str
from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus
from gensim.models import TfidfModel
# Wiki is first scanned for all distinct word types (~7M). The types that
# appear in more than 10% of articles are removed and, from the rest, the
# DEFAULT_DICT_SIZE most frequent types are kept.
DEFAULT_DICT_SIZE = 100000
tagger = MeCab.Tagger()
# Calling parse('') once before parseToNode() works around a known issue in
# older mecab-python bindings where node.surface could otherwise come back garbled.
tagger.parse('')
def tokenize_ja(text, lower):
    """Yield noun tokens from `text`, tokenized with MeCab."""
    node = tagger.parseToNode(str(text))
    while node:
        # node.feature is a CSV string whose first field is the part of speech;
        # keep only nouns (名詞). As written, the `lower` flag also gates this filter.
        if lower and node.feature.split(',')[0] == '名詞':
            yield node.surface.lower()
        node = node.next
def tokenize(content, token_min_len, token_max_len, lower):
    """Tokenizer passed to WikiCorpus: noun tokens filtered by length."""
    return [
        str(token) for token in tokenize_ja(content, lower)
        if token_min_len <= len(token) <= token_max_len and not token.startswith('_')
    ]
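
# Illustration only (not output from an actual run, and dependent on the MeCab
# dictionary in use): with a default IPADIC setup, something like
# tokenize('今日は良い天気です', 2, 15, True) should keep just the noun tokens
# of length 2-15, e.g. ['今日', '天気'].
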
if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]

    if not os.path.isdir(os.path.dirname(outp)):
        raise SystemExit("Error: The output directory does not exist. Create the directory and try again.")

    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE
    online = 'online' in program
    lemmatize = 'lemma' in program
    debug = 'nodebug' not in program

    if online:
        dictionary = HashDictionary(id_range=keep_words, debug=debug)
        dictionary.allow_update = True  # start collecting document frequencies
        wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary, tokenizer_func=tokenize,
                          token_min_len=2, token_max_len=15, lower=True)
        # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(outp + '_wordids.txt.bz2')
        wiki.save(outp + '_corpus.pkl.bz2')
        dictionary.allow_update = False
    else:
        # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        wiki = WikiCorpus(inp, lemmatize=lemmatize, tokenizer_func=tokenize, token_min_len=2,
                          token_max_len=15, lower=True)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)  # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
        # load back the id->word mapping directly from file
        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
        dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    del wiki
    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in Matrix Market format
    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s", program)