Gist kgmyshin/6190f10295126774af4f by @kgmyshin, created January 2, 2015.
# -*- coding: utf-8 -*-
"""
USAGE: %(program)s WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE]

Convert a Japanese Wikipedia dump (*pages-articles.xml.bz2) into a bag-of-words
corpus, a tf-idf corpus (both in Matrix Market format) and the word-id mapping.
"""
import bz2
import logging
import os.path
import sys

import MeCab

from gensim.corpora.wikicorpus import filter_wiki
from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus
from gensim.models import TfidfModel
# Wiki is first scanned for all distinct word types (~7M). The types that
# appear in more than 10% of articles are removed and, from the rest, the
# DEFAULT_DICT_SIZE most frequent types are kept.
DEFAULT_DICT_SIZE = 100000

# Articles shorter than this many characters are skipped; this mainly prunes
# redirects (500 is the historical gensim wikicorpus default).
ARTICLE_MIN_CHARS = 500
# MeCab tagger used for Japanese morphological analysis (Python 2 bindings)
tagger = MeCab.Tagger()


def jatokenize(text):
    # walk MeCab's node list and yield only nouns (名詞), lowercased
    node = tagger.parseToNode(text.encode('utf-8')).next
    while node:
        if node.feature.split(',')[0] == '名詞':
            yield node.surface.lower()
        node = node.next


def tokenize(content):
    # as in gensim's default tokenize(), drop tokens that start with '_'
    return [token for token in jatokenize(content) if not token.startswith('_')]
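# Example (illustrative only; exact segmentation depends on the installed
# MeCab dictionary): with IPADIC, 東京都に住んでいます is typically split into
# 東京 / 都 / に / 住ん / で / い / ます, and jatokenize() keeps only the noun
# tokens 東京 and 都.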
class JaWikiCorpus(WikiCorpus):
    def getArticles(self, return_raw=False):
        articles, articles_all = 0, 0
        intext, positions = False, 0
        for lineno, line in enumerate(bz2.BZ2File(self.fname)):
            if line.startswith('      <text'):
                intext = True
                line = line[line.find('>') + 1:]
                lines = [line]
            elif intext:
                lines.append(line)
            pos = line.find('</text>')  # can be on the same line as <text>
            if pos >= 0:
                articles_all += 1
                intext = False
                if not lines:
                    continue
                lines[-1] = line[:pos]
                text = filter_wiki(''.join(lines))
                if len(text) > ARTICLE_MIN_CHARS:  # article redirects are pruned here
                    articles += 1
                    if return_raw:
                        result = text
                    else:
                        result = tokenize(text)  # text into tokens here
                        positions += len(result)
                    yield result

        logger.info("finished iterating over Wikipedia corpus of %i documents with %i positions"
                    " (total %i articles before pruning)" %
                    (articles, positions, articles_all))
        self.numDocs = articles  # cache corpus length
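

# Rough usage sketch (illustrative only; the dump path is a placeholder):
# the corpus class above can also be streamed on its own, e.g.
#
#   wiki = JaWikiCorpus('jawiki-latest-pages-articles.xml.bz2')
#   for tokens in wiki.getArticles():
#       pass  # each item is the list of noun tokens for one article
#
# Note that constructing the corpus already scans the dump once to build its
# dictionary, so iterating getArticles() afterwards is a second full pass.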
if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE
    online = 'online' in program
    lemmatize = 'lemma' in program
    debug = 'nodebug' not in program
    if online:
        dictionary = HashDictionary(id_range=keep_words, debug=debug)
        dictionary.allow_update = True  # start collecting document frequencies
        wiki = JaWikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
        # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(outp + '_wordids.txt.bz2')
        wiki.save(outp + '_corpus.pkl.bz2')
        dictionary.allow_update = False
    else:
        # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        wiki = JaWikiCorpus(inp, lemmatize=lemmatize)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)  # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
        # load back the id->word mapping directly from file
        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
        dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    del wiki
    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)

    # save tfidf vectors in matrix market format
    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)