Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2010 Radim Rehurek <>
# Copyright (C) 2012 Lars Buitinck <>
# Copyright (C) 2018 Shinsuke Nagasawa <>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
Convert articles from a Wikipedia dump to (sparse) vectors. The input is a
bz2-compressed dump of Wikipedia articles, in XML format.
This actually creates four files:
* `OUTPUT_PREFIX_wordids.txt`: mapping between words and their integer ids
* `OUTPUT_PREFIX_bow.mm`: bag-of-words (word counts) representation, in
Matrix Market format
* `OUTPUT_PREFIX_tfidf.mm`: TF-IDF representation
* `OUTPUT_PREFIX.tfidf_model`: TF-IDF model dump
The output Matrix Market files can then be compressed (e.g., by bzip2) to save
disk space; gensim's corpus iterators can work with compressed input, too.
`VOCABULARY_SIZE` controls how many of the most frequent words to keep (after
removing tokens that appear in more than 10%% of all documents). Defaults to
`DEFAULT_DICT_SIZE`.
If you have the `pattern` package installed, this script will use a fancy
lemmatization to get a lemma of each token (instead of plain alphabetic
tokenizer). The package is available at https://github.com/clips/pattern .
Example: python %(program)s ~/gensim/results/enwiki-latest-pages-articles.xml.bz2 ~/gensim/results/wiki
import logging
import os.path
import sys
import MeCab
import gensim.corpora.wikicorpus as wikicorpus
from builtins import str
from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus
from gensim.models import TfidfModel
# Wiki is first scanned for all distinct word types (~7M). The types that
# appear in more than 10% of articles are removed and from the rest, the
# DEFAULT_DICT_SIZE most frequent types are kept.
# NOTE(review): the constant is used below but its assignment was missing;
# 100000 is the value used by the upstream gensim make_wikicorpus script this
# file derives from -- confirm it is the intended default.
DEFAULT_DICT_SIZE = 100000

# Single module-level MeCab tagger, reused by every call to tokenize_ja().
tagger = MeCab.Tagger()
def tokenize_ja(text, lower):
    """Yield the surface form of every noun that MeCab finds in `text`.

    Parameters
    ----------
    text : str
        Text to analyse; it is converted with `str()` before being parsed.
    lower : bool
        If true, every yielded token is lower-cased. If false, tokens are
        yielded unchanged (previously nothing at all was yielded in that case).

    Yields
    ------
    str
        Surface string of each node whose first comma-separated feature
        field equals '名詞' (noun).
    """
    node = tagger.parseToNode(str(text))
    while node:
        # Keep nouns only: compare the first field of the feature string.
        if node.feature.split(',')[0] == '名詞':
            surface = node.surface
            yield surface.lower() if lower else surface
        # Advance along the linked list of nodes; without this the loop
        # would never terminate.
        node = node.next
def tokenize(content, token_min_len, token_max_len, lower):
    """Tokenize `content` into a list of noun tokens within a length range.

    The signature matches what is passed as `tokenizer_func` to `WikiCorpus`
    in the script body below.

    Parameters
    ----------
    content : str
        Text to tokenize.
    token_min_len : int
        Minimum token length (inclusive).
    token_max_len : int
        Maximum token length (inclusive).
    lower : bool
        Forwarded to `tokenize_ja`.

    Returns
    -------
    list of str
        Tokens from `tokenize_ja` whose length lies in
        [token_min_len, token_max_len] and that do not start with '_'.
    """
    return [
        str(token) for token in tokenize_ja(content, lower)
        if token_min_len <= len(token) <= token_max_len and not token.startswith('_')
    ]
if __name__ == '__main__':
    # Script entry point. Command line:
    #     <program> WIKI_XML_DUMP OUTPUT_PREFIX [VOCABULARY_SIZE]
    # Writes the dictionary, the bag-of-words corpus, the TF-IDF model and the
    # TF-IDF corpus, all named after OUTPUT_PREFIX.
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        # Stop here: the unpacking below needs both positional arguments.
        sys.exit(1)
    inp, outp = sys.argv[1:3]

    # A bare prefix such as "wiki" has an empty dirname, which means the
    # current directory; do not reject it.
    if not os.path.isdir(os.path.dirname(outp) or os.curdir):
        raise SystemExit("Error: The output directory does not exist. Create the directory and try again.")

    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE

    # Processing mode is selected by substrings of the script's file name.
    online = 'online' in program
    lemmatize = 'lemma' in program
    debug = 'nodebug' not in program

    if online:
        dictionary = HashDictionary(id_range=keep_words, debug=debug)
        dictionary.allow_update = True  # start collecting document frequencies
        wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary, tokenizer_func=tokenize, token_min_len=2,
                          token_max_len=15, lower=True)
        # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        # keep_words (not DEFAULT_DICT_SIZE) so the VOCABULARY_SIZE argument is honoured
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=keep_words)
        dictionary.save_as_text(outp + '_wordids.txt.bz2')
        wiki.save(outp + '_corpus.pkl.bz2')
        dictionary.allow_update = False
    else:
        wiki = WikiCorpus(inp, lemmatize=lemmatize, tokenizer_func=tokenize, token_min_len=2, token_max_len=15,
                          lower=True)  # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        # keep_words (not DEFAULT_DICT_SIZE) so the VOCABULARY_SIZE argument is honoured
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=keep_words)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)  # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
        # load back the id->word mapping directly from file
        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
        dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    del wiki

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
    # Written to a different file than the bag-of-words corpus, which is
    # still being read through `mm` while this runs.
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s", program)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment